diff --git a/.bazeliskrc b/.bazeliskrc index f5f1a08e94b8..bd4a53557e49 100644 --- a/.bazeliskrc +++ b/.bazeliskrc @@ -1 +1 @@ -USE_BAZEL_VERSION=5.4.0 +USE_BAZEL_VERSION=5.4.1 diff --git a/.bazelrc b/.bazelrc index d38be3f8323a..e81c646c1569 100644 --- a/.bazelrc +++ b/.bazelrc @@ -200,5 +200,5 @@ try-import %workspace%/.llvm-local.bazelrc # It picks up the system headers when someone has protobuf installed via Homebrew. # Work around for https://github.com/bazelbuild/bazel/issues/8053 build:macos --sandbox_block_path=/usr/local/ -#This option controls whether javac checks for missing direct dependencies. -build --strict_java_deps=off +# This option controls whether javac checks for missing direct dependencies. +build --experimental_strict_java_deps=off diff --git a/.buildkite/copy_files.py b/.buildkite/copy_files.py index 29302f0e3136..7180bf0ed84f 100644 --- a/.buildkite/copy_files.py +++ b/.buildkite/copy_files.py @@ -46,7 +46,7 @@ def perform_auth(): def handle_docker_login(resp): pwd = resp.json()["docker_password"] - subprocess.call( + subprocess.check_call( ["docker", "login", "--username", "raytravisbot", "--password", pwd] ) diff --git a/.buildkite/pipeline.arm64.yml b/.buildkite/pipeline.arm64.yml index 6ac4232a5f79..61744b193471 100644 --- a/.buildkite/pipeline.arm64.yml +++ b/.buildkite/pipeline.arm64.yml @@ -60,7 +60,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.7 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -70,7 +70,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - 
LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.7 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -80,7 +80,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.8 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -90,7 +90,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.8 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -100,7 +100,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.9 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -110,7 +110,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.9 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python 
.buildkite/copy_files.py --destination docker_login; fi @@ -120,7 +120,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.10 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -130,7 +130,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: arm64-medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.10 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - python ./ci/build/build-docker-images.py --py-versions py310 -T cu113 -T cu116 -T cu118 --build-type BUILDKITE --build-base --suffix aarch64 diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index cb227306cf39..8bdd31723559 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -88,7 +88,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.7 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -98,7 +98,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.7 ./ci/ci.sh build - pip install -q docker aws_requests_auth 
boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -108,7 +108,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.8 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -118,7 +118,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.8 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -128,7 +128,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.9 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -138,7 +138,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.9 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -148,7 +148,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", 
"RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.10 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - ./ci/env/env_info.sh - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi @@ -158,7 +158,7 @@ conditions: ["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED", "RAY_CI_DOCKER_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED"] instance_size: medium commands: - - LINUX_WHEELS=1 ./ci/ci.sh build + - LINUX_WHEELS=1 BUILD_ONE_PYTHON_ONLY=3.10 ./ci/ci.sh build - pip install -q docker aws_requests_auth boto3 - if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then python .buildkite/copy_files.py --destination docker_login; fi - python ./ci/build/build-docker-images.py --py-versions py310 -T cu111 -T cu112 -T cu113 -T cu116 -T cu118 --build-type BUILDKITE --build-base @@ -368,6 +368,9 @@ - DL=1 ./ci/env/install-dependencies.sh - bash ./ci/ci.sh prepare_docker - ./ci/env/env_info.sh + # This is needed or else the Ray Client tests run into a gRPC forking problem + # similar to https://github.com/grpc/grpc/issues/31885 + - pip install grpcio==1.50.0 - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=client_tests,small_size_python_tests -- python/ray/tests/... @@ -378,6 +381,8 @@ --test_env=DOCKER_CERT_PATH=/certs/client --test_env=DOCKER_TLS_CERTDIR=/certs -- python/ray/tests/... + - bazel test --config=ci $(./ci/run/bazel_export_options) + -- python/ray/autoscaler/v2/... 
- label: ":python: (Large)" conditions: ["RAY_CI_PYTHON_AFFECTED"] @@ -416,6 +421,9 @@ - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DL=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh + # This is needed or else the Ray Client tests run into a gRPC forking problem + # similar to https://github.com/grpc/grpc/issues/31885 + - pip install grpcio==1.50.0 - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=client_tests,small_size_python_tests --test_env=TEST_EXTERNAL_REDIS=1 @@ -491,6 +499,8 @@ - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - pip uninstall -y ray - RAY_DEBUG_BUILD=debug ./ci/ci.sh build + # Install latest pyspark. We cannot move this to the requirements file as subdependencies conflict + - pip install -U https://ml-team-public-read.s3.us-west-2.amazonaws.com/spark-pkgs/pyspark-3.4.0.dev0-0cb0fa313979e1b82ddd711a05d8c4e78cf6c9f5.tar.gz - ./ci/env/env_info.sh - bazel test --config=ci-debug $(./ci/run/bazel_export_options) --test_env=RAY_ON_SPARK_BACKGROUND_JOB_STARTUP_WAIT=1 diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 2f993cd96546..e15ee57050ea 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -49,3 +49,16 @@ - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/... 
+ +- label: ":zap: :python: Lightning 2.0 Train GPU tests" + conditions: + ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - NO_DASHBOARD=1 ./ci/env/install-minimal.sh 3.8 + - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt + - pip uninstall -y pytorch-lightning + - pip install lightning==2.0.0 + - ./ci/env/env_info.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 79a7fb78a0a2..ad474cf46e67 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -196,7 +196,7 @@ - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_tag_filters=-learning_tests,-memory_leak_tests,-examples,-tests_dir,-documentation,-multi_gpu,-multi_gpu + --test_tag_filters=-learning_tests,-memory_leak_tests,-examples,-tests_dir,-documentation,-multi_gpu,-no_cpu --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... - label: ":brain: RLlib: RLModule tests" @@ -326,10 +326,10 @@ - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/tests/horovod/... - bazel test --config=ci $(./ci/run/bazel_export_options) python/ray/tests/ray_lightning/... 
-### NEW EXECUTION PATH +### OLD EXECUTION PATH COMPAT -- label: ":octopus: :sunny: New execution path: Tune tests and examples (small)" +- label: ":octopus: :last_quarter_moon_with_face: Old execution path: Tune tests and examples (small)" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] instance_size: small parallelism: 3 @@ -339,11 +339,11 @@ - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_env=TUNE_NEW_EXECUTION=1 + --test_env=TUNE_NEW_EXECUTION=0 --test_tag_filters=-medium_instance,-py37,-soft_imports,-gpu_only,-rllib,-multinode,-exclude_new_execution python/ray/tune/... -- label: ":octopus: :sunny: New execution path:Tune tests and examples (medium)" +- label: ":octopus: :last_quarter_moon_with_face: Old execution path: Tune tests and examples (medium)" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] instance_size: medium commands: @@ -351,11 +351,11 @@ - TUNE_TESTING=1 DATA_PROCESSING_TESTING=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_env=TUNE_NEW_EXECUTION=1 + --test_env=TUNE_NEW_EXECUTION=0 --test_tag_filters=medium_instance,-py37,-soft_imports,-gpu_only,-rllib,-multinode,-exclude_new_execution python/ray/tune/... -- label: ":octopus: :brain: :sunny: New execution path: Tune tests and examples {using RLlib}" +- label: ":octopus: :brain: :last_quarter_moon_with_face: Old execution path: Tune tests and examples {using RLlib}" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_RLLIB_AFFECTED"] instance_size: large commands: @@ -363,10 +363,10 @@ - TUNE_TESTING=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_env=TUNE_NEW_EXECUTION=1 + --test_env=TUNE_NEW_EXECUTION=0 --test_tag_filters=-gpu_only,rllib,-exclude_new_execution python/ray/tune/... 
-- label: ":octopus: :sunny: New execution path: Tune tests and examples. Python 3.7" +- label: ":octopus: :last_quarter_moon_with_face: Old execution path: Tune tests and examples. Python 3.7" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] instance_size: small commands: @@ -374,10 +374,10 @@ - TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only - --test_env=TUNE_NEW_EXECUTION=1 + --test_env=TUNE_NEW_EXECUTION=0 --test_tag_filters=py37,-client python/ray/tune/... -- label: ":octopus: :sunny: New execution path: ML library integrations tests and examples. Python 3.7" +- label: ":octopus: :last_quarter_moon_with_face: Old execution path: ML library integrations tests and examples. Python 3.7" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] instance_size: small commands: @@ -402,7 +402,7 @@ instance_size: medium commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh + - TUNE_TESTING=1 DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=client --test_env=RAY_CLIENT_MODE=1 python/ray/util/dask/... - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=client python/ray/tune/... @@ -528,3 +528,30 @@ - ./ci/env/env_info.sh - python ./ci/env/setup_credentials.py wandb comet_ml - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=needs_credentials,-timeseries_libs,-gpu,-py37,-post_wheel_build doc/... 
+ + +- label: ":exploding_death_star: RLlib Contrib: A3C Tests" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_CONTRIB_AFFECTED"] + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - (cd rllib_contrib/a3c && pip install -r requirements.txt && pip install -e .) + - ./ci/env/env_info.sh + - pytest rllib_contrib/a3c/tests/test_a3c.py + +- label: ":exploding_death_star: RLlib Contrib: MAML Tests" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_CONTRIB_AFFECTED"] + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + + # Install mujoco necessary for the testing environments + - sudo apt install libosmesa6-dev libgl1-mesa-glx libglfw3 patchelf -y + - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz + - mkdir /root/.mujoco + - mv mujoco210-linux-x86_64.tar.gz /root/.mujoco/. + - (cd /root/.mujoco && tar -xf /root/.mujoco/mujoco210-linux-x86_64.tar.gz) + - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/.mujoco/mujoco210/bin' >> /root/.bashrc + - source /root/.bashrc + + - (cd rllib_contrib/maml && pip install -r requirements.txt && pip install -e .) 
+ - ./ci/env/env_info.sh + - pytest rllib_contrib/maml/tests/test_maml.py diff --git a/.buildkite/pipeline.test.yml b/.buildkite/pipeline.test.yml index 947055b58dd3..3de2ecf6d92d 100644 --- a/.buildkite/pipeline.test.yml +++ b/.buildkite/pipeline.test.yml @@ -12,7 +12,6 @@ instance_size: small commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - pip install -e release/ - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2b995c957d37..0a630e281eda 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,18 +11,19 @@ # NOTE: Add @ray-project/ray-docs to all following docs subdirs. /doc/ @ray-project/ray-docs /doc/source/use-cases.rst @ericl @pcmoritz +/doc/source/templates @justinvyu @sofianhnaide # ==== Ray core ==== # API compatibility -/src/ray/protobuf/common.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @iycheng @scv119 -/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @iycheng @scv119 -/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @iycheng @scv119 +/src/ray/protobuf/common.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @ray-project/ray-core +/src/ray/protobuf/gcs.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @ray-project/ray-core +/src/ray/protobuf/gcs_service.proto @wuisawesome @ericl @ameerhajali @robertnishihara @pcmoritz @raulchen @ray-project/ray-core /dashboard/modules/snapshot @wuisawesome @ijrsvt @edoakes @alanwguo @architkulkarni /python/ray/autoscaler/_private/monitor.py @wuisawesome @DmitriGekhtman # Autoscaler -/python/ray/autoscaler/ @wuisawesome @DmitriGekhtman @ericl +/python/ray/autoscaler/ @wuisawesome @DmitriGekhtman @ericl @ray-project/ray-core # Metrics 
/src/ray/stats/metric_defs.h @ray-project/ray-core @@ -75,8 +76,8 @@ # ==== Libraries and frameworks ==== # Ray data. -/python/ray/data/ @ericl @scv119 @c21 @amogkam @scottjlee @bveeramani -/doc/source/data/ @ericl @scv119 @c21 @amogkam @scottjlee @bveeramani @maxpumperla @ray-project/ray-docs +/python/ray/data/ @ericl @scv119 @c21 @amogkam @scottjlee @bveeramani @raulchen +/doc/source/data/ @ericl @scv119 @c21 @amogkam @scottjlee @bveeramani @raulchen @maxpumperla @ray-project/ray-docs # Ray workflows. /python/ray/workflow/ @ericl @iycheng @stephanie-wang @suquark @@ -102,8 +103,8 @@ /doc/source/ray-air/ @richardliaw @gjoliver @krfricke @xwjiang2010 @amogkam @matthewdeng @Yard1 @maxpumperla @ray-project/ray-docs # ML Docker Dependencies -/python/requirements/ml/requirements_dl.txt @amogkam @sven1977 @richardliaw @matthewdeng -/python/requirements/ml/requirements_ml_docker.txt @amogkam @sven1977 @richardliaw @matthewdeng +/python/requirements/ml/requirements_dl.txt @amogkam @krfricke @richardliaw @matthewdeng +/python/requirements/ml/requirements_ml_docker.txt @amogkam @krfricke @richardliaw @matthewdeng # Ray symbol export /src/ray/ray_version_script.lds @iycheng @ericl @scv119 diff --git a/.github/workflows/external-code-affected.yml b/.github/workflows/external-code-affected.yml new file mode 100644 index 000000000000..2917e32a939b --- /dev/null +++ b/.github/workflows/external-code-affected.yml @@ -0,0 +1,135 @@ +# Check if code checked into external resources (blogs, tutorials) +# that we also track in our CI is affected by a PR. +# In that case, we add a label to the PR (`external-code-affected`) and +# add a comment to make sure that the external code still works and is +# eventually updated. 
+name: External code check + +on: pull_request_target + +jobs: + check-changes: + permissions: write-all + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + + - name: Check for changes in tracked files + run: | + set -xe + git clone https://github.com/ray-project/buildkite-ci-pipelines.git ./pipelines + + # Find changed files + GIT_DIFF=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}) + + echo "All changed files:" + echo "$GIT_DIFF" + + GIT_DIFF_SERIALIZED=$(echo "$GIT_DIFF" | tr '\n' '|') + echo "GIT_DIFF_SERIALIZED=$GIT_DIFF_SERIALIZED" >> $GITHUB_ENV + + - name: Add label and comment if a tracked file changed + uses: actions/github-script@v5 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + const { + deserializeIntoArray, + filterFilesByNames, + getCommentContentChanged, + getCommentContentNotChanged, + parseTrackedFilesToURIs, + readFileContent + } = require('./pipelines/external_code_tracker/track_code'); + + const fs = require("fs"); + + const commentHeader = `## Attention: External code changed` + const externalCodeFile = "doc/external/external_code.txt" + + // Get existing comments + const existingComments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + + // Find comment by the bot that starts with the header + let commentToUpdate = existingComments.data.find(comment => + comment.user.login === 'github-actions[bot]' && comment.body.startsWith(commentHeader) + ); + + let externCodeFileContent; + let trackedFilesToURIs; + + // Read and parse external_code.txt file + try { + externCodeFileContent = fs.readFileSync(externalCodeFile, "utf8"); + trackedFilesToURIs = parseTrackedFilesToURIs(externCodeFileContent); + } catch (error) { + console.error("An error occurred reading the 
external code file:", error); + trackedFilesToURIs = {}; + } + + console.log("trackedFileToURIs"); + console.log(trackedFilesToURIs); + + // Get changed files from environment variable + let changedFiles = await deserializeIntoArray(process.env.GIT_DIFF_SERIALIZED) + + console.log("changedFiles"); + console.log(changedFiles); + + // Filter associative array + let changedFileToURIs = filterFilesByNames(trackedFilesToURIs, changedFiles); + + console.log("changedFileToURIs"); + console.log(changedFileToURIs); + console.log(changedFileToURIs.length); + + if (Object.keys(changedFileToURIs).length === 0) { + console.log("No changes to tracked files detected"); + commentBody = getCommentContentNotChanged(commentHeader); + if (commentToUpdate && commentBody !== commentToUpdate.body) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: commentToUpdate.id, + body: commentBody + }); + } + } else { + console.log("Changes to tracked files detected"); + commentBody = getCommentContentChanged(commentHeader, changedFileToURIs); + + if (commentToUpdate) { + // Only update if content changed + if (commentBody !== commentToUpdate.body) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: commentToUpdate.id, + body: commentBody + }); + } + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: commentBody + }); + } + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: ['external-code-affected'] + }); + + } + diff --git a/BUILD.bazel b/BUILD.bazel index c7b30fc61892..32a453b9a086 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -479,29 +479,6 @@ cc_library( ], ) -# This header is used to warp some internal code so we can reduce suspicious -# symbols export. 
-cc_library( - name = "exported_internal", - srcs = glob( - [ - "src/ray/internal/internal.cc", - ], - ), - hdrs = glob( - [ - "src/ray/internal/internal.h", - ], - ), - copts = COPTS, - strip_include_prefix = "src", - visibility = ["//visibility:public"], - deps = [ - ":core_worker_lib", - ], - alwayslink = 1, -) - cc_binary( name = "raylet", srcs = ["src/ray/raylet/main.cc"], @@ -2813,7 +2790,6 @@ pyx_library( ), deps = [ "//:core_worker_lib", - "//:exported_internal", "//:global_state_accessor_lib", "//:ray_util", "//:raylet_lib", @@ -2848,7 +2824,6 @@ cc_binary( visibility = ["//java:__subpackages__"], deps = [ "//:core_worker_lib", - "//:exported_internal", "//:global_state_accessor_lib", "//:src/ray/ray_exported_symbols.lds", "//:src/ray/ray_version_script.lds", @@ -2908,6 +2883,7 @@ filegroup( "//src/ray/protobuf:event_py_proto", "//src/ray/protobuf:gcs_py_proto", "//src/ray/protobuf:gcs_service_py_proto", + "//src/ray/protobuf:instance_manager_py_proto", "//src/ray/protobuf:job_agent_py_proto", "//src/ray/protobuf:monitor_py_proto", "//src/ray/protobuf:node_manager_py_proto", diff --git a/README.rst b/README.rst index c2a389f80617..e52e01da439f 100644 --- a/README.rst +++ b/README.rst @@ -23,7 +23,7 @@ Ray is a unified framework for scaling AI and Python applications. Ray consists Learn more about `Ray AIR`_ and its libraries: -- `Datasets`_: Distributed Data Preprocessing +- `Data`_: Scalable Datasets for ML - `Train`_: Distributed Training - `Tune`_: Scalable Hyperparameter Tuning - `RLlib`_: Scalable Reinforcement Learning @@ -44,7 +44,7 @@ Install Ray with: ``pip install ray``. For nightly wheels, see the `Installation page `__. .. _`Serve`: https://docs.ray.io/en/latest/serve/index.html -.. _`Datasets`: https://docs.ray.io/en/latest/data/dataset.html +.. _`Data`: https://docs.ray.io/en/latest/data/dataset.html .. _`Workflow`: https://docs.ray.io/en/latest/workflows/concepts.html .. _`Train`: https://docs.ray.io/en/latest/train/train.html .. 
_`Tune`: https://docs.ray.io/en/latest/tune/index.html diff --git a/WORKSPACE b/WORKSPACE index 9300a57ed14e..eb6aeba907e3 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,5 +1,6 @@ workspace(name = "com_github_ray_project_ray") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("//bazel:ray_deps_setup.bzl", "ray_deps_setup") ray_deps_setup() @@ -29,3 +30,42 @@ versions.check(minimum_bazel_version = "5.4.0") load("@hedron_compile_commands//:workspace_setup.bzl", "hedron_compile_commands_setup") hedron_compile_commands_setup() + +http_archive( + name = "rules_python", + sha256 = "94750828b18044533e98a129003b6a68001204038dc4749f40b195b24c38f49f", + strip_prefix = "rules_python-0.21.0", + url = "https://github.com/bazelbuild/rules_python/releases/download/0.21.0/rules_python-0.21.0.tar.gz", +) + +load("@rules_python//python:repositories.bzl", "python_register_toolchains") + +python_register_toolchains( + name = "python3_9", + python_version = "3.9", + register_toolchains = False, +) + +load("@python3_9//:defs.bzl", bk_python = "interpreter") +load("@rules_python//python/pip_install:repositories.bzl", "pip_install_dependencies") + +pip_install_dependencies() + +load("@rules_python//python:pip.bzl", "pip_parse") + +pip_parse( + name = "py_deps_buildkite", + python_interpreter_target = bk_python, + requirements_lock = "//release:requirements_buildkite.txt", +) + +load("@py_deps_buildkite//:requirements.bzl", install_py_deps_buildkite = "install_deps") + +install_py_deps_buildkite() + +register_toolchains("//release:python_toolchain") + +register_execution_platforms( + "@local_config_platform//:host", + "//release:hermetic_python_platform", +) diff --git a/ci/build/build-docker-images.py b/ci/build/build-docker-images.py index b12fb87745a3..6b5b35f45055 100644 --- a/ci/build/build-docker-images.py +++ b/ci/build/build-docker-images.py @@ -1,4 +1,5 @@ import datetime +import io import json import functools import glob @@ -9,8 +10,10 @@ import shutil 
import subprocess import sys +import tarfile from collections import defaultdict -from typing import List, Optional, Tuple +from pathlib import Path +from typing import List, Optional, Tuple, Dict import click import docker @@ -21,11 +24,15 @@ PYTHON_WHL_VERSION = "cp3" ADDITIONAL_PLATFORMS = ["aarch64"] +DOCKER_HUB_REPO = "rayproject" + DOCKER_HUB_DESCRIPTION = { "base-deps": ( - "Internal Image, refer to " "https://hub.docker.com/r/rayproject/ray" + f"Internal Image, refer to https://hub.docker.com/r/{DOCKER_HUB_REPO}/ray" + ), + "ray-deps": ( + f"Internal Image, refer to https://hub.docker.com/r/{DOCKER_HUB_REPO}/ray" ), - "ray-deps": ("Internal Image, refer to " "https://hub.docker.com/r/rayproject/ray"), "ray": "Official Docker Images for Ray, the distributed computing API.", "ray-ml": "Developer ready Docker Image for Ray.", "ray-worker-container": "Internal Image for CI test", @@ -233,7 +240,7 @@ def _build_docker_image( # can be found. build_args["FIND_LINKS_PATH"] = ".whl" - tagged_name = f"rayproject/{image_name}:nightly-{py_version}-{device_tag}" + tagged_name = f"{DOCKER_HUB_REPO}/{image_name}:nightly-{py_version}-{device_tag}" tagged_name = _with_suffix(tagged_name, suffix=suffix) @@ -292,6 +299,43 @@ def _build_docker_image( break print("BUILT: ", tagged_name) + return tagged_name + + +def _extract_files_from_docker(docker_image: str, files: Dict[str, str]): + """Extract files from docker container image and save to local disk. + + ``files`` is a dict mapping from paths inside the docker container to + local paths on the host system. 
+ """ + # Create container + container = DOCKER_CLIENT.containers.create(docker_image) + for container_path, local_path in files.items(): + # Get tar stream of file + stream, stat = container.get_archive(f"{container_path}") + # Create local directory containing target file + local_path = Path(local_path) + local_path.parent.mkdir(exist_ok=True) + # Read tar stream into bytes IO + with tarfile.open(fileobj=io.BytesIO(b"".join(d for d in stream))) as tar: + # Extract file from tar archive into local path + with open(local_path, "wb") as f: + for r in tar.extractfile(os.path.basename(container_path)): + f.write(r) + container.remove() + + +def extract_image_infos(images: List[str], target_dir: str): + for image in images: + image_basename = image.replace("rayproject/", "") + _extract_files_from_docker( + image, + { + "/home/ray/pip-freeze.txt": ( + f"{target_dir}/{image_basename}_" f"pip-freeze.txt" + ) + }, + ) def copy_wheels(human_build): @@ -326,17 +370,22 @@ def check_staleness(repository, tag): return is_stale -def build_for_all_versions(image_name, py_versions, image_types, suffix, **kwargs): +def build_for_all_versions( + image_name, py_versions, image_types, suffix, **kwargs +) -> List[str]: """Builds the given Docker image for all Python & CUDA versions""" + tagged_names = [] for py_version in py_versions: for image_type in image_types: - _build_docker_image( + tagged_name = _build_docker_image( image_name, py_version=py_version, image_type=image_type, suffix=suffix, **kwargs, ) + tagged_names.append(tagged_name) + return tagged_names def build_base_images(py_versions, image_types, suffix): @@ -355,7 +404,7 @@ def build_or_pull_base_images( suffix: Optional[str] = None, ) -> bool: """Returns images to tag and build.""" - repositories = ["rayproject/base-deps", "rayproject/ray-deps"] + repositories = [f"{DOCKER_HUB_REPO}/base-deps", f"{DOCKER_HUB_REPO}/ray-deps"] tags = [ f"nightly-{py_version}-{image_type}" for py_version, image_type in 
itertools.product(py_versions, image_types) @@ -384,7 +433,9 @@ def build_or_pull_base_images( def prep_ray_ml(): root_dir = _get_root_dir() - requirements_files = ["python/requirements.txt"] + requirements_files = [ + "python/requirements.txt", + ] ml_requirements_files = [ "python/requirements/ml/requirements_ml_docker.txt", "python/requirements/ml/requirements_dl.txt", @@ -392,6 +443,7 @@ def prep_ray_ml(): "python/requirements/ml/requirements_rllib.txt", "python/requirements/ml/requirements_train.txt", "python/requirements/ml/requirements_upstream.txt", + "python/requirements/ml/requirements_no_deps.txt", ] # We don't need these in the ml docker image ignore_requirements = [ @@ -672,7 +724,7 @@ def push_readmes(merge_build: bool): "PUSHRM_DEBUG": 1, "PUSHRM_SHORT": tag_line, } - cmd_string = f"rayproject/{image}" + cmd_string = f"{DOCKER_HUB_REPO}/{image}" print( DOCKER_CLIENT.containers.run( @@ -830,7 +882,11 @@ def main( # TODO Currently don't push ray_worker_container else: # Build Ray Docker images. 
- build_for_all_versions("ray", py_versions, image_types, suffix=suffix) + all_tagged_images = [] + + all_tagged_images += build_for_all_versions( + "ray", py_versions, image_types, suffix=suffix + ) # List of images to tag and push to docker hub images_to_tag_and_push = [] @@ -854,7 +910,7 @@ def main( if len(ml_image_types) > 0: prep_ray_ml() - build_for_all_versions( + all_tagged_images += build_for_all_versions( "ray-ml", py_versions, image_types=ml_image_types, @@ -862,6 +918,11 @@ def main( ) images_to_tag_and_push += ["ray-ml"] + if is_buildkite: + extract_image_infos( + all_tagged_images, target_dir="/artifact-mount/.image-info" + ) + if build_type in {MERGE, PR}: valid_branch = _valid_branch() if (not valid_branch) and is_merge: @@ -879,5 +940,72 @@ def main( # push_readmes(build_type is MERGE) +def fix_docker_images( + image: str = "ray-ml", + version: str = "nightly", + repo: str = DOCKER_HUB_REPO, +): + """Print commands to manually update docker images post-release. + + This function prints commands that can be run to add new layers to + fix docker images post-release, e.g. when dependencies have to be fixed + or public keys expired. + + The commands can be copied/pasted and executed in a shell. + + Example: + FIX_IMAGE=ray-ml FIX_VERSION=2.3.0 python build-docker-images.py + + """ + tags = create_image_tags( + image_name=image, + py_versions=list(PY_MATRIX.keys()), + image_types=list(BASE_IMAGES.keys()), + specific_tag=None, # Set to `latest` for latest image fixes + version=version, + suffix=None, + ) + print(dict(tags)) + + # Pull images we want to rebuild + for base_tag in tags: + base_image = f"{repo}/{image}:{base_tag}" + + print(f"docker pull {base_image}") + + # Re-tag these base images as e.g. pinned/ray-ml:tag + # This is so we can re-run the build command safely. 
+ pinned_base_image = {} + for base_tag in tags: + base_image = f"{repo}/{image}:{base_tag}" + pinned_image = f"pinned/{image}:{base_tag}" + + pinned_base_image[base_image] = pinned_image + + print(f"docker tag {base_image} {pinned_image}") + + # Create commands to build the new layer for the base images. + for base_tag in tags: + base_image = f"{repo}/{image}:{base_tag}" + pinned_image = pinned_base_image[base_image] + + print(f"docker build --build-arg BASE_IMAGE={pinned_image} -t {base_image} .") + for subtag in tags[base_tag]: + if subtag == base_tag: + continue + + # This will overwrite the rayproject/ray-ml:tag image + # - but we still have the pinned/ image if we want to re-run! + target_image = f"{repo}/{image}:{subtag}" + print(f"docker tag {base_image} {target_image}") + + # Lastly, push new layers + print(f"docker push --all-tags {repo}/{image}") + + if __name__ == "__main__": - main() + fix_image = os.environ.get("FIX_IMAGE") + if not fix_image: + main() + else: + fix_docker_images(fix_image, os.environ.get("FIX_VERSION")) diff --git a/ci/ci.sh b/ci/ci.sh index 47ae6224b830..6c67efa25701 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -77,16 +77,18 @@ reload_env() { fi } -need_wheels() { - local error_code=1 +_need_wheels() { + local result="false" case "${OSTYPE}" in - linux*) if [ "${LINUX_WHEELS-}" = 1 ]; then error_code=0; fi;; - darwin*) if [ "${MAC_WHEELS-}" = 1 ]; then error_code=0; fi;; - msys*) if [ "${WINDOWS_WHEELS-}" = 1 ]; then error_code=0; fi;; + linux*) if [[ "${LINUX_WHEELS-}" == "1" ]]; then result="true"; fi;; + darwin*) if [[ "${MAC_WHEELS-}" == "1" ]]; then result="true"; fi;; + msys*) if [[ "${WINDOWS_WHEELS-}" == "1" ]]; then result="true"; fi;; esac - return "${error_code}" + echo "${result}" } +NEED_WHEELS="$(_need_wheels)" + upload_wheels() { local branch="" commit commit="$(git rev-parse --verify HEAD)" @@ -152,7 +154,7 @@ prepare_docker() { EXPOSE 8000 EXPOSE 10001 RUN pip install /${wheel}[serve] - RUN sudo apt update && sudo apt 
install curl -y + RUN (sudo apt update || true) && sudo apt install curl -y " > $tmp_dir/Dockerfile pushd $tmp_dir @@ -258,13 +260,14 @@ test_cpp() { } test_wheels() { - local result=0 flush_logs=0 + local result=0 + local flush_logs=0 - if need_wheels; then + if [[ "${NEED_WHEELS}" == "true" ]]; then "${WORKSPACE_DIR}"/ci/build/test-wheels.sh || { result=$? && flush_logs=1; } fi - if [ 0 -ne "${flush_logs}" ]; then + if [[ 0 -ne "${flush_logs}" ]]; then cat -- /tmp/ray/session_latest/logs/* || true sleep 60 # Explicitly sleep 60 seconds for logs to go through fi @@ -284,6 +287,8 @@ install_npm_project() { build_dashboard_front_end() { if [ "${OSTYPE}" = msys ]; then { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null + elif [ "${NO_DASHBOARD-}" = "1" ]; then + echo "Skipping dashboard build" else ( cd ray/dashboard/client @@ -308,7 +313,8 @@ build_sphinx_docs() { if [ "${OSTYPE}" = msys ]; then echo "WARNING: Documentation not built on Windows due to currently-unresolved issues" else - FAST=True make html + # TODO: revert to "make html" once "sphinx_panels" plugin is fully removed. + FAST=True make develop pip install datasets==2.0.0 RAY_MOCK_MODULES=0 RAY_DEDUP_LOGS=0 make doctest fi @@ -453,6 +459,7 @@ build_wheels() { -e "BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST:-}" -e "BUILDKITE_BAZEL_CACHE_URL=${BUILDKITE_BAZEL_CACHE_URL:-}" -e "RAY_DEBUG_BUILD=${RAY_DEBUG_BUILD:-}" + -e "BUILD_ONE_PYTHON_ONLY=${BUILD_ONE_PYTHON_ONLY:-}" ) IMAGE_NAME="quay.io/pypa/manylinux2014_${HOSTTYPE}" @@ -743,7 +750,7 @@ build() { _bazel_build_protobuf fi - if ! need_wheels; then + if [[ "${NEED_WHEELS}" != "true" ]]; then install_ray if [ "${LINT-}" = 1 ]; then # Try generating Sphinx documentation. To do this, we need to install Ray first. 
@@ -759,7 +766,7 @@ build() { install_go fi - if need_wheels; then + if [[ "${NEED_WHEELS}" == "true" ]]; then build_wheels fi } @@ -793,20 +800,8 @@ run_minimal_test() { # shellcheck disable=SC2086 bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_ray_minimal # shellcheck disable=SC2086 - bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env - # shellcheck disable=SC2086 - bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_2 - # shellcheck disable=SC2086 bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_utils - # Todo: Make compatible with python 3.9/3.10 - if [ "$1" != "3.9" ] && [ "$1" != "3.10" ]; then - # shellcheck disable=SC2086 - bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_complicated - fi - - # shellcheck disable=SC2086 - bazel test --test_output=streamed --config=ci ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_runtime_env_validation # shellcheck disable=SC2086 bazel test --test_output=streamed --config=ci --test_env=RAY_MINIMAL=1 ${BAZEL_EXPORT_OPTIONS} python/ray/tests/test_serve_ray_minimal # shellcheck disable=SC2086 diff --git a/ci/env/check_minimal_install.py b/ci/env/check_minimal_install.py index 01beb5adf4c7..aa3d6a7f337c 100644 --- a/ci/env/check_minimal_install.py +++ b/ci/env/check_minimal_install.py @@ -18,6 +18,7 @@ "opencensus", "prometheus_client", "smart_open", + "virtualenv", "torch", "tensorflow", "jax", diff --git a/ci/env/env_info.sh b/ci/env/env_info.sh index f39376fcc548..e84666a524ef 100755 --- a/ci/env/env_info.sh +++ b/ci/env/env_info.sh @@ -8,6 +8,10 @@ echo "Installed pip packages:" python -m pip freeze 2>/dev/null || echo 'Pip not installed' echo "----------------------------" +if [ -n 
"${BUILDKITE-}" ] && [ -d "/artifact-mount" ]; then + python -m pip freeze > /artifact-mount/pip_freeze.txt +fi + echo "GPU information" echo "----------------------------" GPUCMD="nvidia-smi" @@ -17,5 +21,9 @@ then else eval "${GPUCMD}" python -c "import torch; print('Torch cuda available:', torch.cuda.is_available())" + + if [ -n "${BUILDKITE-}" ] && [ -d "/artifact-mount" ]; then + eval "${GPUCMD}" > /artifact-mount/nvidia_smi.txt + fi fi echo "----------------------------" diff --git a/ci/env/install-bazel.sh b/ci/env/install-bazel.sh index 331aeabb9c3c..87c59b34875a 100755 --- a/ci/env/install-bazel.sh +++ b/ci/env/install-bazel.sh @@ -73,11 +73,14 @@ if [ "${BAZEL_CONFIG_ONLY-}" != "1" ]; then export PATH=$PATH:"$HOME/bin" fi - if [ "${architecture}" = "aarch64" ]; then + if [ "${architecture}" = "aarch64" ] || [ "${architecture}" = "arm64" ]; then # architecture is "aarch64", but the bazel tag is "arm64" url="https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-${platform}-arm64" elif [ "${architecture}" = "x86_64" ]; then url="https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-${platform}-amd64" + else + echo "Could not found matching bazelisk URL for platform ${platform} and architecture ${architecture}" + exit 1 fi if [ "$INSTALL_USER" = "1" ]; then diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index 6bf7e4959450..81fcd3f72f0f 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -233,7 +233,7 @@ install_upgrade_pip() { fi if "${python}" -m pip --version || "${python}" -m ensurepip; then # Configure pip if present - "${python}" -m pip install --upgrade pip + "${python}" -m pip install --upgrade "pip<23.1" # If we're in a CI environment, do some configuration if [ "${CI-}" = true ]; then @@ -297,102 +297,84 @@ download_mnist() { unzip "${HOME}/data/mnist.zip" -d "${HOME}/data" } -install_pip_packages() { +retry_pip_install() { + 
local pip_command=$1 + local status="0" + local errmsg="" + + # Try n times; we often encounter OpenSSL.SSL.WantReadError (or others) + # that break the entire CI job: Simply retry installation in this case + # after n seconds. + for _ in {1..3}; do + errmsg=$(eval "${pip_command}" 2>&1) && break + status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" && sleep 30 + done + if [ "$status" != "0" ]; then + echo "${status}" && return 1 + fi +} +install_pip_packages() { # Install modules needed in all jobs. # shellcheck disable=SC2262 alias pip="python -m pip" - if [ "${MINIMAL_INSTALL-}" != 1 ]; then - # Some architectures will build dm-tree from source. - # Move bazelrc to a different location temporarily to disable --config=ci settings - mv "$HOME/.bazelrc" "$HOME/._brc" || true - pip install --no-clean dm-tree==0.1.5 # --no-clean is due to: https://github.com/deepmind/tree/issues/5 - mv "$HOME/._brc" "$HOME/.bazelrc" || true - fi - - if { [ -n "${PYTHON-}" ] || [ "${DL-}" = "1" ]; } && [ "${MINIMAL_INSTALL-}" != 1 ]; then - # Remove this entire section once Serve dependencies are fixed. - if { [ -z "${BUILDKITE-}" ] || [ "${DL-}" = "1" ]; } && [ "${DOC_TESTING-}" != 1 ] && [ "${TRAIN_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ] && [ "${RLLIB_TESTING-}" != 1 ]; then - # We want to install the CPU version only. - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_dl.txt - fi - - # Try n times; we often encounter OpenSSL.SSL.WantReadError (or others) - # that break the entire CI job: Simply retry installation in this case - # after n seconds. - local status="0"; - local errmsg=""; - for _ in {1..3}; do - errmsg=$(CC=gcc pip install -Ur "${WORKSPACE_DIR}"/python/requirements.txt 2>&1) && break; - status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" 
&& sleep 30; - done - if [ "$status" != "0" ]; then - echo "${status}" && return 1 - fi + # Array to hold all requirements files to install later + requirements_files=() + # Single packages to install in sync with files + requirements_packages=() + # Packages to install _after_ previous files have been installed + # (e.g. to install a custom pyarrow or torch version). This + # would otherwise conflict with pinned dependencies in our requirements + # files. + delayed_packages=() - # Repeat for requirements_test.txt - local status="0"; - local errmsg=""; - for _ in {1..3}; do - errmsg=$(CC=gcc pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements_test.txt 2>&1) && break; - status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" && sleep 30; - done - if [ "$status" != "0" ]; then - echo "${status}" && return 1 - fi - - fi + requirements_files+=("${WORKSPACE_DIR}/python/requirements_test.txt") if [ "${LINT-}" = 1 ]; then install_linters - # readthedocs has an antiquated build env. - # This is a best effort to reproduce it locally to avoid doc build failures and hidden errors. - local python_version - python_version="$(python -s -c "import sys; print('%s.%s' % sys.version_info[:2])")" - if [ "${OSTYPE}" = msys ] && [ "${python_version}" = "3.8" ]; then - { echo "WARNING: Pillow binaries not available on Windows; cannot build docs"; } 2> /dev/null - else - pip install --use-deprecated=legacy-resolver -r "${WORKSPACE_DIR}"/doc/requirements-rtd.txt - pip install --use-deprecated=legacy-resolver -r "${WORKSPACE_DIR}"/doc/requirements-doc.txt - fi + + requirements_files+=("${WORKSPACE_DIR}/doc/requirements-doc.txt") fi # Additional default doc testing dependencies. 
if [ "${DOC_TESTING-}" = 1 ]; then - # For Ray Core and Ray Serve DAG visualization docs test - sudo apt-get install -y graphviz - pip install -U pydot # For DAG visualization - # For the dataset examples - sudo apt-get install -y tesseract-ocr - pip install -U pytesseract "spacy>=3" spacy_langdetect - python -m spacy download en_core_web_sm + # For Ray Core and Ray Serve DAG visualization docs test + dataset examples + sudo apt-get install -y graphviz tesseract-ocr + + # For DAG visualization + requirements_packages+=("pydot") + requirements_packages+=("pytesseract") + requirements_packages+=("spacy>=3") + requirements_packages+=("spacy_langdetect") fi # Additional RLlib test dependencies. if [ "${RLLIB_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_rllib.txt + requirements_files+=("${WORKSPACE_DIR}/python/requirements/ml/requirements_rllib.txt") #TODO(amogkam): Add this back to requirements_rllib.txt once mlagents no longer pins torch<1.9.0 version. pip install --no-dependencies mlagents==0.28.0 fi - SITE_PACKAGES=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + # Some Ray Train dependencies have to be installed with --no-deps, + # as sub-dependencies conflict. The packages still work for our workflows. + # Todo(krfricke): Try to remove once we move to Python 3.8 in CI. + local install_ml_no_deps=0 # Additional Train test dependencies. if [ "${TRAIN_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_train.txt + requirements_files+=("${WORKSPACE_DIR}/python/requirements/ml/requirements_train.txt") + install_ml_no_deps=1 fi - # Additional Tune/Doc test dependencies. 
if [ "${TUNE_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_tune.txt - download_mnist + requirements_files+=("${WORKSPACE_DIR}/python/requirements/ml/requirements_tune.txt") fi # For Tune, install upstream dependencies. if [ "${TUNE_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_upstream.txt + requirements_files+=("${WORKSPACE_DIR}/python/requirements/ml/requirements_upstream.txt") fi # Additional dependency for Ludwig. @@ -400,51 +382,87 @@ install_pip_packages() { # dependencies with Modin. if [ "${INSTALL_LUDWIG-}" = 1 ]; then # TODO: eventually pin this to master. - pip install -U "ludwig[test]>=0.4" "jsonschema>=4" + requirements_packages+=("ludwig[test]>=0.4") + requirements_packages+=("jsonschema>=4") fi # Additional dependency for time series libraries. # This cannot be included in requirements_tune.txt as it has conflicting # dependencies. if [ "${INSTALL_TIMESERIES_LIBS-}" = 1 ]; then - pip install -U "statsforecast==1.1.0" "prophet==1.1.1" + requirements_packages+=("statsforecast==1.5.0") + requirements_packages+=("prophet==1.1.1") + requirements_packages+=("holidays==0.24") # holidays 0.25 causes `import prophet` to fail. fi # Data processing test dependencies. 
if [ "${DATA_PROCESSING_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/data_processing/requirements.txt + requirements_files+=("${WORKSPACE_DIR}/python/requirements/data_processing/requirements.txt") fi if [ "${DATA_PROCESSING_TESTING-}" = 1 ]; then - pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/data_processing/requirements_dataset.txt + requirements_files+=("${WORKSPACE_DIR}/python/requirements/data_processing/requirements_dataset.txt") if [ -n "${ARROW_VERSION-}" ]; then if [ "${ARROW_VERSION-}" = nightly ]; then - pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --prefer-binary --pre pyarrow + delayed_packages+=("--extra-index-url") + delayed_packages+=("https://pypi.fury.io/arrow-nightlies/") + delayed_packages+=("--prefer-binary") + delayed_packages+=("--pre") + delayed_packages+=("pyarrow") else - pip install -U pyarrow=="${ARROW_VERSION}" + delayed_packages+=("pyarrow==${ARROW_VERSION}") fi fi if [ -n "${ARROW_MONGO_VERSION-}" ]; then - pip install -U pymongoarrow=="${ARROW_MONGO_VERSION}" + delayed_packages+=("pymongoarrow==${ARROW_MONGO_VERSION}") fi fi - # Remove this entire section once Serve dependencies are fixed. - if [ "${MINIMAL_INSTALL-}" != 1 ] && [ "${DOC_TESTING-}" != 1 ] && [ "${TRAIN_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ] && [ "${RLLIB_TESTING-}" != 1 ]; then - # If CI has deemed that a different version of Torch - # should be installed, then upgrade/downgrade to that specific version. 
- if [ -n "${TORCH_VERSION-}" ]; then - case "${TORCH_VERSION-1.9.0}" in - 1.9.0) TORCHVISION_VERSION=0.10.0;; - 1.8.1) TORCHVISION_VERSION=0.9.1;; - 1.5) TORCHVISION_VERSION=0.6.0;; - *) TORCHVISION_VERSION=0.5.0;; - esac - pip install --use-deprecated=legacy-resolver --upgrade torch=="${TORCH_VERSION-1.9.0}" torchvision=="${TORCHVISION_VERSION}" - fi + if [ "${install_ml_no_deps}" = 1 ]; then + # Install these requirements first. Their dependencies may be overwritten later + # by the main install. + pip install -r "${WORKSPACE_DIR}/python/requirements/ml/requirements_no_deps.txt" + fi + + retry_pip_install "CC=gcc pip install -Ur ${WORKSPACE_DIR}/python/requirements.txt" + + # Install deeplearning libraries (Torch + TensorFlow) + if [ -n "${TORCH_VERSION-}" ] || [ "${DL-}" = "1" ] || [ "${RLLIB_TESTING-}" = 1 ] || [ "${TRAIN_TESTING-}" = 1 ] || [ "${TUNE_TESTING-}" = 1 ]; then + # If we require a custom torch version, use that + if [ -n "${TORCH_VERSION-}" ]; then + case "${TORCH_VERSION-1.9.0}" in + 1.9.0) TORCHVISION_VERSION=0.10.0;; + 1.8.1) TORCHVISION_VERSION=0.9.1;; + 1.6) TORCHVISION_VERSION=0.7.0;; + 1.5) TORCHVISION_VERSION=0.6.0;; + *) TORCHVISION_VERSION=0.5.0;; + esac + # Install right away, as some dependencies (e.g. torch-spline-conv) need + # torch to be installed for their own install. + pip install -U "torch==${TORCH_VERSION-1.9.0}" "torchvision==${TORCHVISION_VERSION}" + # We won't add requirements_dl.txt as it would otherwise overwrite our custom + # torch. Thus we also have to install tensorflow manually. + TF_PACKAGE=$(grep "tensorflow==" "${WORKSPACE_DIR}/python/requirements/ml/requirements_dl.txt") + TFPROB_PACKAGE=$(grep "tensorflow-probability==" "${WORKSPACE_DIR}/python/requirements/ml/requirements_dl.txt") + + # %%;* deletes everything after ; to get rid of e.g. python version specifiers + pip install -U "${TF_PACKAGE%%;*}" "${TFPROB_PACKAGE%%;*}" + else + # Otherwise, use pinned default torch version. 
+ # Again, install right away, as some dependencies (e.g. torch-spline-conv) need + # torch to be installed for their own install. + TORCH_PACKAGE=$(grep "torch==" "${WORKSPACE_DIR}/python/requirements/ml/requirements_dl.txt") + TORCHVISION_PACKAGE=$(grep "torchvision==" "${WORKSPACE_DIR}/python/requirements/ml/requirements_dl.txt") + + # %%;* deletes everything after ; to get rid of e.g. python version specifiers + pip install "${TORCH_PACKAGE%%;*}" "${TORCHVISION_PACKAGE%%;*}" + requirements_files+=("${WORKSPACE_DIR}/python/requirements/ml/requirements_dl.txt") + fi fi # Inject our own mirror for the CIFAR10 dataset if [ "${TRAIN_TESTING-}" = 1 ] || [ "${TUNE_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then + SITE_PACKAGES=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + TF_CIFAR="${SITE_PACKAGES}/tensorflow/python/keras/datasets/cifar10.py" TORCH_CIFAR="${SITE_PACKAGES}/torchvision/datasets/cifar.py" @@ -454,17 +472,46 @@ install_pip_packages() { "$TORCH_CIFAR" fi + # Generate the pip command with collected requirements files + pip_cmd="pip install -U -c ${WORKSPACE_DIR}/python/requirements.txt" + for file in "${requirements_files[@]}"; do + pip_cmd+=" -r ${file}" + done + + # Expand single requirements + if [ "${#requirements_packages[@]}" -gt 0 ]; then + pip_cmd+=" ${requirements_packages[*]}" + fi + + # Install + eval "${pip_cmd}" + + # Install delayed packages + if [ "${#delayed_packages[@]}" -gt 0 ]; then + pip install -U -c "${WORKSPACE_DIR}/python/requirements.txt" "${delayed_packages[@]}" + fi + # Additional Tune dependency for Horovod. # This must be run last (i.e., torch cannot be re-installed after this) if [ "${INSTALL_HOROVOD-}" = 1 ]; then "${SCRIPT_DIR}"/install-horovod.sh fi - # install hdfs if needed. 
- if [ "${INSTALL_HDFS-}" = 1 ]; then - "${SCRIPT_DIR}"/install-hdfs.sh + if [ "${TUNE_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then + download_mnist + fi + + if [ "${DOC_TESTING-}" = 1 ]; then + # Todo: This downgrades spacy and related dependencies because + # `en_core_web_sm` is only compatible with spacy < 3.6. + # We should move to a model that does not depend on a stale version. + python -m spacy download en_core_web_sm fi +} +install_thirdparty_packages() { + # shellcheck disable=SC2262 + alias pip="python -m pip" CC=gcc pip install psutil setproctitle==1.2.2 colorama --target="${WORKSPACE_DIR}/python/ray/thirdparty_files" } @@ -491,7 +538,16 @@ install_dependencies() { fi fi - install_pip_packages + # install hdfs if needed. + if [ "${INSTALL_HDFS-}" = 1 ]; then + "${SCRIPT_DIR}"/install-hdfs.sh + fi + + if [ "${MINIMAL_INSTALL-}" != "1" ]; then + install_pip_packages + fi + + install_thirdparty_packages } install_dependencies "$@" diff --git a/ci/env/install-minimal.sh b/ci/env/install-minimal.sh index e99e453ea11e..045239badc48 100755 --- a/ci/env/install-minimal.sh +++ b/ci/env/install-minimal.sh @@ -1,5 +1,12 @@ #!/usr/bin/env bash +if [ "$1" == "3.11" ]; then + # TODO: fix build wheels unsupported tags in the future + echo "'set -xe' not working for Python 3.11" +else + set -xe +fi + # Python version can be specified as 3.7, 3.8, 3.9, etc.. 
if [ -z "$1" ]; then PYTHON_VERSION=${PYTHON-3.7} diff --git a/ci/lint/check-bazel-team-owner.py b/ci/lint/check-bazel-team-owner.py index 11bc69d0417e..78e51f80da5c 100644 --- a/ci/lint/check-bazel-team-owner.py +++ b/ci/lint/check-bazel-team-owner.py @@ -25,8 +25,13 @@ def perform_check(raw_xml_string: str): missing_owners = [] for rule in tree.findall("rule"): test_name = rule.attrib["name"] - tags = [child.attrib["value"] for child in rule.find("list").getchildren()] - team_owner = [t for t in tags if t.startswith("team")] + tags = [] + for lst in rule.findall("list"): + if lst.attrib["name"] != "tags": + continue + tags = [child.attrib["value"] for child in lst.getchildren()] + break + team_owner = [t for t in tags if t.startswith("team:")] if len(team_owner) == 0: missing_owners.append(test_name) owners[test_name] = team_owner diff --git a/ci/lint/format.sh b/ci/lint/format.sh index 7dfc6be0dafe..309cdfa52c42 100755 --- a/ci/lint/format.sh +++ b/ci/lint/format.sh @@ -155,12 +155,14 @@ BLACK_EXCLUDES=( `'python/ray/core/src/ray/gcs/*|'` `'python/ray/thirdparty_files/*|'` `'python/ray/_private/thirdparty/*|'` - `'python/ray/serve/tests/test_config_files/syntax_error\.py' + `'python/ray/serve/tests/test_config_files/syntax_error\.py|'` + `'doc/external/*' ) GIT_LS_EXCLUDES=( ':(exclude)python/ray/cloudpickle/' ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py' + ':(exclude)doc/external/' ) JAVA_EXCLUDES=( diff --git a/ci/pipeline/determine_tests_to_run.py b/ci/pipeline/determine_tests_to_run.py index bed9110be938..7a3cd86d4320 100644 --- a/ci/pipeline/determine_tests_to_run.py +++ b/ci/pipeline/determine_tests_to_run.py @@ -88,6 +88,8 @@ def get_commit_range(): # Whether all RLlib tests should be run. # Set to 1 only when a source file in `ray/rllib` has been changed. 
RAY_CI_RLLIB_DIRECTLY_AFFECTED = 0 + # Whether to run all RLlib contrib tests + RAY_CI_RLLIB_CONTRIB_AFFECTED = 0 RAY_CI_SERVE_AFFECTED = 0 RAY_CI_CORE_CPP_AFFECTED = 0 RAY_CI_CPP_AFFECTED = 0 @@ -179,6 +181,9 @@ def get_commit_range(): RAY_CI_RLLIB_DIRECTLY_AFFECTED = 1 RAY_CI_LINUX_WHEELS_AFFECTED = 1 RAY_CI_MACOS_WHEELS_AFFECTED = 1 + elif re.match("rllib_contrib/", changed_file): + if not changed_file.endswith(".md"): + RAY_CI_RLLIB_CONTRIB_AFFECTED = 1 elif changed_file.startswith("python/ray/serve"): RAY_CI_DOC_AFFECTED = 1 RAY_CI_SERVE_AFFECTED = 1 @@ -307,6 +312,8 @@ def get_commit_range(): RAY_CI_TRAIN_AFFECTED = 1 RAY_CI_RLLIB_AFFECTED = 1 RAY_CI_RLLIB_DIRECTLY_AFFECTED = 1 + # the rllib contrib ci should only be run on pull requests + RAY_CI_RLLIB_CONTRIB_AFFECTED = 0 RAY_CI_SERVE_AFFECTED = 1 RAY_CI_CPP_AFFECTED = 1 RAY_CI_CORE_CPP_AFFECTED = 1 @@ -331,6 +338,7 @@ def get_commit_range(): "RAY_CI_TRAIN_AFFECTED={}".format(RAY_CI_TRAIN_AFFECTED), "RAY_CI_RLLIB_AFFECTED={}".format(RAY_CI_RLLIB_AFFECTED), "RAY_CI_RLLIB_DIRECTLY_AFFECTED={}".format(RAY_CI_RLLIB_DIRECTLY_AFFECTED), + "RAY_CI_RLLIB_CONTRIB_AFFECTED={}".format(RAY_CI_RLLIB_CONTRIB_AFFECTED), "RAY_CI_SERVE_AFFECTED={}".format(RAY_CI_SERVE_AFFECTED), "RAY_CI_DASHBOARD_AFFECTED={}".format(RAY_CI_DASHBOARD_AFFECTED), "RAY_CI_DOC_AFFECTED={}".format(RAY_CI_DOC_AFFECTED), diff --git a/ci/pipeline/py_dep_analysis.py b/ci/pipeline/py_dep_analysis.py index c9aa6a701b55..4c5720475487 100644 --- a/ci/pipeline/py_dep_analysis.py +++ b/ci/pipeline/py_dep_analysis.py @@ -168,6 +168,9 @@ def build_dep_graph() -> DepGraph: continue full = _full_module_path(module, f) + if full.startswith("ray.serve.tests.test_config_files."): + # Skip ray serve test files; can contain invalid python code. 
+ continue if full not in graph.ids: graph.ids[full] = len(graph.ids) diff --git a/cpp/include/ray/api/metric.h b/cpp/include/ray/api/metric.h index d3d87df60698..10cb95257dd5 100644 --- a/cpp/include/ray/api/metric.h +++ b/cpp/include/ray/api/metric.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include diff --git a/cpp/src/ray/runtime/task/native_task_submitter.cc b/cpp/src/ray/runtime/task/native_task_submitter.cc index 3e3228a50da4..e69cf61fa164 100644 --- a/cpp/src/ray/runtime/task/native_task_submitter.cc +++ b/cpp/src/ray/runtime/task/native_task_submitter.cc @@ -66,11 +66,14 @@ ObjectID NativeTaskSubmitter::Submit(InvocationSpec &invocation, options.name = call_options.name; options.resources = call_options.resources; options.serialized_runtime_env_info = call_options.serialized_runtime_env_info; - std::optional> return_refs; + std::vector return_refs; if (invocation.task_type == TaskType::ACTOR_TASK) { - return_refs = core_worker.SubmitActorTask( - invocation.actor_id, BuildRayFunction(invocation), invocation.args, options); - if (!return_refs.has_value()) { + auto status = core_worker.SubmitActorTask(invocation.actor_id, + BuildRayFunction(invocation), + invocation.args, + options, + return_refs); + if (!status.ok()) { return ObjectID::Nil(); } } else { @@ -95,7 +98,7 @@ ObjectID NativeTaskSubmitter::Submit(InvocationSpec &invocation, ""); } std::vector return_ids; - for (const auto &ref : return_refs.value()) { + for (const auto &ref : return_refs) { return_ids.push_back(ObjectID::FromBinary(ref.object_id())); } return return_ids[0]; diff --git a/dashboard/agent.py b/dashboard/agent.py index 345099ff7c25..df57590ff0b6 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -15,7 +15,7 @@ import ray.dashboard.consts as dashboard_consts import ray.dashboard.utils as dashboard_utils from ray.dashboard.consts import _PARENT_DEATH_THREASHOLD -from ray._private.gcs_pubsub import GcsAioPublisher, GcsPublisher +from ray._private.gcs_pubsub 
import GcsAioPublisher from ray._raylet import GcsClient from ray._private.gcs_utils import GcsAioClient from ray._private.ray_logging import setup_component_logger @@ -263,7 +263,9 @@ async def _check_parent(): ray._private.utils.publish_error_to_driver( ray_constants.RAYLET_DIED_ERROR, msg, - gcs_publisher=GcsPublisher(address=self.gcs_address), + gcs_publisher=ray._raylet.GcsPublisher( + address=self.gcs_address + ), ) else: logger.info(msg) diff --git a/dashboard/client/src/App.tsx b/dashboard/client/src/App.tsx index c183a7669d69..6647b4129cf6 100644 --- a/dashboard/client/src/App.tsx +++ b/dashboard/client/src/App.tsx @@ -4,13 +4,17 @@ import dayjs from "dayjs"; import duration from "dayjs/plugin/duration"; import React, { Suspense, useEffect, useState } from "react"; import { HashRouter, Navigate, Route, Routes } from "react-router-dom"; -import ActorDetailPage from "./pages/actor/ActorDetail"; +import ActorDetailPage, { ActorDetailLayout } from "./pages/actor/ActorDetail"; +import { ActorLayout } from "./pages/actor/ActorLayout"; import Loading from "./pages/exception/Loading"; import JobList, { JobsLayout } from "./pages/job"; import { JobDetailChartsPage } from "./pages/job/JobDetail"; -import { JobDetailActorsPage } from "./pages/job/JobDetailActorPage"; +import { + JobDetailActorDetailWrapper, + JobDetailActorsPage, +} from "./pages/job/JobDetailActorPage"; import { JobDetailInfoPage } from "./pages/job/JobDetailInfoPage"; -import { JobDetailLayout } from "./pages/job/JobDetailLayout"; +import { JobDetailLayout, JobPage } from "./pages/job/JobDetailLayout"; import { MainNavLayout } from "./pages/layout/MainNavLayout"; import { SideTabPage } from "./pages/layout/SideTabLayout"; import { LogsLayout } from "./pages/log/Logs"; @@ -28,6 +32,11 @@ import { import { ServeApplicationsListPage } from "./pages/serve/ServeApplicationsListPage"; import { ServeLayout } from "./pages/serve/ServeLayout"; import { ServeReplicaDetailPage } from 
"./pages/serve/ServeReplicaDetailPage"; +import { + ServeControllerDetailPage, + ServeHttpProxyDetailPage, +} from "./pages/serve/ServeSystemActorDetailPage"; +import { TaskPage } from "./pages/task/TaskPage"; import { getNodeList } from "./service/node"; import { lightTheme } from "./theme"; @@ -161,44 +170,70 @@ const App = () => { } path="" /> - } path="nodes/:id" /> + } path="nodes/:id" /> } path="jobs"> } path="" /> - } path=":id"> + } path=":id"> + } path=""> + + + + } + path="info" + /> + + + + } + path="" + /> + + + + } + path="actors" + /> + - - + + + } - path="info" - /> - - - - } - path="" - /> - - - - } - path="actors" - /> - } path="actors/:id" /> + path="actors/:actorId" + > + } path="" /> + } path="tasks/:taskId" /> + + } path="tasks/:taskId" /> + + + } path="actors"> + } path="" /> + } path=":actorId"> + } path="" /> + } path="tasks/:taskId" /> - } path="actors" /> - } path="actors/:id" /> } path="metrics" /> } path="serve"> } path="" /> + } + path="controller" + /> + } + path="httpProxies/:httpProxyId" + /> } path="applications/:applicationName" diff --git a/dashboard/client/src/common/CodeDialogButton/CodeDialogButton.tsx b/dashboard/client/src/common/CodeDialogButton/CodeDialogButton.tsx index 711616fa1092..0aea9c9f4d61 100644 --- a/dashboard/client/src/common/CodeDialogButton/CodeDialogButton.tsx +++ b/dashboard/client/src/common/CodeDialogButton/CodeDialogButton.tsx @@ -5,9 +5,11 @@ import { makeStyles, Typography, } from "@material-ui/core"; +import classNames from "classnames"; import yaml from "js-yaml"; import React, { useState } from "react"; import DialogWithTitle from "../DialogWithTitle"; +import { ClassNameProps } from "../props"; const useStyles = makeStyles((theme) => createStyles({ @@ -15,7 +17,8 @@ const useStyles = makeStyles((theme) => whiteSpace: "pre", fontFamily: "SFMono-Regular,Consolas,Liberation Mono,Menlo,monospace", padding: theme.spacing(2), - // borderRadius: theme.spacing(1), + overflow: "scroll", + maxHeight: 600, }, 
}), ); @@ -92,13 +95,14 @@ const useCodeDialogButtonWithPreviewStyles = makeStyles((theme) => }), ); -type CodeDialogButtonWithPreviewProps = CodeDialogButtonProps; +type CodeDialogButtonWithPreviewProps = CodeDialogButtonProps & ClassNameProps; /** * Similar to CodeDialogButton but also shows a snippet of the expanded text next to the button. */ export const CodeDialogButtonWithPreview = ({ code, buttonText, + className, ...props }: CodeDialogButtonWithPreviewProps) => { const classes = useCodeDialogButtonWithPreviewStyles(); @@ -109,7 +113,7 @@ export const CodeDialogButtonWithPreview = ({ const buttonTextToPass = buttonText ?? "Expand"; return ( -
+
{codeText} alignItems: "center", fontWeight: 500, cursor: "pointer", + marginRight: theme.spacing(1), }, icon: { marginRight: theme.spacing(1), width: 24, height: 24, }, - body: { - marginTop: theme.spacing(1), - }, - bodyHidden: { - display: "none", - }, }), ); @@ -43,6 +38,10 @@ type CollapsibleSectionProps = PropsWithChildren< onExpandButtonClick?: () => void; title: string; startExpanded?: boolean; + /** + * Icon to show to the right of the title. + */ + icon?: React.ReactNode; /** * An optimization to not avoid re-rendering the contents of the collapsible section. * When enabled, we will keep the content around when collapsing but hide it via css. @@ -64,19 +63,13 @@ export const CollapsibleSection = forwardRef< className, children, keepRendered, + icon, }, ref, ) => { const classes = useStyles(); const [internalExpanded, setInternalExpanded] = useState(startExpanded); const finalExpanded = expanded !== undefined ? expanded : internalExpanded; - const [rendered, setRendered] = useState(finalExpanded); - - useEffect(() => { - if (finalExpanded) { - setRendered(true); - } - }, [finalExpanded]); const handleExpandClick = () => { onExpandButtonClick?.(); @@ -85,28 +78,84 @@ export const CollapsibleSection = forwardRef< return (
- - {finalExpanded ? ( - - ) : ( - - )} - {title} - - {(finalExpanded || (keepRendered && rendered)) && ( -
+ - {children} -
- )} + {finalExpanded ? ( + + ) : ( + + )} + {title} + + {icon} + + + {children} +
); }, ); + +const useHideableBlockStyles = makeStyles((theme) => + createStyles({ + body: { + marginTop: theme.spacing(1), + }, + bodyHidden: { + display: "none", + }, + }), +); + +type HideableBlockProps = PropsWithChildren< + { + visible: boolean; + /** + * An optimization to not avoid re-rendering the contents of the collapsible section. + * When enabled, we will keep the content around when collapsing but hide it via css. + */ + keepRendered?: boolean; + } & ClassNameProps +>; + +/** + * Component that can be hidden depending on a passed in prop. Supports an optimization + * to keep the component rendered (but not visible) when hidden to avoid re-rendering + * when component is shown again. + */ +export const HideableBlock = ({ + visible, + keepRendered, + children, +}: HideableBlockProps) => { + const classes = useHideableBlockStyles(); + + // visible represents whether the component is viewable in the browser. + // Rendered represents whether the DOM elements exist in the DOM tree. + // If !visible && rendered, then the elements are in the DOM but are + // not drawn via CSS visibility rules. + const [rendered, setRendered] = useState(visible); + + useEffect(() => { + if (visible) { + setRendered(true); + } + }, [visible]); + + // Optimization to keep the component rendered (but not visible) when hidden + // to avoid re-rendering when component is shown again. + return visible || (keepRendered && rendered) ? ( +
+ {children} +
+ ) : null; +}; diff --git a/dashboard/client/src/common/JobStatus.tsx b/dashboard/client/src/common/JobStatus.tsx new file mode 100644 index 000000000000..09473674e031 --- /dev/null +++ b/dashboard/client/src/common/JobStatus.tsx @@ -0,0 +1,155 @@ +import { Box, createStyles, makeStyles } from "@material-ui/core"; +import classNames from "classnames"; +import React from "react"; +import { + RiCheckboxCircleFill, + RiCloseCircleFill, + RiLoader4Line, + RiStopCircleFill, +} from "react-icons/ri"; +import { StatusChip } from "../components/StatusChip"; +import { JobStatus, UnifiedJob } from "../type/job"; +import { ClassNameProps } from "./props"; + +const useJobRunningIconStyles = makeStyles((theme) => + createStyles({ + icon: { + width: 20, + height: 20, + }, + iconSmall: { + width: 16, + height: 16, + }, + "@keyframes spinner": { + from: { + transform: "rotate(0deg)", + }, + to: { + transform: "rotate(360deg)", + }, + }, + iconRunning: { + color: "#1E88E5", + animationName: "$spinner", + animationDuration: "1000ms", + animationIterationCount: "infinite", + animationTimingFunction: "linear", + }, + }), +); + +type JobRunningIconProps = { small?: boolean } & ClassNameProps; + +export const JobRunningIcon = ({ + className, + small = false, +}: JobRunningIconProps) => { + const classes = useJobRunningIconStyles(); + return ( + + ); +}; + +const useJobStatusIconStyles = makeStyles((theme) => + createStyles({ + icon: { + width: 20, + height: 20, + }, + iconSmall: { + width: 16, + height: 16, + }, + colorSuccess: { + color: theme.palette.success.main, + }, + colorError: { + color: theme.palette.error.main, + }, + colorStopped: { + color: "#757575", + }, + }), +); + +type JobStatusIconProps = { + job: UnifiedJob; + small?: boolean; +} & ClassNameProps; + +export const JobStatusIcon = ({ + job, + small = false, + className, +}: JobStatusIconProps) => { + const classes = useJobStatusIconStyles(); + switch (job.status) { + case JobStatus.SUCCEEDED: + return ( + + ); + case 
JobStatus.FAILED: + return ( + + ); + case JobStatus.STOPPED: + return ( + + ); + default: + return ; + } +}; + +type JobStatusWithIconProps = { + job: UnifiedJob; +}; + +export const JobStatusWithIcon = ({ job }: JobStatusWithIconProps) => { + return ( + + } + /> + + ); +}; diff --git a/dashboard/client/src/common/MultiTabLogViewer.tsx b/dashboard/client/src/common/MultiTabLogViewer.tsx new file mode 100644 index 000000000000..e9a9d617cc60 --- /dev/null +++ b/dashboard/client/src/common/MultiTabLogViewer.tsx @@ -0,0 +1,244 @@ +import { + Box, + createStyles, + IconButton, + makeStyles, + Tab, + Tabs, + Typography, +} from "@material-ui/core"; +import React, { useState } from "react"; +import { RiExternalLinkLine, RiSortAsc, RiSortDesc } from "react-icons/ri"; +import { Link } from "react-router-dom"; +import { useStateApiLogs } from "../pages/log/hooks"; +import { LogViewer } from "../pages/log/LogViewer"; +import { HideableBlock } from "./CollapsibleSection"; +import { ClassNameProps } from "./props"; + +const useStyles = makeStyles((theme) => + createStyles({ + tabs: { + borderBottom: `1px solid ${theme.palette.divider}`, + }, + }), +); + +export type MultiTabLogViewerTabDetails = { + title: string; +} & LogViewerData; + +export type MultiTabLogViewerProps = { + tabs: MultiTabLogViewerTabDetails[]; + otherLogsLink?: string; +} & ClassNameProps; + +export const MultiTabLogViewer = ({ + tabs, + otherLogsLink, + className, +}: MultiTabLogViewerProps) => { + const classes = useStyles(); + const [value, setValue] = useState(tabs[0]?.title); + const [expanded, setExpanded] = useState(false); + + const currentTab = tabs.find((tab) => tab.title === value); + + if (tabs.length === 0) { + return No logs to display.; + } + + return ( +
+ + + {(tabs.length > 1 || otherLogsLink) && ( + { + setValue(newValue); + }} + indicatorColor="primary" + > + {tabs.map(({ title }) => ( + + ))} + {otherLogsLink && ( + + Other logs   + + } + onClick={(event) => { + // Prevent the tab from changing + setValue(value); + }} + component={Link} + to={otherLogsLink} + target="_blank" + rel="noopener noreferrer" + /> + )} + + )} + + {!currentTab ? ( + Please select a tab. + ) : ( + tabs.map((tab) => { + const { title, ...data } = tab; + return ( + + + + ); + }) + )} + + { + setExpanded(!expanded); + }} + > + {expanded ? : } + + +
+ ); +}; + +type TextData = { + contents: string; +}; +type FileData = { + nodeId: string | null; + filename?: string; +}; +type ActorData = { + actorId: string | null; + suffix: "out" | "err"; +}; +type TaskData = { + taskId: string | null; + suffix: "out" | "err"; +}; + +type LogViewerData = TextData | FileData | ActorData | TaskData; + +const isLogViewerDataText = (data: LogViewerData): data is TextData => + "contents" in data; + +const isLogViewerDataActor = (data: LogViewerData): data is ActorData => + "actorId" in data; + +const isLogViewerDataTask = (data: LogViewerData): data is TaskData => + "taskId" in data; + +export type StateApiLogViewerProps = { + height?: number; + data: LogViewerData; +}; + +export const StateApiLogViewer = ({ + height = 300, + data, +}: StateApiLogViewerProps) => { + if (isLogViewerDataText(data)) { + return ; + } else if (isLogViewerDataActor(data)) { + return ; + } else if (isLogViewerDataTask(data)) { + return ; + } else { + return ; + } +}; + +const TextLogViewer = ({ + height = 300, + contents, +}: { + height: number; + contents: string; +}) => { + return ; +}; + +const FileLogViewer = ({ + height = 300, + nodeId, + filename, +}: { + height: number; +} & FileData) => { + const apiData = useStateApiLogs({ nodeId, filename }, filename); + return ; +}; + +const ActorLogViewer = ({ + height = 300, + actorId, + suffix, +}: { + height: number; +} & ActorData) => { + const apiData = useStateApiLogs( + { actorId, suffix }, + `actor-log-${actorId}.${suffix}`, + ); + return ; +}; + +const TaskLogViewer = ({ + height = 300, + taskId, + suffix, +}: { + height: number; +} & TaskData) => { + const apiData = useStateApiLogs( + { taskId, suffix }, + `task-log-${taskId}.${suffix}`, + ); + return ; +}; + +const ApiLogViewer = ({ + apiData: { downloadUrl, log, path, refresh }, + height = 300, +}: { + apiData: ReturnType; + height: number; +}) => { + return typeof log === "string" ? 
( + { + refresh(); + }} + /> + ) : ( + Failed to load + ); +}; diff --git a/dashboard/client/src/common/ProfilingLink.tsx b/dashboard/client/src/common/ProfilingLink.tsx index bf432a712e87..323d4a063b18 100644 --- a/dashboard/client/src/common/ProfilingLink.tsx +++ b/dashboard/client/src/common/ProfilingLink.tsx @@ -1,3 +1,4 @@ +import { Link } from "@material-ui/core"; import React, { PropsWithChildren } from "react"; import { ClassNameProps } from "./props"; @@ -19,14 +20,14 @@ export const CpuProfilingLink = ({ } return ( - Stack Trace{type ? ` (${type})` : ""} - + ); }; @@ -40,13 +41,13 @@ export const CpuStackTraceLink = ({ } return ( - CPU Flame Graph{type ? ` (${type})` : ""} - + ); }; diff --git a/dashboard/client/src/common/Section.tsx b/dashboard/client/src/common/Section.tsx new file mode 100644 index 000000000000..bcbdada9ad36 --- /dev/null +++ b/dashboard/client/src/common/Section.tsx @@ -0,0 +1,57 @@ +import { + Box, + BoxProps, + createStyles, + makeStyles, + Paper, + Typography, +} from "@material-ui/core"; +import classNames from "classnames"; +import React, { PropsWithChildren } from "react"; +import { ClassNameProps } from "./props"; + +const useStyles = makeStyles((theme) => + createStyles({ + contentContainer: { + padding: theme.spacing(2), + height: "100%", + }, + contentContainerNoTopPadding: { + paddingTop: 0, + }, + }), +); + +type SectionProps = { + title?: string; + noTopPadding?: boolean; +} & ClassNameProps & + BoxProps; + +export const Section = ({ + title, + children, + className, + noTopPadding = false, + ...props +}: PropsWithChildren) => { + const classes = useStyles(); + + return ( + + {title && ( + + {title} + + )} + + {children} + + + ); +}; diff --git a/dashboard/client/src/common/ServeStatus.component.test.tsx b/dashboard/client/src/common/ServeStatus.component.test.tsx new file mode 100644 index 000000000000..5436f583e02f --- /dev/null +++ b/dashboard/client/src/common/ServeStatus.component.test.tsx @@ -0,0 +1,60 @@ +import { 
render, screen } from "@testing-library/react"; +import React from "react"; +import { ServeApplication, ServeApplicationStatus } from "../type/serve"; +import { ServeStatusIcon } from "./ServeStatus"; + +const APP: ServeApplication = { + name: "MyServeApp", + route_prefix: "/my-serve-app", + docs_path: null, + status: ServeApplicationStatus.RUNNING, + message: "", + last_deployed_time_s: 1682029771.0748637, + deployed_app_config: null, + deployments: {}, +}; + +describe("ServeStatusIcon", () => { + it("renders RUNNING status", async () => { + render(); + + await screen.findByTestId("serve-status-icon"); + + const icon = screen.getByTestId("serve-status-icon"); + const classList = icon.getAttribute("class"); + expect(classList).toContain("colorSuccess"); + }); + + it("renders NOT_STARTED status", async () => { + render( + , + ); + + await screen.findByTestId("serve-status-icon"); + + expect(screen.queryByTestId("serve-status-icon")).not.toHaveClass( + "colorSuccess", + ); + expect(screen.queryByTestId("serve-status-icon")).not.toHaveClass( + "colorError", + ); + }); + + it("renders DEPLOY_FAILED status", async () => { + render( + , + ); + + await screen.findByTestId("serve-status-icon"); + + const icon = screen.getByTestId("serve-status-icon"); + const classList = icon.getAttribute("class"); + expect(classList).toContain("colorError"); + }); +}); diff --git a/dashboard/client/src/common/ServeStatus.tsx b/dashboard/client/src/common/ServeStatus.tsx new file mode 100644 index 000000000000..dd4ebad48889 --- /dev/null +++ b/dashboard/client/src/common/ServeStatus.tsx @@ -0,0 +1,77 @@ +import { createStyles, makeStyles } from "@material-ui/core"; +import classNames from "classnames"; +import React from "react"; +import { + RiCloseCircleFill, + RiRecordCircleFill, + RiStopCircleFill, +} from "react-icons/ri"; +import { ServeApplication } from "../type/serve"; +import { JobRunningIcon } from "./JobStatus"; +import { ClassNameProps } from "./props"; + +type 
ServeStatusIconProps = { + app: ServeApplication; + small: boolean; +} & ClassNameProps; + +const useServeStatusIconStyles = makeStyles((theme) => + createStyles({ + icon: { + width: 20, + height: 20, + marginRight: 8, + }, + iconSmall: { + width: 16, + height: 16, + }, + colorSuccess: { + color: theme.palette.success.main, + }, + colorError: { + color: theme.palette.error.main, + }, + }), +); + +export const ServeStatusIcon = ({ + app, + small, + className, +}: ServeStatusIconProps) => { + const classes = useServeStatusIconStyles(); + + switch (app.status) { + case "RUNNING": + return ( + + ); + case "NOT_STARTED": + return ( + + ); + case "DEPLOY_FAILED": + return ( + + ); + default: + // DEPLOYING || DELETEING + return ( + + ); + } +}; diff --git a/dashboard/client/src/common/util.ts b/dashboard/client/src/common/util.ts index 42571de261a9..c25914c2a175 100644 --- a/dashboard/client/src/common/util.ts +++ b/dashboard/client/src/common/util.ts @@ -1,3 +1,5 @@ +import _ from "lodash"; + export const getWeightedAverage = ( input: { weight: number; @@ -24,3 +26,15 @@ export const filterObj = (obj: Record, filterFn: any) => export const mapObj = (obj: Record, filterFn: any) => Object.fromEntries(Object.entries(obj).map(filterFn) as any[]); + +export const filterRuntimeEnvSystemVariables = ( + runtime_env: Record, +): Record => { + const out = _.pickBy(runtime_env, (_, key) => { + if (key.startsWith("_")) { + return false; + } + return true; + }); + return out; +}; diff --git a/dashboard/client/src/common/util.unit.test.ts b/dashboard/client/src/common/util.unit.test.ts new file mode 100644 index 000000000000..86c1758d48b6 --- /dev/null +++ b/dashboard/client/src/common/util.unit.test.ts @@ -0,0 +1,34 @@ +import { filterRuntimeEnvSystemVariables } from "./util"; + +describe("filterRuntimeEnvSystemVariables", () => { + it("filters out system variables", () => { + expect( + filterRuntimeEnvSystemVariables({ + pip: { + pip_check: true, + packages: ["chess", "foo", 
"bar"], + pip_version: "1.2.3", + }, + env_vars: { + FOO: "foo", + BAR: "5", + }, + working_dir: ".", + _ray_release: "2.3.1", + _ray_commit: "12345abc", + _inject_current_ray: false, + }), + ).toEqual({ + pip: { + pip_check: true, + packages: ["chess", "foo", "bar"], + pip_version: "1.2.3", + }, + env_vars: { + FOO: "foo", + BAR: "5", + }, + working_dir: ".", + }); + }); +}); diff --git a/dashboard/client/src/components/ActorTable.tsx b/dashboard/client/src/components/ActorTable.tsx index 2ef4d150bb60..885a501b522f 100644 --- a/dashboard/client/src/components/ActorTable.tsx +++ b/dashboard/client/src/components/ActorTable.tsx @@ -15,8 +15,8 @@ import { orange } from "@material-ui/core/colors"; import { SearchOutlined } from "@material-ui/icons"; import Autocomplete from "@material-ui/lab/Autocomplete"; import Pagination from "@material-ui/lab/Pagination"; +import _ from "lodash"; import React, { useContext, useMemo, useState } from "react"; -import { Link } from "react-router-dom"; import { GlobalContext } from "../App"; import { DurationText } from "../common/DurationText"; import { ActorLink } from "../common/links"; @@ -39,14 +39,22 @@ export type ActorTableProps = { detailPathPrefix?: string; }; +const SEQUENCE = { + FIRST: 1, + MIDDLE: 2, + LAST: 3, +}; + type StateOrder = { [key in ActorEnum]: number; }; + const stateOrder: StateOrder = { - [ActorEnum.ALIVE]: 0, - [ActorEnum.PENDING]: 1, - [ActorEnum.RECONSTRUCTING]: 2, - [ActorEnum.DEAD]: 3, + [ActorEnum.ALIVE]: SEQUENCE.FIRST, + [ActorEnum.DEPENDENCIES_UNREADY]: SEQUENCE.MIDDLE, + [ActorEnum.PENDING_CREATION]: SEQUENCE.MIDDLE, + [ActorEnum.RESTARTING]: SEQUENCE.MIDDLE, + [ActorEnum.DEAD]: SEQUENCE.LAST, }; //type predicate for ActorEnum const isActorEnum = (state: unknown): state is ActorEnum => { @@ -56,26 +64,11 @@ const isActorEnum = (state: unknown): state is ActorEnum => { // We sort the actorsList so that the "Alive" actors appear at first and "Dead" actors appear in the end. 
export const sortActors = (actorList: Actor[]) => { const sortedActors = [...actorList]; - sortedActors.sort((actor1, actor2) => { - const actorOrder1 = isActorEnum(actor1.state) - ? stateOrder[actor1.state] - : 0; - const actorOrder2 = isActorEnum(actor2.state) - ? stateOrder[actor2.state] - : 0; - - const actorTime1 = actor1.startTime || 0; - const actorTime2 = actor2.startTime || 0; - - if (actorOrder1 !== actorOrder2) { - return actorOrder1 - actorOrder2; - } else { - // When the state is equal, we sort by startTime - // in order to provide a determined order for users no matter the backend API changes - return actorTime1 - actorTime2; - } + return _.sortBy(sortedActors, (actor) => { + const actorOrder = isActorEnum(actor.state) ? stateOrder[actor.state] : 0; + const actorTime = actor.startTime || 0; + return [actorOrder, actorTime]; }); - return sortedActors; }; const ActorTable = ({ @@ -184,8 +177,8 @@ const ActorTable = ({ ), }, { label: "Uptime" }, - { label: "Job Id" }, - { label: "Pid" }, + { label: "Job ID" }, + { label: "PID" }, { label: "IP" }, { label: "Restarted", @@ -196,16 +189,16 @@ const ActorTable = ({ ), }, { - label: "Placement Group Id", + label: "Placement group ID", helpInfo: ( - The id of the placement group this actor is scheduled to. + The ID of the placement group this actor is scheduled to.
), }, { - label: "Required Resources", + label: "Required resources", helpInfo: ( The required Ray resources to start an actor. @@ -223,7 +216,7 @@ const ActorTable = ({ ), }, { - label: "Exit Detail", + label: "Exit detail", helpInfo: ( The detail of an actor exit. Only available when an actor is dead. @@ -436,14 +429,16 @@ const ActorTable = ({ {ipLogMap[address?.ipAddress] && ( - Log - +
{ + // ==== auto scaling status + // Node status + // .... + // Resources + // .... + const sections = cluster_status.split("Resources"); + return formatClusterStatus( + "Node Status", + sections[0].split("Node status")[1], + ); +}; + +const formatResourcesStatus = (cluster_status: string) => { + // ==== auto scaling status + // Node status + // .... + // Resources + // .... + const sections = cluster_status.split("Resources"); + return formatClusterStatus("Resource Status", sections[1]); +}; + +const formatClusterStatus = (title: string, cluster_status: string) => { + const cluster_status_rows = cluster_status.split("\n"); + + return ( +
+ + {title} + + {cluster_status_rows.map((i, key) => { + // Format the output. + // See format_info_string in util.py + if (i.startsWith("-----") || i.startsWith("=====") || i === "") { + // Ignore separators + return null; + } else if (i.endsWith(":")) { + return ( +
+ {i} +
+ ); + } else { + return
{i}
; + } + })} +
+ ); +}; + +type StatusCardProps = { + cluster_status: RayStatusResp | undefined; +}; + +export const NodeStatusCard = ({ cluster_status }: StatusCardProps) => { + return ( + + {cluster_status?.data + ? formatNodeStatus(cluster_status?.data.clusterStatus) + : "No cluster status."} + + ); +}; + +export const ResourceStatusCard = ({ cluster_status }: StatusCardProps) => { + return ( + + {cluster_status?.data + ? formatResourcesStatus(cluster_status?.data.clusterStatus) + : "No cluster status."} + + ); +}; diff --git a/dashboard/client/src/components/ListItemCard.tsx b/dashboard/client/src/components/ListItemCard.tsx new file mode 100644 index 000000000000..530cb08f13d8 --- /dev/null +++ b/dashboard/client/src/components/ListItemCard.tsx @@ -0,0 +1,135 @@ +import { createStyles, makeStyles, Typography } from "@material-ui/core"; +import classNames from "classnames"; +import React, { ReactNode } from "react"; +import { Link } from "react-router-dom"; +import { ClassNameProps } from "../common/props"; +import { + LinkWithArrow, + OverviewCard, +} from "../pages/overview/cards/OverviewCard"; + +type ListItemCardProps = { + headerTitle: string; + items: ListItemProps[]; + emptyListText: string; + footerText: string; + footerLink: string; +} & ClassNameProps; + +type ListItemProps = { + title: string | undefined; + subtitle: string; + link: string | undefined; + icon: ReactNode; +} & ClassNameProps; + +const useStyles = makeStyles((theme) => + createStyles({ + root: { + display: "flex", + flexDirection: "column", + padding: theme.spacing(2, 3), + }, + listContainer: { + marginTop: theme.spacing(2), + flex: 1, + overflow: "hidden", + }, + listItem: { + "&:not(:first-child)": { + marginTop: theme.spacing(1), + }, + }, + }), +); + +export const ListItemCard = ({ + className, + headerTitle, + items, + emptyListText: itemEmptyTip, + footerText, + footerLink, +}: ListItemCardProps) => { + const classes = useStyles(); + + return ( + + {headerTitle} +
+ {items.map((item: ListItemProps) => ( + + ))} + {items.length === 0 && ( + {itemEmptyTip} + )} +
+ +
+ ); +}; + +const useListItemStyles = makeStyles((theme) => + createStyles({ + root: { + display: "flex", + flexDirection: "row", + flexWrap: "nowrap", + alignItems: "center", + textDecoration: "none", + }, + + textContainer: { + flex: "1 1 auto", + width: `calc(100% - ${theme.spacing(1) + 20}px)`, + }, + title: { + color: "#036DCF", + }, + entrypoint: { + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + color: "#5F6469", + }, + }), +); + +const ListItem = ({ + icon, + title, + subtitle, + className, + link, +}: ListItemProps) => { + const classes = useListItemStyles(); + + const cardContent = ( + + {icon} +
+ + {title} + + + {subtitle} + +
+
+ ); + return ( +
+ {link !== undefined ? ( + + {cardContent} + + ) : ( +
{cardContent}
+ )} +
+ ); +}; diff --git a/dashboard/client/src/components/Loading.tsx b/dashboard/client/src/components/Loading.tsx index 6c1cb1e8f0ea..edca6bb2063e 100644 --- a/dashboard/client/src/components/Loading.tsx +++ b/dashboard/client/src/components/Loading.tsx @@ -1,10 +1,7 @@ -import { Backdrop, CircularProgress } from "@material-ui/core"; +import { CircularProgress } from "@material-ui/core"; import React from "react"; -const Loading = ({ loading }: { loading: boolean }) => ( - - - -); +const Loading = ({ loading }: { loading: boolean }) => + loading ? : null; export default Loading; diff --git a/dashboard/client/src/components/MetadataSection/MetadataSection.tsx b/dashboard/client/src/components/MetadataSection/MetadataSection.tsx index eb2c8af78b50..f9091a78a62e 100644 --- a/dashboard/client/src/components/MetadataSection/MetadataSection.tsx +++ b/dashboard/client/src/components/MetadataSection/MetadataSection.tsx @@ -4,7 +4,6 @@ import { IconButton, Link, makeStyles, - Paper, Tooltip, Typography, } from "@material-ui/core"; @@ -12,6 +11,7 @@ import copy from "copy-to-clipboard"; import React, { useState } from "react"; import { RiFileCopyLine } from "react-icons/ri"; import { Link as RouterLink } from "react-router-dom"; +import { Section } from "../../common/Section"; import { HelpInfo } from "../Tooltip"; export type StringOnlyMetadataContent = { @@ -30,6 +30,9 @@ type CopyableMetadataContent = StringOnlyMetadataContent & { readonly copyableValue: string; }; +type CopyAndLinkableMetadataContent = LinkableMetadataContent & + CopyableMetadataContent; + export type Metadata = { readonly label: string; readonly labelTooltip?: string | JSX.Element; @@ -39,6 +42,7 @@ export type Metadata = { | StringOnlyMetadataContent | LinkableMetadataContent | CopyableMetadataContent + | CopyAndLinkableMetadataContent | JSX.Element; /** @@ -55,7 +59,6 @@ const useStyles = makeStyles((theme) => gridTemplateColumns: "repeat(3, minmax(0, 1fr))", rowGap: theme.spacing(1), columnGap: 
theme.spacing(4), - padding: theme.spacing(2), }, label: { color: theme.palette.text.secondary, @@ -93,6 +96,28 @@ export const MetadataContentField: React.FC<{ const classes = useStyles(); const [copyIconClicked, setCopyIconClicked] = useState(false); + const copyElement = content && "copyableValue" in content && ( + + { + setCopyIconClicked(true); + copy(content.copyableValue); + }} + // Set up mouse events to avoid text changing while tooltip is visible + onMouseEnter={() => setCopyIconClicked(false)} + onMouseLeave={() => setTimeout(() => setCopyIconClicked(false), 333)} + size="small" + className={classes.button} + > + + + + ); + if (content === undefined || "value" in content) { return content === undefined || !("link" in content) ? (
@@ -104,47 +129,31 @@ export const MetadataContentField: React.FC<{ > {content?.value ?? "-"} - {content && "copyableValue" in content && ( - - { - setCopyIconClicked(true); - copy(content.copyableValue); - }} - // Set up mouse events to avoid text changing while tooltip is visible - onMouseEnter={() => setCopyIconClicked(false)} - onMouseLeave={() => - setTimeout(() => setCopyIconClicked(false), 333) - } - size="small" - className={classes.button} - > - - - - )} + {copyElement}
) : content.link.startsWith("http") ? ( - - {content.value} - +
+ + {content.value} + + {copyElement} +
) : ( - - {content.value} - +
+ + {content.value} + + {copyElement} +
); } return
{content}
; @@ -193,15 +202,8 @@ export const MetadataSection = ({ metadataList: Metadata[]; }) => { return ( - - {header && ( - - {header} - - )} - - - - +
+ +
); }; diff --git a/dashboard/client/src/components/StatusChip.tsx b/dashboard/client/src/components/StatusChip.tsx index ab4869db7998..4437321ac7e2 100644 --- a/dashboard/client/src/components/StatusChip.tsx +++ b/dashboard/client/src/components/StatusChip.tsx @@ -1,25 +1,21 @@ -import { Color } from "@material-ui/core"; -import { - blue, - blueGrey, - cyan, - green, - grey, - lightBlue, - orange, - red, - yellow, -} from "@material-ui/core/colors"; +import { Color, createStyles, makeStyles } from "@material-ui/core"; +import { blue, blueGrey, cyan, green, red } from "@material-ui/core/colors"; import { CSSProperties } from "@material-ui/core/styles/withStyles"; +import classNames from "classnames"; import React, { ReactNode } from "react"; +import { TaskStatus } from "../pages/job/hook/useJobProgress"; import { ActorEnum } from "../type/actor"; +import { JobStatus } from "../type/job"; import { PlacementGroupState } from "../type/placementGroup"; import { ServeApplicationStatus, ServeDeploymentStatus, ServeReplicaState, + ServeSystemActorStatus, } from "../type/serve"; -import { TypeTaskStatus } from "../type/task"; + +const orange = "#DB6D00"; +const grey = "#5F6469"; const colorMap = { node: { @@ -28,60 +24,66 @@ const colorMap = { }, worker: { ALIVE: green, + DEAD: red, }, actor: { [ActorEnum.ALIVE]: green, [ActorEnum.DEAD]: red, - [ActorEnum.PENDING]: blue, - [ActorEnum.RECONSTRUCTING]: lightBlue, + [ActorEnum.DEPENDENCIES_UNREADY]: orange, + [ActorEnum.PENDING_CREATION]: orange, + [ActorEnum.RESTARTING]: orange, }, task: { - [TypeTaskStatus.FAILED]: red, - [TypeTaskStatus.FINISHED]: green, - [TypeTaskStatus.RUNNING]: blue, - [TypeTaskStatus.RUNNING_IN_RAY_GET]: blue, - [TypeTaskStatus.RUNNING_IN_RAY_WAIT]: blue, - [TypeTaskStatus.SUBMITTED_TO_WORKER]: "#cfcf08", - [TypeTaskStatus.PENDING_ARGS_FETCH]: blue, - [TypeTaskStatus.PENDING_OBJ_STORE_MEM_AVAIL]: blue, - [TypeTaskStatus.PENDING_NODE_ASSIGNMENT]: "#cfcf08", - [TypeTaskStatus.PENDING_ARGS_AVAIL]: 
"#f79e02", + [TaskStatus.FAILED]: red, + [TaskStatus.FINISHED]: green, + [TaskStatus.RUNNING]: blue, + [TaskStatus.SUBMITTED_TO_WORKER]: orange, + [TaskStatus.PENDING_NODE_ASSIGNMENT]: orange, + [TaskStatus.PENDING_ARGS_AVAIL]: orange, + [TaskStatus.UNKNOWN]: grey, }, job: { - INIT: grey, - SUBMITTED: "#cfcf08", - DISPATCHED: lightBlue, - RUNNING: blue, - COMPLETED: green, - SUCCEEDED: green, - FINISHED: green, - FAILED: red, + [JobStatus.PENDING]: orange, + [JobStatus.RUNNING]: blue, + [JobStatus.STOPPED]: grey, + [JobStatus.SUCCEEDED]: green, + [JobStatus.FAILED]: red, }, placementGroup: { - [PlacementGroupState.PENDING]: "#f79e02", - [PlacementGroupState.CREATED]: blue, - [PlacementGroupState.REMOVED]: red, - [PlacementGroupState.RESCHEDULING]: "#cfcf08", + [PlacementGroupState.PENDING]: orange, + [PlacementGroupState.CREATED]: green, + [PlacementGroupState.REMOVED]: grey, + [PlacementGroupState.RESCHEDULING]: orange, }, serveApplication: { [ServeApplicationStatus.NOT_STARTED]: grey, - [ServeApplicationStatus.DEPLOYING]: yellow, + [ServeApplicationStatus.DEPLOYING]: orange, [ServeApplicationStatus.RUNNING]: green, [ServeApplicationStatus.DEPLOY_FAILED]: red, - [ServeApplicationStatus.DELETING]: yellow, + [ServeApplicationStatus.DELETING]: orange, }, serveDeployment: { - [ServeDeploymentStatus.UPDATING]: yellow, + [ServeDeploymentStatus.UPDATING]: orange, [ServeDeploymentStatus.HEALTHY]: green, [ServeDeploymentStatus.UNHEALTHY]: red, }, serveReplica: { - [ServeReplicaState.STARTING]: yellow, - [ServeReplicaState.UPDATING]: yellow, + [ServeReplicaState.STARTING]: orange, + [ServeReplicaState.UPDATING]: orange, [ServeReplicaState.RECOVERING]: orange, [ServeReplicaState.RUNNING]: green, [ServeReplicaState.STOPPING]: red, }, + serveHttpProxy: { + [ServeSystemActorStatus.HEALTHY]: green, + [ServeSystemActorStatus.UNHEALTHY]: red, + [ServeSystemActorStatus.STARTING]: orange, + }, + serveController: { + [ServeSystemActorStatus.HEALTHY]: green, + 
[ServeSystemActorStatus.UNHEALTHY]: red, + [ServeSystemActorStatus.STARTING]: orange, + }, } as { [key: string]: { [key: string]: Color | string; @@ -96,23 +98,35 @@ const typeMap = { [key: string]: Color; }; +const useStyles = makeStyles((theme) => + createStyles({ + root: { + padding: "2px 8px", + border: "solid 1px", + borderRadius: 4, + fontSize: 12, + margin: 2, + display: "inline-flex", + alignItems: "center", + }, + afterIcon: { + marginLeft: 4, + }, + }), +); + export const StatusChip = ({ type, status, suffix, + icon, }: { type: string; status: string | ActorEnum | ReactNode; suffix?: string; + icon?: ReactNode; }) => { - const style = { - padding: "2px 8px", - border: "solid 1px", - borderRadius: 4, - fontSize: 12, - margin: 2, - } as CSSProperties; - + const classes = useStyles(); let color: Color | string = blueGrey; if (typeMap[type]) { @@ -127,6 +141,7 @@ export const StatusChip = ({ const colorValue = typeof color === "string" ? color : color[500]; + const style: CSSProperties = {}; style.color = colorValue; style.borderColor = colorValue; if (color !== blueGrey) { @@ -134,8 +149,11 @@ export const StatusChip = ({ } return ( - - {status} + + {icon} + + {status} + {suffix} ); diff --git a/dashboard/client/src/components/TaskTable.tsx b/dashboard/client/src/components/TaskTable.tsx index 868e79732518..fbce5c5a2683 100644 --- a/dashboard/client/src/components/TaskTable.tsx +++ b/dashboard/client/src/components/TaskTable.tsx @@ -1,8 +1,7 @@ import { Box, - createStyles, InputAdornment, - makeStyles, + Link, Table, TableBody, TableCell, @@ -15,10 +14,9 @@ import { } from "@material-ui/core"; import Autocomplete from "@material-ui/lab/Autocomplete"; import Pagination from "@material-ui/lab/Pagination"; -import React, { useContext, useState } from "react"; -import { Link } from "react-router-dom"; -import { GlobalContext } from "../App"; -import DialogWithTitle from "../common/DialogWithTitle"; +import React, { useState } from "react"; +import { Link as 
RouterLink } from "react-router-dom"; +import { CodeDialogButton } from "../common/CodeDialogButton"; import { DurationText } from "../common/DurationText"; import { ActorLink, NodeLink } from "../common/links"; import rowStyles from "../common/RowStyles"; @@ -60,7 +58,7 @@ const TaskTable = ({ const columns = [ { label: "ID" }, { label: "Name" }, - { label: "Job Id" }, + { label: "Job ID" }, { label: "State" }, { label: "Actions", @@ -77,13 +75,13 @@ const TaskTable = ({ ), }, { label: "Duration" }, - { label: "Function or Class Name" }, - { label: "Node Id" }, - { label: "Actor_id" }, - { label: "Worker_id" }, + { label: "Function or class name" }, + { label: "Node ID" }, + { label: "Actor ID" }, + { label: "Worker ID" }, { label: "Type" }, - { label: "Placement Group Id" }, - { label: "Required Resources" }, + { label: "Placement group ID" }, + { label: "Required resources" }, ]; return ( @@ -231,7 +229,9 @@ const TaskTable = ({ arrow interactive > -
{task_id}
+ + {task_id} +
{name ? name : "-"} @@ -299,24 +299,14 @@ const TaskTable = ({ - ( -
- {key}: {val} -
- ), - )} - arrow - interactive - > -
- {Object.entries(required_resources || {}) - .map(([key, val]) => `${key}: ${val}`) - .join(", ")} -
-
+ {Object.entries(required_resources || {}).length > 0 ? ( + + ) : ( + "{}" + )}
); @@ -330,70 +320,29 @@ const TaskTable = ({ export default TaskTable; -const useTaskTableActionsStyles = makeStyles(() => - createStyles({ - errorDetails: { - whiteSpace: "pre", - }, - link: { - border: "none", - cursor: "pointer", - color: "#036DCF", - textDecoration: "underline", - background: "none", - }, - }), -); - type TaskTableActionsProps = { task: Task; }; const TaskTableActions = ({ task }: TaskTableActionsProps) => { - const classes = useTaskTableActionsStyles(); - const { ipLogMap } = useContext(GlobalContext); - const [showErrorDetailsDialog, setShowErrorDetailsDialog] = useState(false); - - const handleErrorClick = () => { - setShowErrorDetailsDialog(true); - }; - - const errorDetails = task.error_type - ? `Error Type: ${task.error_type}\n\n${task.error_message}` - : undefined; + const errorDetails = + task.error_type !== null && task.error_message !== null + ? `Error Type: ${task.error_type}\n\n${task.error_message}` + : undefined; return ( - {task?.profiling_data?.node_ip_address && - ipLogMap[task?.profiling_data?.node_ip_address] && - task.worker_id && - task.job_id && ( - - - Log - -
-
- )} + + Log + +
+ {errorDetails && ( - - )} - {showErrorDetailsDialog && errorDetails && ( - { - setShowErrorDetailsDialog(false); - }} - > -
{errorDetails}
-
+ code={errorDetails} + buttonText="Error" + /> )}
); diff --git a/dashboard/client/src/pages/actor/ActorDetail.tsx b/dashboard/client/src/pages/actor/ActorDetail.tsx index 3951cb0ffa54..c134a1482e3e 100644 --- a/dashboard/client/src/pages/actor/ActorDetail.tsx +++ b/dashboard/client/src/pages/actor/ActorDetail.tsx @@ -1,7 +1,7 @@ import { makeStyles } from "@material-ui/core"; -import React, { useContext } from "react"; -import { Link } from "react-router-dom"; -import { GlobalContext } from "../../App"; +import React from "react"; +import { Outlet } from "react-router-dom"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; import { DurationText } from "../../common/DurationText"; import { formatDateFromTimeMs } from "../../common/formatUtils"; import { generateNodeLink } from "../../common/links"; @@ -9,17 +9,20 @@ import { CpuProfilingLink, CpuStackTraceLink, } from "../../common/ProfilingLink"; +import { Section } from "../../common/Section"; import Loading from "../../components/Loading"; import { MetadataSection } from "../../components/MetadataSection"; import { StatusChip } from "../../components/StatusChip"; import TitleCard from "../../components/TitleCard"; import { MainNavPageInfo } from "../layout/mainNavContext"; import TaskList from "../state/task"; +import { ActorLogs } from "./ActorLogs"; import { useActorDetail } from "./hook/useActorDetail"; const useStyle = makeStyles((theme) => ({ root: { padding: theme.spacing(2), + backgroundColor: "white", }, paper: { padding: theme.spacing(2), @@ -32,19 +35,47 @@ const useStyle = makeStyles((theme) => ({ tab: { marginBottom: theme.spacing(2), }, + tasksSection: { + marginTop: theme.spacing(4), + }, })); +export const ActorDetailLayout = () => { + const { params, actorDetail } = useActorDetail(); + + return ( +
+ + +
+ ); +}; + const ActorDetailPage = () => { const classes = useStyle(); - const { ipLogMap } = useContext(GlobalContext); - const { params, actorDetail, msg } = useActorDetail(); + const { params, actorDetail, msg, isLoading } = useActorDetail(); - if (!actorDetail) { + if (isLoading || actorDetail === undefined) { return (
- - - + + +
Request Status: {msg}
@@ -54,156 +85,147 @@ const ActorDetailPage = () => { return (
- - - , - }, - { - label: "ID", - content: actorDetail.actorId - ? { - value: actorDetail.actorId, - copyableValue: actorDetail.actorId, - } - : { value: "-" }, - }, - { - label: "Name", - content: actorDetail.name - ? { - value: actorDetail.name, - } - : { value: "-" }, + , + }, + { + label: "ID", + content: actorDetail.actorId + ? { + value: actorDetail.actorId, + copyableValue: actorDetail.actorId, + } + : { value: "-" }, + }, + { + label: "Name", + content: actorDetail.name + ? { + value: actorDetail.name, + } + : { value: "-" }, + }, + { + label: "Class Name", + content: actorDetail.actorClass + ? { + value: actorDetail.actorClass, + } + : { value: "-" }, + }, + { + label: "Repr", + content: actorDetail.reprName + ? { + value: actorDetail.reprName, + } + : { value: "-" }, + }, + { + label: "Job ID", + content: actorDetail.jobId + ? { + value: actorDetail.jobId, + copyableValue: actorDetail.jobId, + } + : { value: "-" }, + }, + { + label: "Node ID", + content: actorDetail.address?.rayletId + ? { + value: actorDetail.address?.rayletId, + copyableValue: actorDetail.address?.rayletId, + link: actorDetail.address.rayletId + ? generateNodeLink(actorDetail.address.rayletId) + : undefined, + } + : { value: "-" }, + }, + { + label: "Worker ID", + content: actorDetail.address?.workerId + ? { + value: actorDetail.address?.workerId, + copyableValue: actorDetail.address?.workerId, + } + : { value: "-" }, + }, + { + label: "Started at", + content: { + value: actorDetail.startTime + ? formatDateFromTimeMs(actorDetail.startTime) + : "-", }, - { - label: "Class Name", - content: actorDetail.actorClass - ? { - value: actorDetail.actorClass, - } - : { value: "-" }, + }, + { + label: "Ended at", + content: { + value: actorDetail.endTime + ? formatDateFromTimeMs(actorDetail.endTime) + : "-", }, - { - label: "Repr", - content: actorDetail.reprName - ? { - value: actorDetail.reprName, - } - : { value: "-" }, - }, - { - label: "Job ID", - content: actorDetail.jobId - ? 
{ - value: actorDetail.jobId, - copyableValue: actorDetail.jobId, - } - : { value: "-" }, - }, - { - label: "Node ID", - content: actorDetail.address?.rayletId - ? { - value: actorDetail.address?.rayletId, - copyableValue: actorDetail.address?.rayletId, - link: actorDetail.address.rayletId - ? generateNodeLink(actorDetail.address.rayletId) - : undefined, - } - : { value: "-" }, - }, - { - label: "Worker ID", - content: actorDetail.address?.workerId - ? { - value: actorDetail.address?.workerId, - copyableValue: actorDetail.address?.workerId, - } - : { value: "-" }, - }, - { - label: "Started at", - content: { - value: actorDetail.startTime - ? formatDateFromTimeMs(actorDetail.startTime) - : "-", - }, - }, - { - label: "Ended at", - content: { - value: actorDetail.endTime - ? formatDateFromTimeMs(actorDetail.endTime) - : "-", - }, - }, - { - label: "Uptime", - content: actorDetail.startTime ? ( - + ) : ( + - + ), + }, + { + label: "Restarted", + content: { value: actorDetail.numRestarts }, + }, + { + label: "Exit Detail", + content: actorDetail.exitDetail + ? { + value: actorDetail.exitDetail, + } + : { value: "-" }, + }, + { + label: "Actions", + content: ( +
+ - ) : ( - - - ), - }, - { - label: "Restarted", - content: { value: actorDetail.numRestarts }, - }, - { - label: "Exit Detail", - content: actorDetail.exitDetail - ? { - value: actorDetail.exitDetail, - } - : { value: "-" }, - }, - { - label: "Actions", - content: ( -
- - Log - -
- -
- -
- ), - }, - ]} - /> - - - - +
+ +
+ ), + }, + ]} + /> + +
+ +
+
+ +
+ +
+
); }; diff --git a/dashboard/client/src/pages/actor/ActorLayout.tsx b/dashboard/client/src/pages/actor/ActorLayout.tsx new file mode 100644 index 000000000000..7033a12a57c1 --- /dev/null +++ b/dashboard/client/src/pages/actor/ActorLayout.tsx @@ -0,0 +1,18 @@ +import React from "react"; +import { Outlet } from "react-router-dom"; +import { MainNavPageInfo } from "../layout/mainNavContext"; + +export const ActorLayout = () => { + return ( +
+ + +
+ ); +}; diff --git a/dashboard/client/src/pages/actor/ActorList.tsx b/dashboard/client/src/pages/actor/ActorList.tsx index a99ce2dae6b2..d9ab44f1914c 100644 --- a/dashboard/client/src/pages/actor/ActorList.tsx +++ b/dashboard/client/src/pages/actor/ActorList.tsx @@ -1,6 +1,4 @@ -import { Grid } from "@material-ui/core"; -import dayjs from "dayjs"; -import React, { useState } from "react"; +import React from "react"; import ActorTable, { ActorTableProps } from "../../components/ActorTable"; import { Actor } from "../../type/actor"; import { useActorList } from "./hook/useActorList"; @@ -16,17 +14,11 @@ const ActorList = ({ jobId?: string | null; detailPathPrefix?: string; } & Pick) => { - const [timeStamp] = useState(dayjs()); const data: { [actorId: string]: Actor } | undefined = useActorList(); const actors: { [actorId: string]: Actor } = data ? data : {}; return (
- - - Last updated: {timeStamp.format("YYYY-MM-DD HH:mm:ss")} - - ; +}; + +export const ActorLogs = ({ + actor: { + actorId, + pid, + address: { workerId, rayletId }, + }, +}: ActorLogsProps) => { + const tabs: MultiTabLogViewerTabDetails[] = [ + { + title: "stderr", + actorId, + suffix: "err", + }, + { + title: "stdout", + actorId, + suffix: "out", + }, + { + title: "system", + nodeId: rayletId, + // TODO(aguo): Have API return the log file name. + filename: `python-core-worker-${workerId}_${pid}.log`, + }, + ]; + return ; +}; diff --git a/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts b/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts new file mode 100644 index 000000000000..2063b68fc776 --- /dev/null +++ b/dashboard/client/src/pages/actor/hook/mockedUseActorList.ts @@ -0,0 +1,113 @@ +import { Actor } from "../../../type/actor"; + +const MOCK_ACTORS: { [actorId: string]: Actor } = { + ACTOR_1: { + actorId: "ACTOR_1", + jobId: "01000000", + address: { + rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + ipAddress: "172.31.11.178", + port: 10003, + workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", + }, + state: "ALIVE", + numRestarts: "0", + name: "", + pid: 25321, + startTime: 1679010689148, + endTime: 0, + actorClass: "Counter", + exitDetail: "-", + requiredResources: {}, + placementGroupId: "123", + reprName: ",", + }, + ACTOR_2: { + actorId: "ACTOR_2", + jobId: "01000000", + address: { + rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + ipAddress: "172.31.11.178", + port: 10003, + workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", + }, + state: "DEAD", + numRestarts: "0", + name: "", + pid: 25322, + startTime: 1679010689150, + endTime: 0, + actorClass: "Counter", + exitDetail: "-", + requiredResources: {}, + placementGroupId: "123", + reprName: ",", + }, + ACTOR_3: { + actorId: "ACTOR_3", + jobId: "01000000", + address: { + rayletId: 
"426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + ipAddress: "172.31.11.178", + port: 10003, + workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", + }, + state: "DEPENDENCIES_UNREADY", + numRestarts: "0", + name: "", + pid: 25323, + startTime: 1679010689152, + endTime: 0, + actorClass: "Counter", + exitDetail: "-", + requiredResources: {}, + placementGroupId: "123", + reprName: ",", + }, + ACTOR_4: { + actorId: "ACTOR_4", + jobId: "01000000", + address: { + rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + ipAddress: "172.31.11.178", + port: 10003, + workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", + }, + state: "PENDING_CREATION", + numRestarts: "0", + name: "", + pid: 25324, + startTime: 1679010689154, + endTime: 0, + actorClass: "Counter", + exitDetail: "-", + requiredResources: {}, + placementGroupId: "123", + reprName: ",", + }, + ACTOR_5: { + actorId: "ACTOR_5", + jobId: "01000000", + address: { + rayletId: "426854e68e4225b3941deaf03c8dcfcb1daacc69a92711d370dbb0e1", + ipAddress: "172.31.11.178", + port: 10003, + workerId: "b8b276a03612644098ed7a929c3b0e50f5bde894eb0d8cab288fbb6d", + }, + state: "RESTARTING", + numRestarts: "1", + name: "", + pid: 25325, + startTime: 1679010689156, + endTime: 0, + actorClass: "Counter", + exitDetail: "-", + requiredResources: {}, + placementGroupId: "123", + reprName: ",", + }, +}; + +export const useActorList = (): { [actorId: string]: Actor } => { + return MOCK_ACTORS; +}; diff --git a/dashboard/client/src/pages/actor/hook/useActorDetail.ts b/dashboard/client/src/pages/actor/hook/useActorDetail.ts index 271f903e076c..ec51d6a53eff 100644 --- a/dashboard/client/src/pages/actor/hook/useActorDetail.ts +++ b/dashboard/client/src/pages/actor/hook/useActorDetail.ts @@ -5,13 +5,28 @@ import { GlobalContext } from "../../../App"; import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; import { ActorResp, getActor } from "../../../service/actor"; 
+export const useFetchActor = (actorId: string | null) => { + return useSWR( + actorId ? ["useActorDetail", actorId] : null, + async ([_, actorId]) => { + const actor_resp = await getActor(actorId); + const data: ActorResp = actor_resp?.data; + const { data: rspData } = data; + + if (rspData.detail) { + return rspData.detail; + } + }, + ); +}; + export const useActorDetail = () => { - const params = useParams() as { id: string }; + const params = useParams() as { actorId: string }; const [msg, setMsg] = useState("Loading the actor infos..."); const { namespaceMap } = useContext(GlobalContext); - const { data: actorDetail } = useSWR( - ["useActorDetail", params.id], + const { data: actorDetail, isLoading } = useSWR( + ["useActorDetail", params.actorId], async ([_, actorId]) => { const actor_resp = await getActor(actorId); const data: ActorResp = actor_resp?.data; @@ -35,6 +50,7 @@ export const useActorDetail = () => { params, actorDetail, msg, + isLoading, namespaceMap, }; }; diff --git a/dashboard/client/src/pages/actor/index.tsx b/dashboard/client/src/pages/actor/index.tsx index d66ce9f8aae3..6c0995cb4c58 100644 --- a/dashboard/client/src/pages/actor/index.tsx +++ b/dashboard/client/src/pages/actor/index.tsx @@ -1,13 +1,12 @@ import { makeStyles } from "@material-ui/core"; import React from "react"; -import TitleCard from "../../components/TitleCard"; -import { MainNavPageInfo } from "../layout/mainNavContext"; import ActorList from "./ActorList"; const useStyles = makeStyles((theme) => ({ root: { padding: theme.spacing(2), width: "100%", + backgroundColor: "white", }, })); @@ -19,16 +18,7 @@ const Actors = () => { return (
- - - - +
); }; diff --git a/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx index 6ba92e3ffcbf..d9fbb29d86d9 100644 --- a/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx +++ b/dashboard/client/src/pages/job/AdvancedProgressBar/AdvancedProgressBar.tsx @@ -15,6 +15,7 @@ import { RiCloseLine, RiSubtractLine, } from "react-icons/ri"; +import { Link } from "react-router-dom"; import { ClassNameProps } from "../../../common/props"; import { JobProgressGroup, NestedJobProgressLink } from "../../../type/job"; import { MiniTaskProgressBar } from "../TaskProgressBar"; @@ -166,15 +167,21 @@ export const AdvancedProgressBarSegment = ({ }} /> {link ? ( - + link.type === "actor" ? ( + + ) : ( + + {name} + + ) ) : ( name )} diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx index 888ae5eeb48d..460cf0eacb83 100644 --- a/dashboard/client/src/pages/job/JobDetail.tsx +++ b/dashboard/client/src/pages/job/JobDetail.tsx @@ -1,38 +1,48 @@ -import { Box, Grid, makeStyles, Typography } from "@material-ui/core"; -import React, { useContext, useRef, useState } from "react"; -import { Link } from "react-router-dom"; -import { GlobalContext } from "../../App"; +import { Box, makeStyles } from "@material-ui/core"; +import React, { useRef, useState } from "react"; import { CollapsibleSection } from "../../common/CollapsibleSection"; -import { DurationText } from "../../common/DurationText"; -import { formatDateFromTimeMs } from "../../common/formatUtils"; +import { Section } from "../../common/Section"; import { - CpuProfilingLink, - CpuStackTraceLink, -} from "../../common/ProfilingLink"; + NodeStatusCard, + ResourceStatusCard, +} from "../../components/AutoscalerStatusCards"; import Loading from "../../components/Loading"; -import { MetadataSection } from "../../components/MetadataSection"; import { StatusChip } from 
"../../components/StatusChip"; import TitleCard from "../../components/TitleCard"; -import { NestedJobProgressLink, UnifiedJob } from "../../type/job"; +import { NestedJobProgressLink } from "../../type/job"; import ActorList from "../actor/ActorList"; +import { NodeCountCard } from "../overview/cards/NodeCountCard"; import PlacementGroupList from "../state/PlacementGroup"; import TaskList from "../state/task"; - import { useRayStatus } from "./hook/useClusterStatus"; import { useJobDetail } from "./hook/useJobDetail"; +import { JobMetadataSection } from "./JobDetailInfoPage"; +import { JobDriverLogs } from "./JobDriverLogs"; import { JobProgressBar } from "./JobProgressBar"; import { TaskTimeline } from "./TaskTimeline"; const useStyle = makeStyles((theme) => ({ root: { padding: theme.spacing(2), + backgroundColor: "white", + }, + section: { + marginBottom: theme.spacing(4), + }, + autoscalerSection: { + flexWrap: "wrap", + [theme.breakpoints.up("md")]: { + flexWrap: "nowrap", + }, + }, + nodeCountCard: { + flex: "1 0 500px", }, })); export const JobDetailChartsPage = () => { const classes = useStyle(); - const { job, msg, params } = useJobDetail(); - const jobId = params.id; + const { job, msg, isLoading, params } = useJobDetail(); const [taskListFilter, setTaskListFilter] = useState(); const [taskTableExpanded, setTaskTableExpanded] = useState(false); @@ -43,63 +53,10 @@ export const JobDetailChartsPage = () => { const actorTableRef = useRef(null); const { cluster_status } = useRayStatus(); - const formatNodeStatus = (cluster_status: string) => { - // ==== auto scaling status - // Node status - // .... - // Resources - // .... - const sections = cluster_status.split("Resources"); - return formatClusterStatus( - "Node Status", - sections[0].split("Node status")[1], - ); - }; - - const formatResourcesStatus = (cluster_status: string) => { - // ==== auto scaling status - // Node status - // .... - // Resources - // .... 
- const sections = cluster_status.split("Resources"); - return formatClusterStatus("Resource Status", sections[1]); - }; - - const formatClusterStatus = (title: string, cluster_status: string) => { - const cluster_status_rows = cluster_status.split("\n"); - - return ( -
- - {title} - - {cluster_status_rows.map((i, key) => { - // Format the output. - // See format_info_string in util.py - if (i.startsWith("-----") || i.startsWith("=====")) { - // Separator - return
; - } else if (i.endsWith(":")) { - return ( -
- {i} -
- ); - } else if (i === "") { - return
; - } else { - return
{i}
; - } - })} -
- ); - }; - if (!job) { return (
- +
@@ -145,212 +102,115 @@ export const JobDetailChartsPage = () => { return (
- - , - }, - { - label: "Job ID", - content: job.job_id - ? { - value: job.job_id, - copyableValue: job.job_id, - } - : { value: "-" }, - }, - { - label: "Submission ID", - content: job.submission_id - ? { - value: job.submission_id, - copyableValue: job.submission_id, - } - : { - value: "-", - }, - }, - { - label: "Duration", - content: job.start_time ? ( - - ) : ( - - - ), - }, - { - label: "Started at", - content: { - value: job.start_time - ? formatDateFromTimeMs(job.start_time) - : "-", - }, - }, - { - label: "Ended at", - content: { - value: job.end_time ? formatDateFromTimeMs(job.end_time) : "-", - }, - }, - { - label: "Actions", - content: ( -
- -
- -
- -
- ), - }, - ]} - /> -
- - - - - - - - - - - {cluster_status?.data - ? formatNodeStatus(cluster_status?.data.clusterStatus) - : "No cluster status."} - - - - - - - {cluster_status?.data - ? formatResourcesStatus(cluster_status?.data.clusterStatus) - : "No cluster status."} - - - - - - { - setTaskTableExpanded(!taskTableExpanded); - }} - > - + + +
+ - - - +
+
+ + +
+ +
+
+ + {job.job_id && ( { - setActorTableExpanded(!actorTableExpanded); - }} + title="Task Timeline (beta)" + startExpanded + className={classes.section} > - +
+ +
-
- - - - - + )} + + + + +
+ +
+
+ +
+
+
+ + {job.job_id && ( + + { + setTaskTableExpanded(!taskTableExpanded); + }} + className={classes.section} + > +
+ +
+
+ + { + setActorTableExpanded(!actorTableExpanded); + }} + className={classes.section} + > +
+ +
+
+ + +
+ +
+
+
+ )}
); }; - -type JobLogsLinkProps = { - job: Pick< - UnifiedJob, - | "driver_agent_http_address" - | "driver_info" - | "job_id" - | "submission_id" - | "type" - >; -}; - -export const JobLogsLink = ({ - job: { driver_agent_http_address, driver_info, job_id, submission_id, type }, -}: JobLogsLinkProps) => { - const { ipLogMap } = useContext(GlobalContext); - - let link: string | undefined; - - if (driver_agent_http_address) { - link = `/logs/${encodeURIComponent(`${driver_agent_http_address}/logs`)}`; - } else if (driver_info && ipLogMap[driver_info.node_ip_address]) { - link = `/logs/${encodeURIComponent(ipLogMap[driver_info.node_ip_address])}`; - } - - if (link) { - link += `?fileName=${ - type === "DRIVER" ? job_id : `driver-${submission_id}` - }`; - return ( - - Log - - ); - } - - return -; -}; diff --git a/dashboard/client/src/pages/job/JobDetailActorPage.tsx b/dashboard/client/src/pages/job/JobDetailActorPage.tsx index b31d653ca3ed..0e0d09437bf5 100644 --- a/dashboard/client/src/pages/job/JobDetailActorPage.tsx +++ b/dashboard/client/src/pages/job/JobDetailActorPage.tsx @@ -1,7 +1,7 @@ import { makeStyles } from "@material-ui/core"; -import React from "react"; +import React, { PropsWithChildren } from "react"; -import TitleCard from "../../components/TitleCard"; +import { Section } from "../../common/Section"; import ActorList from "../actor/ActorList"; import { MainNavPageInfo } from "../layout/mainNavContext"; import { useJobDetail } from "./hook/useJobDetail"; @@ -9,31 +9,43 @@ import { useJobDetail } from "./hook/useJobDetail"; const useStyle = makeStyles((theme) => ({ root: { padding: theme.spacing(2), + backgroundColor: "white", }, })); export const JobDetailActorsPage = () => { const classes = useStyle(); - const { job, params } = useJobDetail(); - - const pageInfo = job - ? { - title: "Actors", - id: "actors", - path: job.job_id ? 
`/jobs/${job.job_id}/actors` : undefined, - } - : { - title: "Actors", - id: "actors", - path: undefined, - }; + const { params } = useJobDetail(); return (
- - + +
- +
+
+ ); +}; + +export const JobDetailActorDetailWrapper = ({ + children, +}: PropsWithChildren<{}>) => { + return ( +
+ + {children}
); }; diff --git a/dashboard/client/src/pages/job/JobDetailInfoPage.tsx b/dashboard/client/src/pages/job/JobDetailInfoPage.tsx index 24bb85b23e17..3f1a19ae661b 100644 --- a/dashboard/client/src/pages/job/JobDetailInfoPage.tsx +++ b/dashboard/client/src/pages/job/JobDetailInfoPage.tsx @@ -1,11 +1,22 @@ -import { makeStyles } from "@material-ui/core"; +import { createStyles, makeStyles, Typography } from "@material-ui/core"; import React from "react"; +import { + CodeDialogButton, + CodeDialogButtonWithPreview, +} from "../../common/CodeDialogButton"; import { DurationText } from "../../common/DurationText"; import { formatDateFromTimeMs } from "../../common/formatUtils"; +import { JobStatusWithIcon } from "../../common/JobStatus"; +import { + CpuProfilingLink, + CpuStackTraceLink, +} from "../../common/ProfilingLink"; +import { filterRuntimeEnvSystemVariables } from "../../common/util"; import Loading from "../../components/Loading"; import { MetadataSection } from "../../components/MetadataSection"; import { StatusChip } from "../../components/StatusChip"; import TitleCard from "../../components/TitleCard"; +import { UnifiedJob } from "../../type/job"; import { MainNavPageInfo } from "../layout/mainNavContext"; import { useJobDetail } from "./hook/useJobDetail"; @@ -13,6 +24,7 @@ import { useJobDetail } from "./hook/useJobDetail"; const useStyle = makeStyles((theme) => ({ root: { padding: theme.spacing(2), + backgroundColor: "white", }, })); @@ -20,7 +32,7 @@ export const JobDetailInfoPage = () => { // TODO(aguo): Add more content to this page! const classes = useStyle(); - const { job, msg, params } = useJobDetail(); + const { job, msg, isLoading, params } = useJobDetail(); if (!job) { return ( @@ -29,10 +41,10 @@ export const JobDetailInfoPage = () => { pageInfo={{ title: "Info", id: "job-info", - path: undefined, + path: "info", }} /> - +
@@ -51,70 +63,135 @@ export const JobDetailInfoPage = () => { path: job.job_id ? `/jobs/${job.job_id}/info` : undefined, }} /> - - , - }, - { - label: "Job ID", - content: job.job_id - ? { - value: job.job_id, - copyableValue: job.job_id, - } - : { value: "-" }, - }, - { - label: "Submission ID", - content: job.submission_id - ? { - value: job.submission_id, - copyableValue: job.submission_id, - } - : { - value: "-", - }, - }, - { - label: "Duration", - content: job.start_time ? ( - - ) : ( - - - ), - }, - { - label: "Started at", - content: { - value: job.start_time - ? formatDateFromTimeMs(job.start_time) - : "-", + {job.job_id} + +
+ ); +}; + +const useJobMetadataSectionStyles = makeStyles((theme) => + createStyles({ + metadataButton: { + display: "inline-flex", + maxWidth: "100%", + }, + }), +); + +type JobMetadataSectionProps = { + job: UnifiedJob; +}; + +export const JobMetadataSection = ({ job }: JobMetadataSectionProps) => { + const classes = useJobMetadataSectionStyles(); + + return ( + , + }, + { + label: "Job ID", + content: job.job_id + ? { + value: job.job_id, + copyableValue: job.job_id, + } + : { value: "-" }, + }, + { + label: "Submission ID", + content: job.submission_id + ? { + value: job.submission_id, + copyableValue: job.submission_id, + } + : { + value: "-", }, - }, - { - label: "Ended at", - content: { - value: job.end_time ? formatDateFromTimeMs(job.end_time) : "-", + }, + { + label: "Duration", + content: job.start_time ? ( + + ) : ( + - + ), + }, + { + label: "Started at", + content: { + value: job.start_time ? formatDateFromTimeMs(job.start_time) : "-", + }, + }, + { + label: "Ended at", + content: { + value: job.end_time ? formatDateFromTimeMs(job.end_time) : "-", + }, + }, + { + label: "Runtime environemnt", + ...(job.runtime_env + ? { + content: ( + + ), + } + : { + content: { + value: "-", + }, + }), + }, + ...(job.type === "SUBMISSION" + ? [ + { + label: "User-provided metadata", + content: + job.metadata && Object.keys(job.metadata).length ? ( + + ) : undefined, }, - }, - ]} - /> - -
+ ] + : []), + { + label: "Actions", + content: ( +
+ +
+ +
+ ), + }, + ]} + /> ); }; diff --git a/dashboard/client/src/pages/job/JobDetailLayout.tsx b/dashboard/client/src/pages/job/JobDetailLayout.tsx index 7825a9bf993f..0d4119eda364 100644 --- a/dashboard/client/src/pages/job/JobDetailLayout.tsx +++ b/dashboard/client/src/pages/job/JobDetailLayout.tsx @@ -4,29 +4,38 @@ import { RiInformationLine, RiLineChartLine, } from "react-icons/ri"; +import { Outlet } from "react-router-dom"; import { MainNavPageInfo } from "../layout/mainNavContext"; import { SideTabLayout, SideTabRouteLink } from "../layout/SideTabLayout"; import { useJobDetail } from "./hook/useJobDetail"; -export const JobDetailLayout = () => { - const { job } = useJobDetail(); +export const JobPage = () => { + const { job, params } = useJobDetail(); - const pageInfo = job + const jobId = job?.job_id ?? job?.submission_id; + const pageInfo = jobId ? { - title: job.job_id ?? "Job", - pageTitle: job.job_id ? `${job.job_id} | Job` : undefined, + title: jobId ?? "Job", + pageTitle: jobId ? `${jobId} | Job` : undefined, id: "job-detail", - path: job.job_id ? `/jobs/${job.job_id}` : undefined, + path: jobId, } : { title: "Job", id: "job-detail", - path: undefined, + path: params.id, }; + return ( +
+ + +
+ ); +}; +export const JobDetailLayout = () => { return ( - { + it("renders", async () => { + expect.assertions(6); + + mockedGet.mockResolvedValue({ + headers: { + "content-type": "text/plain", + }, + data: "1log line\nthis is a line\nHi\n10\nfoo", + }); + + render( + , + { wrapper: TEST_APP_WRAPPER }, + ); + + await screen.findByText(/log line/); + expect(screen.getByText(/log line/)).toBeVisible(); + expect(screen.getByText(/this is a line/)).toBeVisible(); + expect(screen.getByText(/Hi/)).toBeVisible(); + expect(screen.getByText(/10/)).toBeVisible(); + expect(screen.getByText(/foo/)).toBeVisible(); + + expect(mockedGet).toBeCalledWith( + "api/v0/logs/file?node_id=node-id-0&filename=job-driver-raysubmit_12345.log&lines=-1", + ); + }); +}); diff --git a/dashboard/client/src/pages/job/JobDriverLogs.tsx b/dashboard/client/src/pages/job/JobDriverLogs.tsx new file mode 100644 index 000000000000..62697d01d440 --- /dev/null +++ b/dashboard/client/src/pages/job/JobDriverLogs.tsx @@ -0,0 +1,67 @@ +import React, { useContext } from "react"; +import { GlobalContext } from "../../App"; +import { MultiTabLogViewer } from "../../common/MultiTabLogViewer"; +import { UnifiedJob } from "../../type/job"; + +type JobDriverLogsProps = { + job: Pick< + UnifiedJob, + | "job_id" + | "driver_node_id" + | "submission_id" + | "driver_agent_http_address" + | "driver_info" + | "type" + >; +}; + +export const JobDriverLogs = ({ job }: JobDriverLogsProps) => { + const { driver_node_id, submission_id, type } = job; + const filename = submission_id + ? 
`job-driver-${submission_id}.log` + : undefined; + + const { ipLogMap } = useContext(GlobalContext); + + let link: string | undefined; + + if (job.driver_agent_http_address) { + link = `/logs/${encodeURIComponent( + `${job.driver_agent_http_address}/logs`, + )}`; + } else if (job.driver_info && ipLogMap[job.driver_info.node_ip_address]) { + link = `/logs/${encodeURIComponent( + ipLogMap[job.driver_info.node_ip_address], + )}`; + } + + if (link && job.job_id) { + link += `?fileName=${job.job_id}`; + } else { + // Don't show "other logs" link if link is not available + // or job_id does not exist. + link = undefined; + } + + return ( + + ); +}; diff --git a/dashboard/client/src/pages/job/JobProgressBar.tsx b/dashboard/client/src/pages/job/JobProgressBar.tsx index d5d81b41124e..48be5e452952 100644 --- a/dashboard/client/src/pages/job/JobProgressBar.tsx +++ b/dashboard/client/src/pages/job/JobProgressBar.tsx @@ -1,4 +1,4 @@ -import { makeStyles } from "@material-ui/core"; +import { LinearProgress, makeStyles } from "@material-ui/core"; import React, { useEffect, useState } from "react"; import { UnifiedJob } from "../../type/job"; import { @@ -15,7 +15,7 @@ const useStyles = makeStyles((theme) => ({ })); type JobProgressBarProps = { - jobId: string; + jobId: string | undefined; job: Pick; } & Pick; @@ -41,12 +41,14 @@ export const JobProgressBar = ({ const { progress, + isLoading: progressLoading, driverExists, totalTasks, latestFetchTimestamp: progressTimestamp, } = useJobProgress(jobId, advancedProgressBarExpanded); const { progressGroups, + isLoading: progressGroupsLoading, total, totalTasks: advancedTotalTasks, latestFetchTimestamp: totalTimestamp, @@ -58,10 +60,20 @@ export const JobProgressBar = ({ if (!driverExists) { return ; } + + if ( + progressLoading && + (progressGroupsLoading || !advancedProgressBarRendered) + ) { + return ; + } + const { status } = job; // Use whichever data was received the most recently // Note these values may disagree in some way. 
It might better to consistently use one endpoint. const [totalProgress, finalTotalTasks] = + total === undefined || + advancedTotalTasks === undefined || progressTimestamp > totalTimestamp ? [progress, totalTasks] : [total, advancedTotalTasks]; diff --git a/dashboard/client/src/pages/job/JobRow.tsx b/dashboard/client/src/pages/job/JobRow.tsx index 724d69530408..fb5d1f856c34 100644 --- a/dashboard/client/src/pages/job/JobRow.tsx +++ b/dashboard/client/src/pages/job/JobRow.tsx @@ -1,17 +1,16 @@ -import { TableCell, TableRow, Tooltip } from "@material-ui/core"; +import { Link, TableCell, TableRow, Tooltip } from "@material-ui/core"; import { makeStyles } from "@material-ui/core/styles"; import React from "react"; -import { Link } from "react-router-dom"; +import { Link as RouterLink } from "react-router-dom"; import { DurationText } from "../../common/DurationText"; import { formatDateFromTimeMs } from "../../common/formatUtils"; +import { JobStatusWithIcon } from "../../common/JobStatus"; import { CpuProfilingLink, CpuStackTraceLink, } from "../../common/ProfilingLink"; -import { StatusChip } from "../../components/StatusChip"; import { UnifiedJob } from "../../type/job"; import { useJobProgress } from "./hook/useJobProgress"; -import { JobLogsLink } from "./JobDetail"; import { MiniTaskProgressBar } from "./TaskProgressBar"; const useStyles = makeStyles((theme) => ({ @@ -57,10 +56,22 @@ export const JobRow = ({ job }: JobRowProps) => { } })(); + const jobId = job_id ? job_id : submission_id; + return ( - {job_id ? {job_id} : "-"} + {job_id ? ( + + {job_id} + + ) : submission_id ? ( + + (no ray driver) + + ) : ( + "(no ray driver)" + )} {submission_id ?? "-"} @@ -74,7 +85,7 @@ export const JobRow = ({ job }: JobRowProps) => { - + {start_time && start_time > 0 ? ( @@ -85,10 +96,14 @@ export const JobRow = ({ job }: JobRowProps) => { {progressBar} - {/* TODO(aguo): Also show logs for the job id instead - of just the submission's logs */} - -
+ {jobId && ( + + + Log + +
+
+ )} ({ - root: { - padding: theme.spacing(2, 0, 0), - }, button: { marginTop: theme.spacing(2), }, @@ -26,7 +23,7 @@ export const TaskTimeline = ({ jobId }: TaskTimelineProps) => { const classes = useStyle(); return ( -
+
{/* TODO(aguo): Add link to external documentation about Timeline view. */} Timeline view shows how tasks are executed across different nodes and diff --git a/dashboard/client/src/pages/job/hook/mockedUseJobList.ts b/dashboard/client/src/pages/job/hook/mockedUseJobList.ts new file mode 100644 index 000000000000..e343379a015d --- /dev/null +++ b/dashboard/client/src/pages/job/hook/mockedUseJobList.ts @@ -0,0 +1,29 @@ +import { JobStatus } from "../../../type/job"; + +export const JOB_LIST = [ + { + job_id: "01000000", + submission_id: "raysubmit_12345", + status: JobStatus.PENDING, + }, + { + job_id: "02000000", + submission_id: null, + status: JobStatus.FAILED, + }, + { + job_id: null, + submission_id: "raysubmit_23456", + status: JobStatus.RUNNING, + }, + { + job_id: "04000000", + submission_id: "raysubmit_34567", + status: JobStatus.STOPPED, + }, + { + job_id: "05000000", + submission_id: "raysubmit_45678", + status: JobStatus.SUCCEEDED, + }, +] as any; diff --git a/dashboard/client/src/pages/job/hook/useJobDetail.ts b/dashboard/client/src/pages/job/hook/useJobDetail.ts index 7dda767a297e..9c31ab4ad1a0 100644 --- a/dashboard/client/src/pages/job/hook/useJobDetail.ts +++ b/dashboard/client/src/pages/job/hook/useJobDetail.ts @@ -10,11 +10,11 @@ export const useJobDetail = () => { const [msg, setMsg] = useState("Loading the job detail"); const [refreshing, setRefresh] = useState(true); const { ipLogMap } = useContext(GlobalContext); - const { data: job } = useSWR( - "useJobDetail", - async () => { + const { data: job, isLoading } = useSWR( + ["useJobDetail", params.id], + async ([_, jobId]) => { try { - const rsp = await getJobDetail(params.id); + const rsp = await getJobDetail(jobId); return rsp.data; } catch (e) { setMsg("Job Query Error Please Check JobId"); @@ -26,6 +26,7 @@ export const useJobDetail = () => { return { job, + isLoading, msg, params, ipLogMap, diff --git a/dashboard/client/src/pages/job/hook/useJobList.ts 
b/dashboard/client/src/pages/job/hook/useJobList.ts index ba4beedb6b6c..8ed6079be5a4 100644 --- a/dashboard/client/src/pages/job/hook/useJobList.ts +++ b/dashboard/client/src/pages/job/hook/useJobList.ts @@ -30,7 +30,7 @@ export const useJobList = () => { }; refreshRef.current = isRefreshing; - const { data } = useSWR( + const { data, isLoading } = useSWR( "useJobList", async () => { const rsp = await getJobList(); @@ -52,6 +52,7 @@ export const useJobList = () => { filter.every((f) => node[f.key] && (node[f.key] ?? "").includes(f.val)), ), msg, + isLoading, isRefreshing, onSwitchChange, changeFilter, diff --git a/dashboard/client/src/pages/job/hook/useJobProgress.ts b/dashboard/client/src/pages/job/hook/useJobProgress.ts index 8da49fe719fb..a3cc6fc01e87 100644 --- a/dashboard/client/src/pages/job/hook/useJobProgress.ts +++ b/dashboard/client/src/pages/job/hook/useJobProgress.ts @@ -15,21 +15,42 @@ import { } from "../../../type/job"; import { TypeTaskStatus } from "../../../type/task"; -const TASK_STATE_NAME_TO_PROGRESS_KEY: Record< - TypeTaskStatus, +export enum TaskStatus { + PENDING_ARGS_AVAIL = "PENDING_ARGS_AVAIL", + PENDING_NODE_ASSIGNMENT = "PENDING_NODE_ASSIGNMENT", + SUBMITTED_TO_WORKER = "SUBMITTED_TO_WORKER", + RUNNING = "RUNNING", + FINISHED = "FINISHED", + FAILED = "FAILED", + UNKNOWN = "UNKNOWN", +} + +const TASK_STATE_NAME_TO_PROGRESS_KEY: Record = { + [TypeTaskStatus.PENDING_ARGS_AVAIL]: TaskStatus.PENDING_ARGS_AVAIL, + [TypeTaskStatus.PENDING_NODE_ASSIGNMENT]: TaskStatus.PENDING_NODE_ASSIGNMENT, + [TypeTaskStatus.PENDING_OBJ_STORE_MEM_AVAIL]: + TaskStatus.PENDING_NODE_ASSIGNMENT, + [TypeTaskStatus.PENDING_ARGS_FETCH]: TaskStatus.PENDING_NODE_ASSIGNMENT, + [TypeTaskStatus.SUBMITTED_TO_WORKER]: TaskStatus.SUBMITTED_TO_WORKER, + [TypeTaskStatus.RUNNING]: TaskStatus.RUNNING, + [TypeTaskStatus.RUNNING_IN_RAY_GET]: TaskStatus.RUNNING, + [TypeTaskStatus.RUNNING_IN_RAY_WAIT]: TaskStatus.RUNNING, + [TypeTaskStatus.FINISHED]: TaskStatus.FINISHED, + 
[TypeTaskStatus.FAILED]: TaskStatus.FAILED, + [TypeTaskStatus.NIL]: TaskStatus.UNKNOWN, +}; + +export const TaskStatusToTaskProgressMapping: Record< + TaskStatus, keyof TaskProgress > = { - [TypeTaskStatus.PENDING_ARGS_AVAIL]: "numPendingArgsAvail", - [TypeTaskStatus.PENDING_NODE_ASSIGNMENT]: "numPendingNodeAssignment", - [TypeTaskStatus.PENDING_OBJ_STORE_MEM_AVAIL]: "numPendingNodeAssignment", - [TypeTaskStatus.PENDING_ARGS_FETCH]: "numPendingNodeAssignment", - [TypeTaskStatus.SUBMITTED_TO_WORKER]: "numSubmittedToWorker", - [TypeTaskStatus.RUNNING]: "numRunning", - [TypeTaskStatus.RUNNING_IN_RAY_GET]: "numRunning", - [TypeTaskStatus.RUNNING_IN_RAY_WAIT]: "numRunning", - [TypeTaskStatus.FINISHED]: "numFinished", - [TypeTaskStatus.FAILED]: "numFailed", - [TypeTaskStatus.NIL]: "numUnknown", + [TaskStatus.PENDING_ARGS_AVAIL]: "numPendingArgsAvail", + [TaskStatus.PENDING_NODE_ASSIGNMENT]: "numPendingNodeAssignment", + [TaskStatus.SUBMITTED_TO_WORKER]: "numSubmittedToWorker", + [TaskStatus.RUNNING]: "numRunning", + [TaskStatus.FINISHED]: "numFinished", + [TaskStatus.FAILED]: "numFailed", + [TaskStatus.UNKNOWN]: "numUnknown", }; const useFetchStateApiProgressByTaskName = ( @@ -52,7 +73,7 @@ const useFetchStateApiProgressByTaskName = ( const summary = formatSummaryToTaskProgress( rsp.data.data.result.result, ); - return { summary, totalTasks: rsp.data.data.result.total }; + return { summary, totalTasks: rsp.data.data.result.num_filtered }; } else { setError(true); setRefresh(false); @@ -82,7 +103,7 @@ export const useJobProgress = ( const [error, setError] = useState(false); const [isRefreshing, setRefresh] = useState(true); const [latestFetchTimestamp, setLatestFetchTimestamp] = useState(0); - const { data } = useFetchStateApiProgressByTaskName( + const { data, isLoading } = useFetchStateApiProgressByTaskName( jobId, isRefreshing, setMsg, @@ -104,6 +125,7 @@ export const useJobProgress = ( return { progress: summed, totalTasks: data?.totalTasks, + isLoading, msg, error, 
driverExists, @@ -128,7 +150,7 @@ export const useJobProgressByTaskName = (jobId: string) => { setRefresh(event.target.checked); }; - const { data } = useFetchStateApiProgressByTaskName( + const { data, isLoading } = useFetchStateApiProgressByTaskName( jobId, isRefreshing, setMsg, @@ -167,6 +189,7 @@ export const useJobProgressByTaskName = (jobId: string) => { page: { pageNo: page, pageSize: 10 }, total: formattedTasks.length, totalTasks: data?.totalTasks, + isLoading, setPage, msg, error, @@ -179,8 +202,11 @@ const formatStateCountsToProgress = (stateCounts: { }) => { const formattedProgress: TaskProgress = {}; Object.entries(stateCounts).forEach(([state, count]) => { + const taskStatus: TaskStatus = + TASK_STATE_NAME_TO_PROGRESS_KEY[state as TypeTaskStatus]; + const key: keyof TaskProgress = - TASK_STATE_NAME_TO_PROGRESS_KEY[state as TypeTaskStatus] ?? "numUnknown"; + TaskStatusToTaskProgressMapping[taskStatus] ?? "numUnknown"; formattedProgress[key] = (formattedProgress[key] ?? 0) + count; }); @@ -252,7 +278,7 @@ export const useJobProgressByLineage = ( const [isRefreshing, setRefresh] = useState(true); const [latestFetchTimestamp, setLatestFetchTimestamp] = useState(0); - const { data } = useSWR( + const { data, isLoading } = useSWR( jobId ? 
["useJobProgressByLineageAndName", jobId] : null, async ([_, jobId]) => { const rsp = await getStateApiJobProgressByLineage(jobId); @@ -263,7 +289,7 @@ export const useJobProgressByLineage = ( const summary = formatNestedJobProgressToJobProgressGroup( rsp.data.data.result.result, ); - return { summary, totalTasks: rsp.data.data.result.total }; + return { summary, totalTasks: rsp.data.data.result.num_filtered }; } else { setError(true); setRefresh(false); @@ -280,6 +306,7 @@ export const useJobProgressByLineage = ( progressGroups: data?.summary?.progressGroups, total: data?.summary?.total, totalTasks: data?.totalTasks, + isLoading, msg, error, latestFetchTimestamp, diff --git a/dashboard/client/src/pages/job/index.tsx b/dashboard/client/src/pages/job/index.tsx index 4fe82836ab34..596630cc3ce9 100644 --- a/dashboard/client/src/pages/job/index.tsx +++ b/dashboard/client/src/pages/job/index.tsx @@ -45,14 +45,7 @@ const columns = [ helpInfo: ( The progress of the all submitted tasks per job. Tasks that are not yet - submitted will not show up in the progress bar. -
-
- Note: This column requires that prometheus is running. See{" "} - - here - {" "} - for instructions. + submitted do not show up in the progress bar.
), }, @@ -68,6 +61,7 @@ const JobList = () => { const classes = useStyles(); const { msg, + isLoading, isRefreshing, onSwitchChange, jobList, @@ -78,7 +72,7 @@ const JobList = () => { return (
- + Auto Refresh: { return null; } + let currentPath = ""; + return (
{mainNavPageHierarchy.map(({ title, id, path }, index) => { + if (path) { + if (path.startsWith("/")) { + currentPath = path; + } else { + currentPath = `${currentPath}/${path}`; + } + } const linkOrText = path ? ( {title} diff --git a/dashboard/client/src/pages/layout/mainNavContext.ts b/dashboard/client/src/pages/layout/mainNavContext.ts index 0937fd082d2a..ff030d803670 100644 --- a/dashboard/client/src/pages/layout/mainNavContext.ts +++ b/dashboard/client/src/pages/layout/mainNavContext.ts @@ -11,13 +11,16 @@ export type MainNavPage = { pageTitle?: string; /** * This helps identifies the current page a user is on and highlights the nav bar correctly. - * This should be unique per page. + * This should be unique per page within an hiearchy. i.e. you should NOT put two pages with the same ID + * as parents or children of each other. * DO NOT change the pageId of a page. The behavior of the main nav and * breadcrumbs is undefined in that case. */ id: string; /** * URL to link to access this route. + * If this begins with a `/`, it is treated as an absolute path. + * If not, this is treated as a relative path and the path is appended to the parent breadcrumb's path. 
*/ path?: string; }; diff --git a/dashboard/client/src/pages/log/LogViewer.tsx b/dashboard/client/src/pages/log/LogViewer.tsx new file mode 100644 index 000000000000..cb155b9417c7 --- /dev/null +++ b/dashboard/client/src/pages/log/LogViewer.tsx @@ -0,0 +1,195 @@ +import { + Button, + createStyles, + InputAdornment, + LinearProgress, + makeStyles, + Switch, + TextField, +} from "@material-ui/core"; +import { SearchOutlined } from "@material-ui/icons"; +import React, { useState } from "react"; +import LogVirtualView from "../../components/LogView/LogVirtualView"; + +const useStyles = makeStyles((theme) => + createStyles({ + search: { + margin: theme.spacing(1), + }, + }), +); + +const useLogViewer = () => { + const [search, setSearch] = + useState<{ + keywords?: string; + lineNumber?: string; + fontSize?: number; + revert?: boolean; + }>(); + const [startTime, setStart] = useState(); + const [endTime, setEnd] = useState(); + + return { + search, + setSearch, + startTime, + setStart, + endTime, + setEnd, + }; +}; + +type LogViewerProps = { + path?: string; + log: string; + downloadUrl?: string; + onRefreshClick?: () => void; + height?: number; +}; + +export const LogViewer = ({ + path, + log, + downloadUrl, + onRefreshClick, + height = 600, +}: LogViewerProps) => { + const classes = useStyles(); + + const { search, setSearch, startTime, setStart, endTime, setEnd } = + useLogViewer(); + + return ( + + {log !== "Loading..." && ( +
+
+ { + setSearch({ ...search, keywords: value }); + }, + type: "", + endAdornment: ( + + + + ), + }} + /> + { + setSearch({ ...search, lineNumber: value }); + }, + type: "", + endAdornment: ( + + + + ), + }} + /> + { + setSearch({ ...search, fontSize: Number(value) }); + }, + type: "", + }} + /> + { + setStart(val.target.value); + }} + InputLabelProps={{ + shrink: true, + }} + /> + { + setEnd(val.target.value); + }} + InputLabelProps={{ + shrink: true, + }} + /> +
+ Reverse:{" "} + setSearch({ ...search, revert: v })} + /> + {onRefreshClick && ( + + )} + + {downloadUrl && path && ( + + )} +
+
+ +
+ )} + {log === "Loading..." && ( +
+
+ +
+ )} +
+ ); +}; diff --git a/dashboard/client/src/pages/log/Logs.tsx b/dashboard/client/src/pages/log/Logs.tsx index a44253595a51..61eb93468840 100644 --- a/dashboard/client/src/pages/log/Logs.tsx +++ b/dashboard/client/src/pages/log/Logs.tsx @@ -1,22 +1,11 @@ -import { - Button, - InputAdornment, - LinearProgress, - List, - ListItem, - makeStyles, - Paper, - Switch, - TextField, -} from "@material-ui/core"; -import { SearchOutlined } from "@material-ui/icons"; -import React, { useEffect, useRef, useState } from "react"; +import { Button, List, ListItem, makeStyles, Paper } from "@material-ui/core"; +import React, { useEffect, useState } from "react"; import { Outlet, useLocation, useParams } from "react-router-dom"; -import LogVirtualView from "../../components/LogView/LogVirtualView"; import { SearchInput } from "../../components/SearchComponent"; import TitleCard from "../../components/TitleCard"; import { getLogDetail, getLogDownloadUrl } from "../../service/log"; import { MainNavPageInfo } from "../layout/mainNavContext"; +import { LogViewer } from "./LogViewer"; const useStyles = makeStyles((theme) => ({ root: { @@ -36,30 +25,16 @@ const useStyles = makeStyles((theme) => ({ }, })); -type LogsProps = { - theme?: "dark" | "light"; -}; - -const useLogs = ({ theme }: LogsProps) => { +const useLogs = () => { const { search: urlSearch } = useLocation(); const { host, path } = useParams(); const searchMap = new URLSearchParams(urlSearch); const urlFileName = searchMap.get("fileName"); - const el = useRef(null); const [origin, setOrigin] = useState(); - const [search, setSearch] = - useState<{ - keywords?: string; - lineNumber?: string; - fontSize?: number; - revert?: boolean; - }>(); const [fileName, setFileName] = useState(searchMap.get("fileName") || ""); const [log, setLogs] = useState(); const [downloadUrl, setDownloadUrl] = useState(); - const [startTime, setStart] = useState(); - const [endTime, setEnd] = useState(); useEffect(() => { setFileName(urlFileName || ""); 
@@ -97,37 +72,14 @@ const useLogs = ({ theme }: LogsProps) => { downloadUrl, host, path, - el, - search, - setSearch, - theme, fileName, setFileName, - startTime, - setStart, - endTime, - setEnd, }; }; -const Logs = (props: LogsProps) => { +const Logs = () => { const classes = useStyles(); - const { - log, - origin, - downloadUrl, - path, - el, - search, - setSearch, - theme, - fileName, - setFileName, - startTime, - setStart, - endTime, - setEnd, - } = useLogs(props); + const { log, origin, downloadUrl, path, fileName, setFileName } = useLogs(); let href = "#/logs/"; if (origin) { @@ -142,7 +94,7 @@ const Logs = (props: LogsProps) => { } } return ( -
+
{!origin &&

Select a node to view logs

} @@ -191,125 +143,8 @@ const Logs = (props: LogsProps) => { ))} )} - {typeof log === "string" && log !== "Loading..." && ( -
-
- { - setSearch({ ...search, keywords: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, lineNumber: value }); - }, - type: "", - endAdornment: ( - - - - ), - }} - /> - { - setSearch({ ...search, fontSize: Number(value) }); - }, - type: "", - }} - /> - { - setStart(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> - { - setEnd(val.target.value); - }} - InputLabelProps={{ - shrink: true, - }} - /> -
- Reverse:{" "} - setSearch({ ...search, revert: v })} - /> - - {downloadUrl && path && ( - - )} -
-
- -
- )} - {log === "Loading..." && ( -
-
- -
+ {typeof log === "string" && ( + )}
diff --git a/dashboard/client/src/pages/log/hooks.ts b/dashboard/client/src/pages/log/hooks.ts new file mode 100644 index 000000000000..fe333ba96a2e --- /dev/null +++ b/dashboard/client/src/pages/log/hooks.ts @@ -0,0 +1,31 @@ +import useSWR from "swr"; +import { + getStateApiDownloadLogUrl, + getStateApiLog, + StateApiLogInput, +} from "../../service/log"; + +export const useStateApiLogs = ( + props: StateApiLogInput, + path: string | undefined, +) => { + const downloadUrl = getStateApiDownloadLogUrl(props); + + const { + data: log, + isLoading, + mutate, + } = useSWR( + downloadUrl ? ["useDriverLogs", downloadUrl] : null, + async ([_]) => { + return getStateApiLog(props); + }, + ); + + return { + log: isLoading ? "Loading..." : log, + downloadUrl, + refresh: mutate, + path, + }; +}; diff --git a/dashboard/client/src/pages/node/ClusterDetailInfoPage.tsx b/dashboard/client/src/pages/node/ClusterDetailInfoPage.tsx index 06a69ea404b7..a44636abebaf 100644 --- a/dashboard/client/src/pages/node/ClusterDetailInfoPage.tsx +++ b/dashboard/client/src/pages/node/ClusterDetailInfoPage.tsx @@ -18,7 +18,7 @@ export const ClusterDetailInfoPage = () => { // TODO(aguo): Add more content to this page! const classes = useStyle(); - const { clusterDetail, msg } = useClusterDetail(); + const { clusterDetail, msg, isLoading } = useClusterDetail(); if (!clusterDetail) { return ( @@ -27,10 +27,10 @@ export const ClusterDetailInfoPage = () => { pageInfo={{ title: "Cluster Info", id: "cluster-info", - path: undefined, + path: "info", }} /> - +
diff --git a/dashboard/client/src/pages/node/ClusterLayout.tsx b/dashboard/client/src/pages/node/ClusterLayout.tsx index c74f6d7a3909..16a02d05eb97 100644 --- a/dashboard/client/src/pages/node/ClusterLayout.tsx +++ b/dashboard/client/src/pages/node/ClusterLayout.tsx @@ -1,17 +1,12 @@ import React from "react"; -import { RiInformationLine, RiTableAltLine } from "react-icons/ri"; +import { RiInformationLine, RiTableLine } from "react-icons/ri"; import { SideTabLayout, SideTabRouteLink } from "../layout/SideTabLayout"; export const ClusterLayout = () => { return ( - + ); }; diff --git a/dashboard/client/src/pages/node/NodeDetail.tsx b/dashboard/client/src/pages/node/NodeDetail.tsx index 6c3399301777..9a6dd2aee7f0 100644 --- a/dashboard/client/src/pages/node/NodeDetail.tsx +++ b/dashboard/client/src/pages/node/NodeDetail.tsx @@ -43,6 +43,7 @@ const NodeDetailPage = () => { selectedTab, nodeDetail, msg, + isLoading, isRefreshing, onRefreshChange, raylet, @@ -59,7 +60,7 @@ const NodeDetailPage = () => { path: `/cluster/nodes/${params.id}`, }} /> - + { it("renders", async () => { - const node: NodeDetail = { - hostname: "test-hostname", - ip: "192.168.0.1", - cpu: 15, - mem: [100, 95, 5], - disk: { - "/": { - used: 20000000, - total: 200000000, - free: 180000000, - percent: 10, - }, - "/tmp": { - used: 0, - total: 200, - free: 200, - percent: 0, - }, - }, - networkSpeed: [5, 10], - raylet: { - state: "ALIVE", - nodeId: "1234567890ab", - isHeadNode: true, - numWorkers: 0, - pid: 2345, - startTime: 100, - terminateTime: -1, - brpcPort: 3456, - nodeManagerPort: 5890, - objectStoreAvailableMemory: 40, - objectStoreUsedMemory: 10, - }, - logUrl: "http://192.16.0.1/logs", - } as NodeDetail; render( { /* purposefully empty */ @@ -77,64 +99,31 @@ describe("NodeRow", () => { expect(screen.getByText(/5.0000B\/s/)).toBeVisible(); expect(screen.getByText(/10.0000B\/s/)).toBeVisible(); }); -}); -describe("WorkerRow", () => { - it("renders", async () => { - const node: NodeDetail = { - 
hostname: "test-hostname", - ip: "192.168.0.1", - cpu: 15, - mem: [100, 95, 5], - disk: { - "/": { - used: 20000000, - total: 200000000, - free: 180000000, - percent: 10, - }, - "/tmp": { - used: 0, - total: 200, - free: 200, - percent: 0, - }, - }, - networkSpeed: [5, 10], - raylet: { - state: "ALIVE", - nodeId: "1234567890ab", - isHeadNode: true, - numWorkers: 0, - pid: 2345, - startTime: 100, - terminateTime: -1, - brpcPort: 3456, - nodeManagerPort: 5890, - objectStoreAvailableMemory: 40, - objectStoreUsedMemory: 10, + it("Disable actions for Dead node", async () => { + render( + , + { + wrapper: ({ children }) => ( + + + {children} +
+
+ ), }, - logUrl: "http://192.16.0.1/logs", - } as NodeDetail; + ); + await screen.findByText("test-hostname"); + // Could not access logs for Dead nodes(the log is hidden) + expect(screen.queryByLabelText(/Log/)).not.toBeInTheDocument(); - const worker: Worker = { - cmdline: ["echo hi"], - pid: 3456, - cpuPercent: 14, - memoryInfo: { - rss: 75, - vms: 0, - pageins: 0, - pfaults: 0, - }, - coreWorkerStats: [ - { - workerId: "worker-12345", - } as CoreWorkerStats, - ], - } as Worker; + expect(screen.getByText(/3456/)).toBeVisible(); + }); +}); - render(, { +describe("WorkerRow", () => { + it("renders", async () => { + render(, { wrapper: ({ children }) => ( diff --git a/dashboard/client/src/pages/node/NodeRow.tsx b/dashboard/client/src/pages/node/NodeRow.tsx index a8d9d4a1b811..5456afef45df 100644 --- a/dashboard/client/src/pages/node/NodeRow.tsx +++ b/dashboard/client/src/pages/node/NodeRow.tsx @@ -60,6 +60,11 @@ export const NodeRow = ({ const objectStoreTotalMemory = raylet.objectStoreAvailableMemory + raylet.objectStoreUsedMemory; + /** + * Why do we use raylet.state instead of node.state in the following code? 
+ * Because in ray, raylet == node + */ + return ( @@ -94,7 +99,9 @@ export const NodeRow = ({ - Log + {raylet.state !== "DEAD" && ( + Log + )} diff --git a/dashboard/client/src/pages/node/hook/useClusterDetail.ts b/dashboard/client/src/pages/node/hook/useClusterDetail.ts index 7d0abc5d6ef6..8bfe35c44085 100644 --- a/dashboard/client/src/pages/node/hook/useClusterDetail.ts +++ b/dashboard/client/src/pages/node/hook/useClusterDetail.ts @@ -6,7 +6,7 @@ import { getClusterMetadata } from "../../../service/global"; export const useClusterDetail = () => { const [msg, setMsg] = useState("Loading the job detail"); const [refreshing, setRefresh] = useState(true); - const { data: clusterDetail } = useSWR( + const { data: clusterDetail, isLoading } = useSWR( "useClusterDetail", async () => { try { @@ -23,5 +23,6 @@ export const useClusterDetail = () => { return { clusterDetail, msg, + isLoading, }; }; diff --git a/dashboard/client/src/pages/node/hook/useNodeDetail.ts b/dashboard/client/src/pages/node/hook/useNodeDetail.ts index 3dd00e01818b..5d06fba25446 100644 --- a/dashboard/client/src/pages/node/hook/useNodeDetail.ts +++ b/dashboard/client/src/pages/node/hook/useNodeDetail.ts @@ -15,7 +15,7 @@ export const useNodeDetail = () => { setRefresh(event.target.checked); }; - const { data: nodeDetail } = useSWR( + const { data: nodeDetail, isLoading } = useSWR( ["useNodeDetail", params.id], async ([_, nodeId]) => { const { data } = await getNodeDetail(nodeId); @@ -47,6 +47,7 @@ export const useNodeDetail = () => { selectedTab, nodeDetail, msg, + isLoading, isRefreshing, onRefreshChange, raylet, diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts index 3409138ad712..ce9c764cf1a5 100644 --- a/dashboard/client/src/pages/node/hook/useNodeList.ts +++ b/dashboard/client/src/pages/node/hook/useNodeList.ts @@ -26,7 +26,7 @@ export const useNodeList = () => { const onSwitchChange = (event: React.ChangeEvent) => { 
setRefresh(event.target.checked); }; - const { data } = useSWR( + const { data, isLoading } = useSWR( "useNodeList", async () => { const { data } = await getNodeList(); @@ -62,6 +62,7 @@ export const useNodeList = () => { filter.every((f) => node[f.key] && node[f.key].includes(f.val)), ), msg, + isLoading, isRefreshing, onSwitchChange, changeFilter, diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx index 9f2f4bf8fc8e..3753af2d2213 100644 --- a/dashboard/client/src/pages/node/index.tsx +++ b/dashboard/client/src/pages/node/index.tsx @@ -158,6 +158,7 @@ const Nodes = () => { const classes = useStyles(); const { msg, + isLoading, isRefreshing, onSwitchChange, nodeList, @@ -172,7 +173,7 @@ const Nodes = () => { return (
- + Auto Refresh: createStyles({ @@ -31,6 +39,9 @@ const useStyles = makeStyles((theme) => maxWidth: `calc((100% - ${theme.spacing(3)}px * 2) / 3)`, }, }, + autoscalerCard: { + padding: theme.spacing(2, 3), + }, section: { marginTop: theme.spacing(4), }, @@ -40,6 +51,8 @@ const useStyles = makeStyles((theme) => export const OverviewPage = () => { const classes = useStyles(); + const { cluster_status } = useRayStatus(); + return (
{ />
- +
+ + { +
+ + + + + + + +
+ } +
+ { expect(link).toHaveAttribute("href"); }); - it("disables link when job_id is null", async () => { + it("link is active for driverless job(only have submission_id)", async () => { render(, { wrapper: MemoryRouter }); await screen.findByText("01000000"); - expect(screen.queryByRole("link", { name: "raysubmit_23456" })).toBeNull(); + + expect( + screen.queryByRole("link", { name: "raysubmit_23456" }), + ).toBeVisible(); }); }); diff --git a/dashboard/client/src/pages/overview/cards/RecentJobsCard.tsx b/dashboard/client/src/pages/overview/cards/RecentJobsCard.tsx index 1a0ccdb684c3..9d43820505f1 100644 --- a/dashboard/client/src/pages/overview/cards/RecentJobsCard.tsx +++ b/dashboard/client/src/pages/overview/cards/RecentJobsCard.tsx @@ -1,33 +1,15 @@ -import { createStyles, makeStyles, Typography } from "@material-ui/core"; -import classNames from "classnames"; +import { createStyles, makeStyles } from "@material-ui/core"; import _ from "lodash"; import React from "react"; -import { - RiCheckboxCircleFill, - RiCloseCircleFill, - RiLoader4Line, -} from "react-icons/ri"; -import { Link } from "react-router-dom"; +import { JobStatusIcon } from "../../../common/JobStatus"; +import { ListItemCard } from "../../../components/ListItemCard"; import { UnifiedJob } from "../../../type/job"; import { useJobList } from "../../job/hook/useJobList"; -import { LinkWithArrow, OverviewCard } from "./OverviewCard"; const useStyles = makeStyles((theme) => createStyles({ - root: { - display: "flex", - flexDirection: "column", - padding: theme.spacing(2, 3), - }, - listContainer: { - marginTop: theme.spacing(2), - flex: 1, - overflow: "hidden", - }, - listItem: { - "&:not(:first-child)": { - marginTop: theme.spacing(1), - }, + icon: { + marginRight: theme.spacing(1), }, }), ); @@ -36,6 +18,15 @@ type RecentJobsCardProps = { className?: string; }; +const getLink = (job: UnifiedJob) => { + if (job.job_id !== null && job.job_id !== "") { + return `/jobs/${job.job_id}`; + } else if 
(job.submission_id !== null && job.submission_id !== "") { + return `/jobs/${job.submission_id}`; + } + return undefined; +}; + export const RecentJobsCard = ({ className }: RecentJobsCardProps) => { const classes = useStyles(); @@ -43,135 +34,24 @@ export const RecentJobsCard = ({ className }: RecentJobsCardProps) => { const sortedJobs = _.orderBy(jobList, ["startTime"], ["desc"]).slice(0, 6); - return ( - - Recent jobs -
- {sortedJobs.map((job) => ( - - ))} - {sortedJobs.length === 0 && ( - No jobs yet... - )} -
- -
- ); -}; + const sortedJobToRender = sortedJobs.map((job) => { + return { + title: job.job_id ?? job.submission_id ?? undefined, + subtitle: job.entrypoint, + link: getLink(job), + className: className, + icon: , + }; + }); -const useRecentJobListItemStyles = makeStyles((theme) => - createStyles({ - root: { - display: "flex", - flexDirection: "row", - flexWrap: "nowrap", - alignItems: "center", - textDecoration: "none", - }, - icon: { - width: 24, - height: 24, - marginRight: theme.spacing(1), - flex: "0 0 20px", - }, - "@keyframes spinner": { - from: { - transform: "rotate(0deg)", - }, - to: { - transform: "rotate(360deg)", - }, - }, - colorSuccess: { - color: theme.palette.success.main, - }, - colorError: { - color: theme.palette.error.main, - }, - iconRunning: { - color: "#1E88E5", - animationName: "$spinner", - animationDuration: "1000ms", - animationIterationCount: "infinite", - animationTimingFunction: "linear", - }, - textContainer: { - flex: "1 1 auto", - width: `calc(100% - ${theme.spacing(1) + 20}px)`, - }, - title: { - color: "#036DCF", - }, - entrypoint: { - overflow: "hidden", - textOverflow: "ellipsis", - whiteSpace: "nowrap", - color: "#5F6469", - }, - }), -); - -type RecentJobListItemProps = { - job: UnifiedJob; - className?: string; -}; - -const RecentJobListItem = ({ job, className }: RecentJobListItemProps) => { - const classes = useRecentJobListItemStyles(); - - const icon = (() => { - switch (job.status) { - case "SUCCEEDED": - return ( - - ); - case "FAILED": - case "STOPPED": - return ( - - ); - default: - return ( - - ); - } - })(); - const cardContent = ( - - {icon} -
- - {job.job_id ?? job.submission_id} - - - {job.entrypoint} - -
-
- ); return ( -
- {job.job_id !== null && job.job_id !== "" ? ( - - {cardContent} - - ) : ( -
{cardContent}
- )} -
+ ); }; diff --git a/dashboard/client/src/pages/overview/cards/RecentServeCard.component.test.tsx b/dashboard/client/src/pages/overview/cards/RecentServeCard.component.test.tsx new file mode 100644 index 000000000000..ccb62851d5bc --- /dev/null +++ b/dashboard/client/src/pages/overview/cards/RecentServeCard.component.test.tsx @@ -0,0 +1,83 @@ +import { render, screen } from "@testing-library/react"; +import React from "react"; +import { getServeApplications } from "../../../service/serve"; +import { + ServeApplicationStatus, + ServeDeploymentMode, +} from "../../../type/serve"; +import { TEST_APP_WRAPPER } from "../../../util/test-utils"; +import { RecentServeCard } from "./RecentServeCard"; + +jest.mock("../../../service/serve"); + +const mockGetServeApplications = jest.mocked(getServeApplications); + +describe("RecentServeCard", () => { + beforeEach(() => { + mockGetServeApplications.mockResolvedValue({ + data: { + http_options: { host: "1.2.3.4", port: 8000 }, + proxy_location: ServeDeploymentMode.EveryNode, + applications: { + home: { + name: "home", + route_prefix: "/", + message: null, + status: ServeApplicationStatus.RUNNING, + deployed_app_config: { + import_path: "home:graph", + }, + last_deployed_time_s: new Date().getTime() / 1000, + }, + "second-app": { + name: "second-app", + route_prefix: "/second-app", + message: null, + status: ServeApplicationStatus.DEPLOYING, + deployed_app_config: null, + last_deployed_time_s: new Date().getTime() / 1000, + deployments: {}, + }, + }, + }, + } as any); + }); + + it("should display serve applications with deployed_app_config", async () => { + render(, { + wrapper: TEST_APP_WRAPPER, + }); + + await screen.findByText("View all applications"); + + expect.assertions(3); + expect(screen.getByText("home")).toBeInTheDocument(); + expect(screen.getByText("home:graph")).toBeInTheDocument(); + expect(screen.getByText("Serve Applications")).toBeInTheDocument(); + }); + + it("should display serve applications without 
deployed_app_config", async () => { + render(, { + wrapper: TEST_APP_WRAPPER, + }); + + await screen.findByText("View all applications"); + + expect.assertions(3); + expect(screen.getByText("second-app")).toBeInTheDocument(); + expect(screen.getByText("-")).toBeInTheDocument(); // default value for no deployed_app_config + expect(screen.getByText("Serve Applications")).toBeInTheDocument(); + }); + + it("should navigate to the applications page when the 'View all applications' link is clicked", async () => { + render(, { + wrapper: TEST_APP_WRAPPER, + }); + + await screen.findByText("View all applications"); + const link = screen.getByRole("link", { + name: /view all applications/i, + }); + expect(link).toHaveAttribute("href"); + }); +}); diff --git a/dashboard/client/src/pages/overview/cards/RecentServeCard.tsx b/dashboard/client/src/pages/overview/cards/RecentServeCard.tsx new file mode 100644 index 000000000000..960cd8738006 --- /dev/null +++ b/dashboard/client/src/pages/overview/cards/RecentServeCard.tsx @@ -0,0 +1,53 @@ +import { createStyles, makeStyles } from "@material-ui/core"; +import _ from "lodash"; +import React from "react"; +import { ServeStatusIcon } from "../../../common/ServeStatus"; +import { ListItemCard } from "../../../components/ListItemCard"; +import { useServeApplications } from "../../serve/hook/useServeApplications"; + +const useStyles = makeStyles((theme) => + createStyles({ + icon: { + marginRight: theme.spacing(1), + }, + }), +); + +type RecentServeCardProps = { + className?: string; +}; + +export const RecentServeCard = ({ className }: RecentServeCardProps) => { + const classes = useStyles(); + + // Use mock data by uncommenting the following line + // const applications = mockServeApplications.applications; + const { allServeApplications: applications } = useServeApplications(); + + const sortedApplications = _.orderBy( + applications, + ["last_deployed_time_s"], + ["desc"], + ).slice(0, 6); + + const sortedApplicationsToRender = 
sortedApplications.map((app) => { + return { + title: app.name, + subtitle: app?.deployed_app_config?.import_path || "-", + link: app.name ? `/serve/applications/${app.name}` : undefined, + className: className, + icon: , + }; + }); + + return ( + + ); +}; diff --git a/dashboard/client/src/pages/serve/ServeApplicationDetailPage.tsx b/dashboard/client/src/pages/serve/ServeApplicationDetailPage.tsx index 3febd2a31856..a3245b44b49f 100644 --- a/dashboard/client/src/pages/serve/ServeApplicationDetailPage.tsx +++ b/dashboard/client/src/pages/serve/ServeApplicationDetailPage.tsx @@ -105,13 +105,13 @@ export const ServeApplicationDetailPage = () => { content: { value: Object.values(application.deployments) .map(({ replicas }) => replicas.length) - .reduce((acc, curr) => acc + curr) + .reduce((acc, curr) => acc + curr, 0) .toString(), }, }, { label: "Application config", - content: ( + content: application.deployed_app_config ? ( { } code={application.deployed_app_config} /> + ) : ( + - ), }, { @@ -138,6 +140,12 @@ export const ServeApplicationDetailPage = () => { /> ), }, + { + label: "Import path", + content: { + value: application?.deployed_app_config?.import_path || "-", + }, + }, ]} /> diff --git a/dashboard/client/src/pages/serve/ServeApplicationRow.tsx b/dashboard/client/src/pages/serve/ServeApplicationRow.tsx index d3cf37a24eec..54d06cd964ca 100644 --- a/dashboard/client/src/pages/serve/ServeApplicationRow.tsx +++ b/dashboard/client/src/pages/serve/ServeApplicationRow.tsx @@ -54,10 +54,16 @@ export const ServeApplicationRow = ({ - + {deployed_app_config ? 
( + + ) : ( + "-" + )} ); diff --git a/dashboard/client/src/pages/serve/ServeApplicationsListPage.component.test.tsx b/dashboard/client/src/pages/serve/ServeApplicationsListPage.component.test.tsx index 43b2ca9b464b..57684b20e55e 100644 --- a/dashboard/client/src/pages/serve/ServeApplicationsListPage.component.test.tsx +++ b/dashboard/client/src/pages/serve/ServeApplicationsListPage.component.test.tsx @@ -1,22 +1,51 @@ import { render, screen } from "@testing-library/react"; import userEvent from "@testing-library/user-event"; import React from "react"; +import { getActor } from "../../service/actor"; import { getServeApplications } from "../../service/serve"; -import { ServeApplicationStatus, ServeDeploymentMode } from "../../type/serve"; +import { + ServeApplicationStatus, + ServeDeploymentMode, + ServeSystemActorStatus, +} from "../../type/serve"; import { TEST_APP_WRAPPER } from "../../util/test-utils"; import { ServeApplicationsListPage } from "./ServeApplicationsListPage"; +jest.mock("../../service/actor"); jest.mock("../../service/serve"); const mockGetServeApplications = jest.mocked(getServeApplications); +const mockGetActor = jest.mocked(getActor); describe("ServeApplicationsListPage", () => { it("renders list", async () => { - expect.assertions(11); + expect.assertions(15); + + // Mock ServeController actor fetch + mockGetActor.mockResolvedValue({ + data: { + data: { + detail: { + state: "ALIVE", + }, + }, + }, + } as any); mockGetServeApplications.mockResolvedValue({ data: { http_options: { host: "1.2.3.4", port: 8000 }, + http_proxies: { + foo: { + node_id: "node:12345", + status: ServeSystemActorStatus.STARTING, + actor_id: "actor:12345", + }, + }, + controller_info: { + node_id: "node:12345", + actor_id: "actor:12345", + }, proxy_location: ServeDeploymentMode.EveryNode, applications: { home: { @@ -54,11 +83,19 @@ describe("ServeApplicationsListPage", () => { const user = userEvent.setup(); - await screen.findByText("Config"); - 
expect(screen.getByText("Config")).toBeVisible(); + await screen.findByText("System"); + expect(screen.getByText("System")).toBeVisible(); expect(screen.getByText("1.2.3.4")).toBeVisible(); expect(screen.getByText("8000")).toBeVisible(); + // HTTP Proxy row + expect(screen.getByText("HTTPProxyActor:node:12345")).toBeVisible(); + expect(screen.getByText("STARTING")).toBeVisible(); + + // Serve Controller row + expect(screen.getByText("Serve Controller")).toBeVisible(); + expect(screen.getByText("HEALTHY")).toBeVisible(); + // First row expect(screen.getByText("home")).toBeVisible(); expect(screen.getByText("/")).toBeVisible(); diff --git a/dashboard/client/src/pages/serve/ServeApplicationsListPage.tsx b/dashboard/client/src/pages/serve/ServeApplicationsListPage.tsx index acd835b2c10b..c2523bf32cba 100644 --- a/dashboard/client/src/pages/serve/ServeApplicationsListPage.tsx +++ b/dashboard/client/src/pages/serve/ServeApplicationsListPage.tsx @@ -17,11 +17,11 @@ import { Alert, Autocomplete, Pagination } from "@material-ui/lab"; import React, { ReactElement } from "react"; import { CollapsibleSection } from "../../common/CollapsibleSection"; import Loading from "../../components/Loading"; -import { MetadataSection } from "../../components/MetadataSection"; import { HelpInfo } from "../../components/Tooltip"; import { useServeApplications } from "./hook/useServeApplications"; import { ServeApplicationRow } from "./ServeApplicationRow"; import { ServeMetricsSection } from "./ServeMetricsSection"; +import { ServeSystemDetails } from "./ServeSystemDetails"; const useStyles = makeStyles((theme) => createStyles({ @@ -34,6 +34,9 @@ const useStyles = makeStyles((theme) => helpInfo: { marginLeft: theme.spacing(1), }, + applicationsSection: { + marginTop: theme.spacing(4), + }, metricsSection: { marginTop: theme.spacing(4), }, @@ -56,10 +59,13 @@ export const ServeApplicationsListPage = () => { const { serveDetails, filteredServeApplications, + httpProxies, error, 
allServeApplications, page, setPage, + httpProxiesPage, + setHttpProxiesPage, changeFilter, } = useServeApplications(); @@ -73,132 +79,121 @@ export const ServeApplicationsListPage = () => { return (
- - {serveDetails.host && serveDetails.port ? ( - + Serve not started. Please deploy a serve application first. + + ) : ( + + - ) : ( - - Serve not started. Please deploy a serve application first. - - )} - - - -
- (e.name ? e.name : "-")), - ), - )} - onInputChange={(_: any, value: string) => { - changeFilter("name", value.trim() !== "-" ? value.trim() : ""); - }} - renderInput={(params: TextFieldProps) => ( - - )} - /> - e.status)), - )} - onInputChange={(_: any, value: string) => { - changeFilter("status", value.trim()); - }} - renderInput={(params: TextFieldProps) => ( - - )} - /> - { - setPage("pageSize", Math.min(Number(value), 500) || 10); - }, - endAdornment: ( - Per Page - ), - }} - /> -
-
- setPage("pageNo", pageNo)} - /> -
-
- - - {columns.map(({ label, helpInfo, width }) => ( - - - {label} - {helpInfo && ( - - {helpInfo} - - )} - - - ))} - - - - {filteredServeApplications - .slice( - (page.pageNo - 1) * page.pageSize, - page.pageNo * page.pageSize, - ) - .map((application) => ( - - ))} - -
- - + + +
+ (e.name ? e.name : "-")), + ), + )} + onInputChange={(_: any, value: string) => { + changeFilter( + "name", + value.trim() !== "-" ? value.trim() : "", + ); + }} + renderInput={(params: TextFieldProps) => ( + + )} + /> + e.status)), + )} + onInputChange={(_: any, value: string) => { + changeFilter("status", value.trim()); + }} + renderInput={(params: TextFieldProps) => ( + + )} + /> + { + setPage("pageSize", Math.min(Number(value), 500) || 10); + }, + endAdornment: ( + Per Page + ), + }} + /> +
+
+ setPage("pageNo", pageNo)} + /> +
+ + + + {columns.map(({ label, helpInfo, width }) => ( + + + {label} + {helpInfo && ( + + {helpInfo} + + )} + + + ))} + + + + {filteredServeApplications + .slice( + (page.pageNo - 1) * page.pageSize, + page.pageNo * page.pageSize, + ) + .map((application) => ( + + ))} + +
+
+
+ + )}
); diff --git a/dashboard/client/src/pages/serve/ServeDeploymentRow.tsx b/dashboard/client/src/pages/serve/ServeDeploymentRow.tsx index 24e9386b406b..9c749b8bf7d7 100644 --- a/dashboard/client/src/pages/serve/ServeDeploymentRow.tsx +++ b/dashboard/client/src/pages/serve/ServeDeploymentRow.tsx @@ -22,7 +22,6 @@ import { ServeReplica, } from "../../type/serve"; import { useViewServeDeploymentMetricsButtonUrl } from "./ServeDeploymentMetricsSection"; -import { ServeReplicaLogsLink } from "./ServeReplicaDetailPage"; const useStyles = makeStyles((theme) => createStyles({ @@ -144,18 +143,24 @@ export const ServeReplicaRow = ({ - {replica_id} - + - - + + Log + {metricsUrl && (
diff --git a/dashboard/client/src/pages/serve/ServeReplicaDetailPage.tsx b/dashboard/client/src/pages/serve/ServeReplicaDetailPage.tsx index be40900f8625..d1572729f26e 100644 --- a/dashboard/client/src/pages/serve/ServeReplicaDetailPage.tsx +++ b/dashboard/client/src/pages/serve/ServeReplicaDetailPage.tsx @@ -1,16 +1,20 @@ -import { createStyles, Link, makeStyles, Typography } from "@material-ui/core"; -import React, { useContext } from "react"; -import { Link as RouterLink, useParams } from "react-router-dom"; -import { GlobalContext } from "../../App"; +import { createStyles, makeStyles, Typography } from "@material-ui/core"; +import React from "react"; +import { useParams } from "react-router-dom"; import { CodeDialogButton } from "../../common/CodeDialogButton"; import { CollapsibleSection } from "../../common/CollapsibleSection"; import { DurationText } from "../../common/DurationText"; import { formatDateFromTimeMs } from "../../common/formatUtils"; import { generateActorLink, generateNodeLink } from "../../common/links"; +import { + MultiTabLogViewer, + MultiTabLogViewerTabDetails, +} from "../../common/MultiTabLogViewer"; +import { Section } from "../../common/Section"; import Loading from "../../components/Loading"; import { MetadataSection } from "../../components/MetadataSection"; import { StatusChip } from "../../components/StatusChip"; -import { ServeDeployment, ServeReplica } from "../../type/serve"; +import { ServeReplica } from "../../type/serve"; import { MainNavPageInfo } from "../layout/mainNavContext"; import TaskList from "../state/task"; import { useServeReplicaDetails } from "./hook/useServeApplications"; @@ -83,12 +87,6 @@ export const ServeReplicaDetailPage = () => { label: "State", content: , }, - { - label: "Logs", - content: ( - - ), - }, { label: "Actor ID", content: { @@ -147,6 +145,11 @@ export const ServeReplicaDetailPage = () => { }, ]} /> + +
+ +
+
{ ); }; -export type ServeReplicaLogsLinkProps = { - replica: ServeReplica; - deployment: ServeDeployment; +type ServeReplicaLogsProps = { + replica: Pick; }; -export const ServeReplicaLogsLink = ({ - replica: { replica_id, node_ip }, - deployment: { name: deploymentName }, -}: ServeReplicaLogsLinkProps) => { - const { ipLogMap } = useContext(GlobalContext); - - let link: string | undefined; - - if (node_ip && ipLogMap[node_ip]) { - // TODO(aguo): Clean up this logic after re-writing the log viewer - const logsRoot = ipLogMap[node_ip].endsWith("/logs") - ? ipLogMap[node_ip].substring( - 0, - ipLogMap[node_ip].length - "/logs".length, - ) - : ipLogMap[node_ip]; - // TODO(aguo): Have API return the location of the logs. - const path = `/logs/serve/deployment_${deploymentName}_${replica_id}.log`; - link = `/logs/${encodeURIComponent(logsRoot)}/${encodeURIComponent(path)}`; - } - - if (link) { - return ( - - Log - - ); - } - - return -; +const ServeReplicaLogs = ({ + replica: { log_file_path, node_id, actor_id }, +}: ServeReplicaLogsProps) => { + const tabs: MultiTabLogViewerTabDetails[] = [ + ...(log_file_path + ? [ + { + title: "replica", + nodeId: node_id, + filename: log_file_path.startsWith("/") + ? 
log_file_path.substring(1) + : log_file_path, + }, + ] + : []), + ]; + return ; }; diff --git a/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx b/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx new file mode 100644 index 000000000000..228ac278a7d2 --- /dev/null +++ b/dashboard/client/src/pages/serve/ServeSystemActorDetailPage.tsx @@ -0,0 +1,255 @@ +import { Typography } from "@material-ui/core"; +import React from "react"; +import { useParams } from "react-router-dom"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; +import { generateActorLink, generateNodeLink } from "../../common/links"; +import { + MultiTabLogViewer, + MultiTabLogViewerTabDetails, +} from "../../common/MultiTabLogViewer"; +import { Section } from "../../common/Section"; +import Loading from "../../components/Loading"; +import { MetadataSection } from "../../components/MetadataSection"; +import { StatusChip } from "../../components/StatusChip"; +import { ActorDetail, ActorEnum } from "../../type/actor"; +import { + ServeHttpProxy, + ServeSystemActor, + ServeSystemActorStatus, +} from "../../type/serve"; +import { useFetchActor } from "../actor/hook/useActorDetail"; +import { MainNavPageInfo } from "../layout/mainNavContext"; +import { + useServeControllerDetails, + useServeHTTPProxyDetails, +} from "./hook/useServeApplications"; + +export const ServeHttpProxyDetailPage = () => { + const { httpProxyId } = useParams(); + + const { httpProxy, loading } = useServeHTTPProxyDetails(httpProxyId); + + if (loading) { + return ; + } + + if (!httpProxy) { + return ( + + HTTPProxyActor with id "{httpProxyId}" not found. + + ); + } + + return ( +
+ + +
+ ); +}; + +export const ServeControllerDetailPage = () => { + const { controller, loading } = useServeControllerDetails(); + + if (loading) { + return ; + } + + if (!controller) { + return Serve controller not found.; + } + + return ( +
+ + +
+ ); +}; + +type ActorInfo = + | { + type: "httpProxy"; + detail: ServeHttpProxy; + } + | { + type: "controller"; + detail: ServeSystemActor; + }; + +type ServeSystemActorDetailProps = { + actor: ActorInfo; +}; + +export const convertActorStateForServeController = ( + actorState: ActorEnum | string, +) => { + if (actorState === ActorEnum.ALIVE) { + return ServeSystemActorStatus.HEALTHY; + } else if (actorState === ActorEnum.DEAD) { + return ServeSystemActorStatus.UNHEALTHY; + } else { + return ServeSystemActorStatus.STARTING; + } +}; + +export const ServeSystemActorDetail = ({ + actor, +}: ServeSystemActorDetailProps) => { + const name = + actor.type === "httpProxy" + ? `HTTPProxyActor:${actor.detail.actor_id}` + : "Serve Controller"; + + const { data: fetchedActor } = useFetchActor(actor.detail.actor_id); + + return ( +
+ + ) : fetchedActor ? ( + + ) : ( + { + value: "-", + } + ), + }, + { + label: "Actor ID", + content: actor.detail.actor_id + ? { + value: actor.detail.actor_id, + copyableValue: actor.detail.actor_id, + link: actor.detail.actor_id + ? generateActorLink(actor.detail.actor_id) + : undefined, + } + : { + value: "-", + }, + }, + { + label: "Actor name", + content: { + value: actor.detail.actor_name ? actor.detail.actor_name : "-", + }, + }, + { + label: "Worker ID", + content: actor.detail.worker_id + ? { + value: actor.detail.worker_id, + copyableValue: actor.detail.worker_id, + } + : { + value: "-", + }, + }, + { + label: "Node ID", + content: actor.detail.node_id + ? { + value: actor.detail.node_id, + copyableValue: actor.detail.node_id, + link: actor.detail.node_id + ? generateNodeLink(actor.detail.node_id) + : undefined, + } + : { + value: "-", + }, + }, + { + label: "Node IP", + content: { + value: actor.detail.node_ip ? actor.detail.node_ip : "-", + }, + }, + ]} + /> + {fetchedActor && actor.detail.log_file_path && ( + +
+ +
+
+ )} +
+ ); +}; + +type ServeSystemActorLogsProps = { + type: "controller" | "httpProxy"; + actor: Pick; + systemLogFilePath: string; +}; + +const ServeSystemActorLogs = ({ + type, + actor: { + actorId, + pid, + address: { workerId, rayletId }, + }, + systemLogFilePath, +}: ServeSystemActorLogsProps) => { + const tabs: MultiTabLogViewerTabDetails[] = [ + { + title: type === "controller" ? "Controller logs" : "HTTP proxy logs", + nodeId: rayletId, + filename: systemLogFilePath.startsWith("/") + ? systemLogFilePath.substring(1) + : systemLogFilePath, + }, + ]; + return ; +}; diff --git a/dashboard/client/src/pages/serve/ServeSystemDetailRows.tsx b/dashboard/client/src/pages/serve/ServeSystemDetailRows.tsx new file mode 100644 index 000000000000..f641499adb9a --- /dev/null +++ b/dashboard/client/src/pages/serve/ServeSystemDetailRows.tsx @@ -0,0 +1,135 @@ +import { + createStyles, + Link, + makeStyles, + TableCell, + TableRow, + Tooltip, +} from "@material-ui/core"; +import React from "react"; +import { Link as RouterLink } from "react-router-dom"; +import { StatusChip } from "../../components/StatusChip"; +import { ServeHttpProxy, ServeSystemActor } from "../../type/serve"; +import { useFetchActor } from "../actor/hook/useActorDetail"; +import { convertActorStateForServeController } from "./ServeSystemActorDetailPage"; + +const useStyles = makeStyles((theme) => + createStyles({ + idCol: { + display: "inline-block", + width: "50px", + overflow: "hidden", + textOverflow: "ellipsis", + whiteSpace: "nowrap", + verticalAlign: "bottom", + }, + }), +); + +export type ServeHttpProxyRowProps = { + httpProxy: ServeHttpProxy; +}; + +export const ServeHttpProxyRow = ({ httpProxy }: ServeHttpProxyRowProps) => { + const { status } = httpProxy; + + return ( + } + /> + ); +}; + +export type ServeControllerRowProps = { + controller: ServeSystemActor; +}; + +export const ServeControllerRow = ({ controller }: ServeControllerRowProps) => { + const { data: actor } = 
useFetchActor(controller.actor_id); + + const status = actor?.state; + + return ( + + ) : ( + "-" + ) + } + /> + ); +}; + +type ServeSystemActorRowProps = { + actor: ServeSystemActor; + type: "controller" | "httpProxy"; + status: React.ReactNode; +}; + +const ServeSystemActorRow = ({ + actor, + type, + status, +}: ServeSystemActorRowProps) => { + const { node_id, actor_id } = actor; + const classes = useStyles(); + + return ( + + + {type === "httpProxy" ? ( + + HTTPProxyActor:{node_id} + + ) : ( + + Serve Controller + + )} + + {status} + + {type === "httpProxy" ? ( + + Log + + ) : ( + + Log + + )} + + + {node_id ? ( + + + {node_id} + + + ) : ( + "-" + )} + + + {actor_id ? ( + + + {actor_id} + + + ) : ( + "-" + )} + + + ); +}; diff --git a/dashboard/client/src/pages/serve/ServeSystemDetails.tsx b/dashboard/client/src/pages/serve/ServeSystemDetails.tsx new file mode 100644 index 000000000000..d709b73945da --- /dev/null +++ b/dashboard/client/src/pages/serve/ServeSystemDetails.tsx @@ -0,0 +1,152 @@ +import { + Box, + createStyles, + makeStyles, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, +} from "@material-ui/core"; +import { Pagination } from "@material-ui/lab"; +import React, { ReactElement } from "react"; +import { RiErrorWarningFill } from "react-icons/ri"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; +import { MetadataSection } from "../../components/MetadataSection"; +import { HelpInfo } from "../../components/Tooltip"; +import { ServeApplicationsRsp, ServeHttpProxy } from "../../type/serve"; +import { ServeControllerRow, ServeHttpProxyRow } from "./ServeSystemDetailRows"; + +const useStyles = makeStyles((theme) => + createStyles({ + table: {}, + helpInfo: { + marginLeft: theme.spacing(1), + }, + errorIcon: { + color: theme.palette.error.main, + width: 20, + height: 20, + }, + }), +); + +export type ServeDetails = Pick< + ServeApplicationsRsp, + "http_options" | "proxy_location" | "controller_info" +>; 
+ +type ServeSystemDetailsProps = { + serveDetails: ServeDetails; + httpProxies: ServeHttpProxy[]; + page: { pageSize: number; pageNo: number }; + setPage: (key: string, value: number) => void; +}; + +const columns: { label: string; helpInfo?: ReactElement; width?: string }[] = [ + { label: "Name" }, + { label: "Status" }, + { label: "Actions" }, + { label: "Node ID" }, + { label: "Actor ID" }, +]; + +export const ServeSystemDetails = ({ + serveDetails, + httpProxies, + page, + setPage, +}: ServeSystemDetailsProps) => { + const classes = useStyles(); + + const isUnhealthy = httpProxies.some(({ status }) => status === "UNHEALTHY"); + + return ( + + ) : undefined + } + > + {serveDetails.http_options && ( + + )} + +
+ setPage("pageNo", pageNo)} + /> +
+ + + + {columns.map(({ label, helpInfo, width }) => ( + + + {label} + {helpInfo && ( + + {helpInfo} + + )} + + + ))} + + + + + {httpProxies + .slice( + (page.pageNo - 1) * page.pageSize, + page.pageNo * page.pageSize, + ) + .map((httpProxy) => ( + + ))} + +
+
+
+ ); +}; diff --git a/dashboard/client/src/pages/serve/hook/useServeApplications.ts b/dashboard/client/src/pages/serve/hook/useServeApplications.ts index 4ab54eb23158..1f7fdfb759ca 100644 --- a/dashboard/client/src/pages/serve/hook/useServeApplications.ts +++ b/dashboard/client/src/pages/serve/hook/useServeApplications.ts @@ -3,6 +3,17 @@ import useSWR from "swr"; import { GlobalContext } from "../../../App"; import { API_REFRESH_INTERVAL_MS } from "../../../common/constants"; import { getServeApplications } from "../../../service/serve"; +import { ServeSystemActorStatus } from "../../../type/serve"; +import { ServeDetails } from "../ServeSystemDetails"; + +const SERVE_HTTP_PROXY_STATUS_SORT_ORDER: Record< + ServeSystemActorStatus, + number +> = { + [ServeSystemActorStatus.UNHEALTHY]: 0, + [ServeSystemActorStatus.STARTING]: 1, + [ServeSystemActorStatus.HEALTHY]: 2, +}; export const useServeApplications = () => { const [page, setPage] = useState({ pageSize: 10, pageNo: 1 }); @@ -23,6 +34,11 @@ export const useServeApplications = () => { setFilter([...filter]); }; + const [httpProxiesPage, setHttpProxiesPage] = useState({ + pageSize: 10, + pageNo: 1, + }); + const { data, error } = useSWR( "useServeApplications", async () => { @@ -35,8 +51,12 @@ export const useServeApplications = () => { { refreshInterval: API_REFRESH_INTERVAL_MS }, ); - const serveDetails = data - ? { ...data.http_options, proxy_location: data.proxy_location } + const serveDetails: ServeDetails | undefined = data + ? { + http_options: data.http_options, + proxy_location: data.proxy_location, + controller_info: data.controller_info, + } : undefined; const serveApplicationsList = data ? Object.values(data.applications).sort( @@ -44,6 +64,15 @@ export const useServeApplications = () => { ) : []; + const httpProxies = + data && data.http_proxies + ? 
Object.values(data.http_proxies).sort( + (a, b) => + SERVE_HTTP_PROXY_STATUS_SORT_ORDER[b.status] - + SERVE_HTTP_PROXY_STATUS_SORT_ORDER[a.status], + ) + : []; + return { serveDetails, filteredServeApplications: serveApplicationsList.filter((app) => @@ -51,10 +80,14 @@ export const useServeApplications = () => { f.val ? app[f.key] && (app[f.key] ?? "").includes(f.val) : true, ), ), + httpProxies, error, changeFilter, page, setPage: (key: string, val: number) => setPage({ ...page, [key]: val }), + httpProxiesPage, + setHttpProxiesPage: (key: string, val: number) => + setHttpProxiesPage({ ...httpProxiesPage, [key]: val }), ipLogMap, allServeApplications: serveApplicationsList, }; @@ -162,3 +195,49 @@ export const useServeReplicaDetails = ( error, }; }; + +export const useServeHTTPProxyDetails = (httpProxyId: string | undefined) => { + const { data, error, isLoading } = useSWR( + "useServeHTTPProxyDetails", + async () => { + const rsp = await getServeApplications(); + + if (rsp) { + return rsp.data; + } + }, + { refreshInterval: API_REFRESH_INTERVAL_MS }, + ); + + const httpProxy = httpProxyId ? data?.http_proxies?.[httpProxyId] : undefined; + + // Need to expose loading because it's not clear if undefined values + // for http proxies means loading or missing data. + return { + loading: isLoading, + httpProxy, + error, + }; +}; + +export const useServeControllerDetails = () => { + const { data, error, isLoading } = useSWR( + "useServeControllerDetails", + async () => { + const rsp = await getServeApplications(); + + if (rsp) { + return rsp.data; + } + }, + { refreshInterval: API_REFRESH_INTERVAL_MS }, + ); + + // Need to expose loading because it's not clear if undefined values + // for serve controller means loading or missing data. 
+ return { + loading: isLoading, + controller: data?.controller_info, + error, + }; +}; diff --git a/dashboard/client/src/pages/serve/mockServeApplication.ts b/dashboard/client/src/pages/serve/mockServeApplication.ts new file mode 100644 index 000000000000..5ed996d2829f --- /dev/null +++ b/dashboard/client/src/pages/serve/mockServeApplication.ts @@ -0,0 +1,63 @@ +import { ServeApplicationStatus } from "../../type/serve"; + +export const mockServeApplications = { + applications: { + app1: { + name: "app1", + route_prefix: "/app1", + message: null, + status: ServeApplicationStatus.RUNNING, + deployed_app_config: { + import_path: "app1:graph", + }, + last_deployed_time_s: new Date().getTime() / 1000, + }, + app2: { + name: "app2", + route_prefix: "/app2", + message: null, + status: ServeApplicationStatus.RUNNING, + deployed_app_config: null, + last_deployed_time_s: new Date().getTime() / 1000, + deployments: {}, + }, + app3: { + name: "app3", + route_prefix: "/app3", + message: null, + status: ServeApplicationStatus.DEPLOYING, + deployed_app_config: null, + last_deployed_time_s: new Date().getTime() / 1000, + deployments: {}, + }, + app4: { + name: "app4", + route_prefix: "/app4", + message: null, + status: ServeApplicationStatus.RUNNING, + deployed_app_config: { + import_path: "app4:graph", + }, + last_deployed_time_s: new Date().getTime() / 1000, + }, + app5: { + name: "app5", + route_prefix: "/app5", + message: null, + status: ServeApplicationStatus.DEPLOY_FAILED, + deployed_app_config: { + import_path: "app5:graph", + }, + last_deployed_time_s: new Date().getTime() / 1000, + }, + app6: { + name: "app6", + route_prefix: "/app6", + message: null, + status: ServeApplicationStatus.DELETING, + deployed_app_config: null, + last_deployed_time_s: new Date().getTime() / 1000, + deployments: {}, + }, + }, +}; diff --git a/dashboard/client/src/pages/state/hook/mockedPlacementGroup.ts b/dashboard/client/src/pages/state/hook/mockedPlacementGroup.ts new file mode 100644 index 
000000000000..68723a806902 --- /dev/null +++ b/dashboard/client/src/pages/state/hook/mockedPlacementGroup.ts @@ -0,0 +1,61 @@ +export const bundles = [ + { + bundle_id: "bundle-1", + node_id: "node-1", + unit_resources: { + cpu: 4, + memory: 8192, + }, + }, + { + bundle_id: "bundle-2", + node_id: null, + unit_resources: { + cpu: 2, + memory: 4096, + }, + }, + { + bundle_id: "bundle-3", + node_id: "node-2", + unit_resources: { + cpu: 8, + memory: 16384, + }, + }, +]; + +export const mockData = [ + { + placement_group_id: "pg-123456789", + name: "MyPlacementGroup", + creator_job_id: "job-987654321", + state: "CREATED", + stats: null, + bundles, + }, + { + placement_group_id: "pg-123456789", + name: "MyPlacementGroup", + creator_job_id: "job-987654321", + state: "REMOVED", + stats: null, + bundles, + }, + { + placement_group_id: "pg-123456789", + name: "MyPlacementGroup", + creator_job_id: "job-987654321", + state: "RESCHEDULING", + stats: null, + bundles, + }, + { + placement_group_id: "pg-123456789", + name: "MyPlacementGroup", + creator_job_id: "job-987654321", + state: "PENDING", + stats: null, + bundles, + }, +]; diff --git a/dashboard/client/src/pages/state/hook/useStateApi.ts b/dashboard/client/src/pages/state/hook/useStateApi.ts index d8ec1187784b..3ee375e4785a 100644 --- a/dashboard/client/src/pages/state/hook/useStateApi.ts +++ b/dashboard/client/src/pages/state/hook/useStateApi.ts @@ -1,6 +1,7 @@ import { AxiosResponse } from "axios"; import useSWR, { Key } from "swr"; import { PER_JOB_PAGE_REFRESH_INTERVAL_MS } from "../../../common/constants"; +import { getTask } from "../../../service/task"; import { AsyncFunction, StateApiResponse, @@ -29,3 +30,23 @@ export const useStateApiList = ( return data; }; + +export const useStateApiTask = (taskId: string | undefined) => { + const { data, isLoading } = useSWR( + taskId ? 
["useStateApiTask", taskId] : null, + async ([_, taskId]) => { + const rsp = await getTask(taskId); + if (rsp?.data?.data?.result?.result) { + return rsp.data.data.result.result[0]; + } else { + return undefined; + } + }, + { refreshInterval: PER_JOB_PAGE_REFRESH_INTERVAL_MS }, + ); + + return { + task: data, + isLoading, + }; +}; diff --git a/dashboard/client/src/pages/task/TaskPage.tsx b/dashboard/client/src/pages/task/TaskPage.tsx new file mode 100644 index 000000000000..619dc5a2ead6 --- /dev/null +++ b/dashboard/client/src/pages/task/TaskPage.tsx @@ -0,0 +1,270 @@ +import { Box, createStyles, makeStyles, Typography } from "@material-ui/core"; +import React from "react"; +import { useParams } from "react-router-dom"; +import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton"; +import { CollapsibleSection } from "../../common/CollapsibleSection"; +import { DurationText } from "../../common/DurationText"; +import { formatDateFromTimeMs } from "../../common/formatUtils"; +import { generateActorLink, generateNodeLink } from "../../common/links"; +import { + MultiTabLogViewer, + MultiTabLogViewerTabDetails, +} from "../../common/MultiTabLogViewer"; +import { Section } from "../../common/Section"; +import Loading from "../../components/Loading"; +import { MetadataSection } from "../../components/MetadataSection"; +import { StatusChip } from "../../components/StatusChip"; +import { Task } from "../../type/task"; +import { MainNavPageInfo } from "../layout/mainNavContext"; +import { useStateApiTask } from "../state/hook/useStateApi"; + +const useStyles = makeStyles((theme) => + createStyles({ + root: { + padding: theme.spacing(2), + backgroundColor: "white", + }, + }), +); + +export const TaskPage = () => { + const { taskId } = useParams(); + const { task, isLoading } = useStateApiTask(taskId); + + const classes = useStyles(); + + return ( +
+ + +
+ ); +}; + +type TaskPageContentsProps = { + taskId?: string; + task?: Task; + isLoading: boolean; +}; + +const TaskPageContents = ({ + taskId, + task, + isLoading, +}: TaskPageContentsProps) => { + if (isLoading) { + return ; + } + + if (!task) { + return ( + Task with ID "{taskId}" not found. + ); + } + + const { + task_id, + actor_id, + end_time_ms, + start_time_ms, + node_id, + placement_group_id, + required_resources, + state, + type, + worker_id, + job_id, + func_or_class_name, + name, + } = task; + + return ( +
+ , + }, + { + label: "Job ID", + content: { + value: job_id, + copyableValue: job_id, + }, + }, + { + label: "Function or class name", + content: { + value: func_or_class_name, + }, + }, + { + label: "Actor ID", + content: actor_id + ? { + value: actor_id, + copyableValue: actor_id, + link: generateActorLink(actor_id), + } + : { + value: "-", + }, + }, + { + label: "Node ID", + content: node_id + ? { + value: node_id, + copyableValue: node_id, + link: generateNodeLink(node_id), + } + : { + value: "-", + }, + }, + { + label: "Worker ID", + content: worker_id + ? { + value: worker_id, + copyableValue: worker_id, + } + : { + value: "-", + }, + }, + { + label: "Type", + content: { + value: type, + }, + }, + { + label: "Placement group ID", + content: placement_group_id + ? { + value: placement_group_id, + copyableValue: placement_group_id, + } + : { + value: "-", + }, + }, + { + label: "Required resources", + content: + Object.entries(required_resources).length > 0 ? ( + + + + ) : ( + { + value: "{}", + } + ), + }, + { + label: "Started at", + content: { + value: start_time_ms ? formatDateFromTimeMs(start_time_ms) : "-", + }, + }, + { + label: "Ended at", + content: { + value: end_time_ms ? formatDateFromTimeMs(end_time_ms) : "-", + }, + }, + { + label: "Duration", + content: start_time_ms ? ( + + ) : ( + { + value: "-", + } + ), + }, + ]} + /> + +
+ +
+
+
+ ); +}; + +type TaskLogsProps = { + task: Task; +}; + +const TaskLogs = ({ + task: { task_id, error_message, error_type, worker_id, node_id }, +}: TaskLogsProps) => { + const errorDetails = + error_type !== null && error_message !== null + ? `Error Type: ${error_type}\n\n${error_message}` + : undefined; + + const tabs: MultiTabLogViewerTabDetails[] = [ + ...(worker_id !== null && node_id !== null + ? ([ + { + title: "stderr", + taskId: task_id, + suffix: "err", + }, + { + title: "stdout", + taskId: task_id, + suffix: "out", + }, + ] as const) + : []), + // TODO(aguo): uncomment once PID is available in the API. + // { + // title: "system", + // nodeId: node_id, + // // TODO(aguo): Have API return the log file name. + // filename: `python-core-worker-${worker_id}_${pid}.log`, + // }, + ...(errorDetails + ? [{ title: "Error stack trace", contents: errorDetails }] + : []), + ]; + return ; +}; diff --git a/dashboard/client/src/service/log.ts b/dashboard/client/src/service/log.ts index d301254c1b37..51ca4902b59b 100644 --- a/dashboard/client/src/service/log.ts +++ b/dashboard/client/src/service/log.ts @@ -49,3 +49,69 @@ export const getLogDetail = async (url: string) => { return rsp.data as string; }; + +export type StateApiLogInput = { + nodeId?: string | null; + /** + * If actorId is provided, nodeId is not necessary + */ + actorId?: string | null; + /** + * If taskId is provided, nodeId is not necessary + */ + taskId?: string | null; + suffix?: string; + /** + * If filename is provided, suffix is not necessary + */ + filename?: string | null; +}; + +export const getStateApiDownloadLogUrl = ({ + nodeId, + filename, + taskId, + actorId, + suffix, +}: StateApiLogInput) => { + if ( + nodeId === null || + actorId === null || + taskId === null || + filename === null + ) { + // Null means data is not ready yet. + return null; + } + const variables = [ + ...(nodeId !== undefined ? [`node_id=${encodeURIComponent(nodeId)}`] : []), + ...(filename !== undefined + ? 
[`filename=${encodeURIComponent(filename)}`] + : []), + ...(taskId !== undefined ? [`task_id=${encodeURIComponent(taskId)}`] : []), + ...(actorId !== undefined + ? [`actor_id=${encodeURIComponent(actorId)}`] + : []), + ...(suffix !== undefined ? [`suffix=${encodeURIComponent(suffix)}`] : []), + "lines=-1", + ]; + + return `api/v0/logs/file?${variables.join("&")}`; +}; + +export const getStateApiLog = async (props: StateApiLogInput) => { + const url = getStateApiDownloadLogUrl(props); + if (url === null) { + return undefined; + } + const resp = await get(url); + // Handle case where log file is empty. + if (resp.status === 200 && resp.data.length === 0) { + return ""; + } + // TODO(aguo): get rid of this first byte check once we support state-api logs without this streaming byte. + if (resp.data[0] !== "1") { + throw new Error(resp.data.substring(1)); + } + return resp.data.substring(1); +}; diff --git a/dashboard/client/src/service/log.unit.test.ts b/dashboard/client/src/service/log.unit.test.ts new file mode 100644 index 000000000000..c7e437df9779 --- /dev/null +++ b/dashboard/client/src/service/log.unit.test.ts @@ -0,0 +1,65 @@ +import { getStateApiDownloadLogUrl } from "./log"; + +describe("getStateApiDownloadLogUrl", () => { + it("only uses parameters provided but doesn't fetch when parameters are null", () => { + expect.assertions(8); + + expect( + getStateApiDownloadLogUrl({ + nodeId: "node-id", + filename: "file.log", + }), + ).toStrictEqual( + "api/v0/logs/file?node_id=node-id&filename=file.log&lines=-1", + ); + + expect( + getStateApiDownloadLogUrl({ + taskId: "task-id", + suffix: "err", + }), + ).toStrictEqual("api/v0/logs/file?task_id=task-id&suffix=err&lines=-1"); + + expect( + getStateApiDownloadLogUrl({ + taskId: "task-id", + suffix: "out", + }), + ).toStrictEqual("api/v0/logs/file?task_id=task-id&suffix=out&lines=-1"); + + expect( + getStateApiDownloadLogUrl({ + actorId: "actor-id", + suffix: "err", + }), + 
).toStrictEqual("api/v0/logs/file?actor_id=actor-id&suffix=err&lines=-1"); + + expect( + getStateApiDownloadLogUrl({ + nodeId: null, + filename: "file.log", + }), + ).toBeNull(); + + expect( + getStateApiDownloadLogUrl({ + nodeId: null, + filename: null, + }), + ).toBeNull(); + + expect( + getStateApiDownloadLogUrl({ + taskId: null, + suffix: "err", + }), + ).toBeNull(); + + expect( + getStateApiDownloadLogUrl({ + actorId: null, + suffix: "err", + }), + ).toBeNull(); + }); +}); diff --git a/dashboard/client/src/service/task.ts b/dashboard/client/src/service/task.ts index 41a9355585ba..17441651de07 100644 --- a/dashboard/client/src/service/task.ts +++ b/dashboard/client/src/service/task.ts @@ -10,6 +10,13 @@ export const getTasks = (jobId: string | undefined) => { return get>(url); }; +export const getTask = (taskId: string) => { + const url = `api/v0/tasks?detail=1&limit=1&filter_keys=task_id&filter_predicates=%3D&filter_values=${encodeURIComponent( + taskId, + )}`; + return get>(url); +}; + export const downloadTaskTimelineHref = (jobId: string | undefined) => { let url = "/api/v0/tasks/timeline?download=1"; if (jobId) { diff --git a/dashboard/client/src/theme.ts b/dashboard/client/src/theme.ts index 6259f9b65b22..37e935117a6f 100644 --- a/dashboard/client/src/theme.ts +++ b/dashboard/client/src/theme.ts @@ -17,10 +17,12 @@ const basicTheme: ThemeOptions = { '"Segoe UI Symbol"', ].join(","), h1: { - fontSize: "2rem", + fontSize: "1.5rem", + fontWeight: 500, }, h2: { - fontSize: "1.5rem", + fontSize: "1.25rem", + fontWeight: 500, }, h3: { fontSize: "1rem", @@ -81,7 +83,7 @@ export const lightTheme = createTheme({ ...basicTheme, palette: { primary: { - main: "#538DF9", + main: "#036DCF", }, secondary: lightBlue, success: { diff --git a/dashboard/client/src/type/actor.ts b/dashboard/client/src/type/actor.ts index 7ac5274307ee..aedb5e4053df 100644 --- a/dashboard/client/src/type/actor.ts +++ b/dashboard/client/src/type/actor.ts @@ -1,9 +1,10 @@ import { GPUStats } 
from "./node"; export enum ActorEnum { + DEPENDENCIES_UNREADY = "DEPENDENCIES_UNREADY", + PENDING_CREATION = "PENDING_CREATION", ALIVE = "ALIVE", - PENDING = "PENDING", - RECONSTRUCTING = "RECONSTRUCTING", + RESTARTING = "RESTARTING", DEAD = "DEAD", } diff --git a/dashboard/client/src/type/job.d.ts b/dashboard/client/src/type/job.ts similarity index 94% rename from dashboard/client/src/type/job.d.ts rename to dashboard/client/src/type/job.ts index 4fc6632c56a7..80f697ac855d 100644 --- a/dashboard/client/src/type/job.d.ts +++ b/dashboard/client/src/type/job.ts @@ -58,11 +58,19 @@ export type JobDetail = { export type JobListRsp = UnifiedJob[]; +export enum JobStatus { + PENDING = "PENDING", + RUNNING = "RUNNING", + STOPPED = "STOPPED", + SUCCEEDED = "SUCCEEDED", + FAILED = "FAILED", +} + export type UnifiedJob = { job_id: string | null; submission_id: string | null; type: string; - status: string; + status: JobStatus; entrypoint: string; message: string | null; error_type: string | null; @@ -72,6 +80,7 @@ export type UnifiedJob = { runtime_env: { [key: string]: string } | null; driver_info: DriverInfo | null; driver_agent_http_address: string | null; + driver_node_id: string | null; }; export type DriverInfo = { diff --git a/dashboard/client/src/type/serve.ts b/dashboard/client/src/type/serve.ts index 839613ec548f..ea396b329919 100644 --- a/dashboard/client/src/type/serve.ts +++ b/dashboard/client/src/type/serve.ts @@ -14,7 +14,7 @@ export type ServeApplication = { status: ServeApplicationStatus; message: string; last_deployed_time_s: number; - deployed_app_config: Record; + deployed_app_config: Record | null; // It could be null if user did not provide deployed_app_config deployments: { [name: string]: ServeDeployment; }; @@ -74,6 +74,7 @@ export type ServeReplica = { node_id: string | null; node_ip: string | null; start_time_s: number; + log_file_path: string | null; }; // Keep in sync with DeploymentMode in python/ray/serve/config.py @@ -84,12 +85,38 @@ export 
enum ServeDeploymentMode { FixedNumber = "FixedNumber", } +// Keep in sync with HTTPProxyStatus in python/ray/serve/_private/common.py +export enum ServeSystemActorStatus { + STARTING = "STARTING", + HEALTHY = "HEALTHY", + UNHEALTHY = "UNHEALTHY", +} + +export type ServeSystemActor = { + node_id: string | null; + node_ip: string | null; + actor_id: string | null; + actor_name: string | null; + worker_id: string | null; + log_file_path: string | null; +}; + +export type ServeHttpProxy = { + status: ServeSystemActorStatus; +} & ServeSystemActor; + export type ServeApplicationsRsp = { - http_options: { - host: string; - port: number; - }; + http_options: + | { + host: string; + port: number; + } + | undefined; proxy_location: ServeDeploymentMode; + controller_info: ServeSystemActor; + http_proxies: { + [name: string]: ServeHttpProxy; + } | null; applications: { [name: string]: ServeApplication; }; diff --git a/dashboard/client/src/type/task.ts b/dashboard/client/src/type/task.ts index 168d3154f9bc..ddb65b47580e 100644 --- a/dashboard/client/src/type/task.ts +++ b/dashboard/client/src/type/task.ts @@ -27,7 +27,7 @@ export type Task = { state: TypeTaskStatus; job_id: string; node_id: string; - actor_id: string; + actor_id: string | null; placement_group_id: string | null; type: TypeTaskType; func_or_class_name: string; diff --git a/dashboard/client/src/util/converter.unit.test.ts b/dashboard/client/src/util/converter.unit.test.ts new file mode 100644 index 000000000000..a3d544c78d60 --- /dev/null +++ b/dashboard/client/src/util/converter.unit.test.ts @@ -0,0 +1,40 @@ +import { memoryConverter } from "./converter"; + +describe("memoryConverter", () => { + const table: { name: string; input: number; expected: string }[] = [ + { + name: "convert to Bytes", + input: 4, + expected: "4.0000B", + }, + { + name: "convert to KB", + input: 5 * 1024 ** 1, + expected: "5.00KB", + }, + { + name: "convert to MB", + input: 6 * 1024 ** 2, + expected: "6.00MB", + }, + { + name: "convert 
to GB", + input: 7 * 1024 ** 3, + expected: "7.00GB", + }, + { + name: "convert to TB", + input: 8 * 1024 ** 4, + expected: "8.00TB", + }, + { + name: "convert to PB", + input: 9 * 1024 ** 5, + expected: "9.00PB", + }, + ]; + + test.each(table)("$name", ({ input, expected }) => { + expect(memoryConverter(input)).toEqual(expected); + }); +}); diff --git a/dashboard/client/src/util/test-utils.tsx b/dashboard/client/src/util/test-utils.tsx index 009599636021..6ee6713d5d47 100644 --- a/dashboard/client/src/util/test-utils.tsx +++ b/dashboard/client/src/util/test-utils.tsx @@ -1,6 +1,7 @@ import { ThemeProvider } from "@material-ui/styles"; import React, { PropsWithChildren } from "react"; import { MemoryRouter } from "react-router-dom"; +import { SWRConfig } from "swr"; import { GlobalContext, GlobalContextType } from "../App"; import { lightTheme } from "../theme"; @@ -23,9 +24,14 @@ export const TEST_APP_WRAPPER = ({ children }: PropsWithChildren<{}>) => { return ( - - {children} - + {/* + Clear SWR cache between tests so that tests do impact each other. 
+ */} + new Map() }}> + + {children} + + ); }; diff --git a/dashboard/consts.py b/dashboard/consts.py index df345acced28..360a332570b2 100644 --- a/dashboard/consts.py +++ b/dashboard/consts.py @@ -25,6 +25,7 @@ PURGE_DATA_INTERVAL_SECONDS = 60 * 10 ORGANIZE_DATA_INTERVAL_SECONDS = 2 DASHBOARD_RPC_ADDRESS = "dashboard_rpc" +DASHBOARD_RPC_PORT = env_integer("RAY_DASHBOARD_RPC_PORT", 0) GCS_SERVER_ADDRESS = "GcsServerAddress" # GCS check alive GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer( diff --git a/dashboard/dashboard.py b/dashboard/dashboard.py index 55f465bf024f..273fbc4c904d 100644 --- a/dashboard/dashboard.py +++ b/dashboard/dashboard.py @@ -13,7 +13,6 @@ import ray.dashboard.consts as dashboard_consts import ray.dashboard.head as dashboard_head import ray.dashboard.utils as dashboard_utils -from ray._private.gcs_pubsub import GcsPublisher from ray._private.ray_logging import setup_component_logger from typing import Optional, Set @@ -35,6 +34,8 @@ class Dashboard: port: Port number of dashboard aiohttp server. port_retries: The retry times to select a valid port. gcs_address: GCS address of the cluster + grpc_port: Port used to listen for gRPC on. + node_ip_address: The IP address of the dashboard. serve_frontend: If configured, frontend HTML is not served from the dashboard. log_dir: Log directory of dashboard. @@ -46,6 +47,8 @@ def __init__( port: int, port_retries: int, gcs_address: str, + grpc_port: int, + node_ip_address: str, log_dir: str = None, temp_dir: str = None, session_dir: str = None, @@ -58,6 +61,8 @@ def __init__( http_port=port, http_port_retries=port_retries, gcs_address=gcs_address, + node_ip_address=node_ip_address, + grpc_port=grpc_port, log_dir=log_dir, temp_dir=temp_dir, session_dir=session_dir, @@ -88,6 +93,19 @@ async def run(self): parser.add_argument( "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS." 
) + parser.add_argument( + "--grpc-port", + required=False, + type=int, + default=dashboard_consts.DASHBOARD_RPC_PORT, + help="The port for the dashboard to listen for gRPC on.", + ) + parser.add_argument( + "--node-ip-address", + required=True, + type=str, + help="The IP address of the node where this is running.", + ) parser.add_argument( "--logging-level", required=False, @@ -200,10 +218,12 @@ async def run(self): # https://github.com/grpc/grpc/blob/master/src/python/grpcio/grpc/_cython/_cygrpc/aio/common.pyx.pxi#L174-L188 loop = ray._private.utils.get_or_create_event_loop() dashboard = Dashboard( - args.host, - args.port, - args.port_retries, - args.gcs_address, + host=args.host, + port=args.port, + port_retries=args.port_retries, + gcs_address=args.gcs_address, + grpc_port=args.grpc_port, + node_ip_address=args.node_ip_address, log_dir=args.log_dir, temp_dir=args.temp_dir, session_dir=args.session_dir, @@ -240,7 +260,7 @@ def sigterm_handler(): raise e # Something went wrong, so push an error to all drivers. - gcs_publisher = GcsPublisher(address=args.gcs_address) + gcs_publisher = ray._raylet.GcsPublisher(address=args.gcs_address) ray._private.utils.publish_error_to_driver( ray_constants.DASHBOARD_DIED_ERROR, message, diff --git a/dashboard/head.py b/dashboard/head.py index ec160bfdc2ad..85de8854639e 100644 --- a/dashboard/head.py +++ b/dashboard/head.py @@ -75,6 +75,8 @@ def __init__( http_port: int, http_port_retries: int, gcs_address: str, + node_ip_address: str, + grpc_port: int, log_dir: str, temp_dir: str, session_dir: str, @@ -94,6 +96,7 @@ def __init__( minimal: Whether or not it will load the minimal modules. serve_frontend: If configured, frontend HTML is served from the dashboard. + grpc_port: The port used to listen for gRPC on. modules_to_load: A set of module name in string to load. By default (None), it loads all available modules. 
Note that available modules could be changed depending on @@ -124,14 +127,13 @@ def __init__( self.gcs_aio_client = None self.gcs_error_subscriber = None self.gcs_log_subscriber = None - self.ip = ray.util.get_node_ip_address() + self.ip = node_ip_address DataOrganizer.head_node_ip = self.ip - ip, port = gcs_address.split(":") self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),)) grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0" self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server( - self.server, f"{grpc_ip}:0" + self.server, f"{grpc_ip}:{grpc_port}" ) logger.info("Dashboard head grpc address: %s:%s", grpc_ip, self.grpc_port) # If the dashboard is started as non-minimal version, http server should @@ -302,12 +304,21 @@ async def _async_notify(): logger.info("Initialize the http server.") self.http_server = await self._configure_http_server(modules) http_host, http_port = self.http_server.get_address() + logger.info(f"http server initialized at {http_host}:{http_port}") else: logger.info("http server disabled.") + + # We need to expose dashboard's node's ip for other worker nodes + # if it's listening to all interfaces. 
+ dashboard_http_host = ( + self.ip + if self.http_host != ray_constants.DEFAULT_DASHBOARD_IP + else http_host + ) await asyncio.gather( self.gcs_aio_client.internal_kv_put( ray_constants.DASHBOARD_ADDRESS.encode(), - f"{http_host}:{http_port}".encode(), + f"{dashboard_http_host}:{http_port}".encode(), True, namespace=ray_constants.KV_NAMESPACE_DASHBOARD, ), diff --git a/dashboard/http_server_agent.py b/dashboard/http_server_agent.py index 590da4cab2a2..9547d899a7ee 100644 --- a/dashboard/http_server_agent.py +++ b/dashboard/http_server_agent.py @@ -1,10 +1,7 @@ import logging from ray._private.utils import get_or_create_event_loop -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version +from packaging.version import Version import ray.dashboard.optional_utils as dashboard_optional_utils diff --git a/dashboard/http_server_head.py b/dashboard/http_server_head.py index 8583ff57aed5..d66de2d68746 100644 --- a/dashboard/http_server_head.py +++ b/dashboard/http_server_head.py @@ -9,10 +9,7 @@ from ray._private.utils import get_or_create_event_loop from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version +from packaging.version import Version import ray.dashboard.optional_utils as dashboard_optional_utils import ray.dashboard.utils as dashboard_utils diff --git a/dashboard/modules/dashboard_sdk.py b/dashboard/modules/dashboard_sdk.py index 946736158a51..8d7691e7ecf1 100644 --- a/dashboard/modules/dashboard_sdk.py +++ b/dashboard/modules/dashboard_sdk.py @@ -2,12 +2,14 @@ import importlib import logging import json +import os import yaml from pathlib import Path import tempfile -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from pkg_resources import packaging import ray +import ssl try: import requests @@ -202,8 +204,8 
@@ def __init__( cookies: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, Any]] = None, + verify: Optional[Union[str, bool]] = True, ): - # Remove any trailing slashes if address is not None and address.endswith("/"): address = address.rstrip("/") @@ -221,6 +223,24 @@ def __init__( # Headers used for all requests sent to job server, optional and only # needed for cases like authentication to remote cluster. self._headers = cluster_info.headers + # Set SSL verify parameter for the requests library and create an ssl_context + # object when needed for the aiohttp library. + self._verify = verify + if isinstance(self._verify, str): + if os.path.isdir(self._verify): + cafile, capath = None, self._verify + elif os.path.isfile(self._verify): + cafile, capath = self._verify, None + else: + raise FileNotFoundError( + f"Path to CA certificates: '{self._verify}', does not exist." + ) + self._ssl_context = ssl.create_default_context(cafile=cafile, capath=capath) + else: + if self._verify is False: + self._ssl_context = False + else: + self._ssl_context = None def _check_connection_and_version( self, min_version: str = "1.9", version_error_message: str = None @@ -287,6 +307,7 @@ def _do_request( data=data, json=json_data, headers=self._headers, + verify=self._verify, **kwargs, ) diff --git a/dashboard/modules/event/tests/test_event.py b/dashboard/modules/event/tests/test_event.py index c0d7e79fd230..07bae50ddea3 100644 --- a/dashboard/modules/event/tests/test_event.py +++ b/dashboard/modules/event/tests/test_event.py @@ -17,7 +17,7 @@ import numpy as np import ray -from ray.experimental.state.api import list_cluster_events +from ray.util.state import list_cluster_events from ray._private.utils import binary_to_hex from ray.cluster_utils import AutoscalingCluster from ray._private.event.event_logger import get_event_logger diff --git a/dashboard/modules/job/cli.py b/dashboard/modules/job/cli.py index 
3b3d35423208..69584e55efe7 100644 --- a/dashboard/modules/job/cli.py +++ b/dashboard/modules/job/cli.py @@ -12,14 +12,17 @@ from ray.autoscaler._private.cli_logger import add_click_logging_options, cf, cli_logger from ray.dashboard.modules.dashboard_sdk import parse_runtime_env_args from ray.job_submission import JobStatus, JobSubmissionClient +from ray.dashboard.modules.job.cli_utils import add_common_job_options from ray.util.annotations import PublicAPI from ray._private.utils import parse_resources_json def _get_sdk_client( - address: Optional[str], create_cluster_if_needed: bool = False + address: Optional[str], + create_cluster_if_needed: bool = False, + verify: Union[bool, str] = True, ) -> JobSubmissionClient: - client = JobSubmissionClient(address, create_cluster_if_needed) + client = JobSubmissionClient(address, create_cluster_if_needed, verify=verify) client_address = client.get_address() cli_logger.labeled_value("Job submission server address", client_address) return client @@ -152,6 +155,7 @@ def job_cli_group(): default=False, help="If set, will not stream logs and wait for the job to exit.", ) +@add_common_job_options @add_click_logging_options @click.argument("entrypoint", nargs=-1, required=True, type=click.UNPROCESSED) @PublicAPI @@ -167,13 +171,13 @@ def submit( entrypoint_num_gpus: Optional[Union[int, float]], entrypoint_resources: Optional[str], no_wait: bool, + verify: Union[bool, str], ): """Submits a job to be run on the cluster. Example: `ray job submit -- python my_script.py --arg=val` """ - if job_id: cli_logger.warning( "--job-id option is deprecated. Please use --submission-id instead." 
@@ -201,7 +205,7 @@ def submit( no_wait=no_wait, ) - client = _get_sdk_client(address, create_cluster_if_needed=True) + client = _get_sdk_client(address, create_cluster_if_needed=True, verify=verify) final_runtime_env = parse_runtime_env_args( runtime_env=runtime_env, @@ -261,15 +265,16 @@ def submit( ), ) @click.argument("job-id", type=str) +@add_common_job_options @add_click_logging_options @PublicAPI(stability="stable") -def status(address: Optional[str], job_id: str): +def status(address: Optional[str], job_id: str, verify: Union[bool, str]): """Queries for the current status of a job. Example: `ray job status ` """ - client = _get_sdk_client(address) + client = _get_sdk_client(address, verify=verify) _log_job_status(client, job_id) @@ -292,15 +297,16 @@ def status(address: Optional[str], job_id: str): help="If set, will not wait for the job to exit.", ) @click.argument("job-id", type=str) +@add_common_job_options @add_click_logging_options @PublicAPI(stability="stable") -def stop(address: Optional[str], no_wait: bool, job_id: str): +def stop(address: Optional[str], no_wait: bool, job_id: str, verify: Union[bool, str]): """Attempts to stop a job. Example: `ray job stop ` """ - client = _get_sdk_client(address) + client = _get_sdk_client(address, verify=verify) cli_logger.print(f"Attempting to stop job '{job_id}'") client.stop_job(job_id) @@ -333,9 +339,10 @@ def stop(address: Optional[str], no_wait: bool, job_id: str): ), ) @click.argument("job-id", type=str) +@add_common_job_options @add_click_logging_options @PublicAPI(stability="alpha") -def delete(address: Optional[str], job_id: str): +def delete(address: Optional[str], job_id: str, verify: Union[bool, str]): """Deletes a stopped job and its associated data from memory. Only supported for jobs that are already in a terminal state. 
@@ -347,7 +354,7 @@ def delete(address: Optional[str], job_id: str): Example: ray job delete """ - client = _get_sdk_client(address) + client = _get_sdk_client(address, verify=verify) client.delete_job(job_id) cli_logger.print(f"Job '{job_id}' deleted successfully") @@ -372,15 +379,16 @@ def delete(address: Optional[str], job_id: str): default=False, help="If set, follow the logs (like `tail -f`).", ) +@add_common_job_options @add_click_logging_options @PublicAPI(stability="stable") -def logs(address: Optional[str], job_id: str, follow: bool): +def logs(address: Optional[str], job_id: str, follow: bool, verify: Union[bool, str]): """Gets the logs of a job. Example: `ray job logs ` """ - client = _get_sdk_client(address) + client = _get_sdk_client(address, verify=verify) sdk_version = client.get_version() # sdk version 0 did not have log streaming if follow: @@ -409,15 +417,16 @@ def logs(address: Optional[str], job_id: str, follow: bool): "using the RAY_ADDRESS environment variable." ), ) +@add_common_job_options @add_click_logging_options @PublicAPI(stability="stable") -def list(address: Optional[str]): +def list(address: Optional[str], verify: Union[bool, str]): """Lists all running jobs and their information. Example: `ray job list` """ - client = _get_sdk_client(address) + client = _get_sdk_client(address, verify=verify) # Set no_format to True because the logs may have unescaped "{" and "}" # and the CLILogger calls str.format(). 
cli_logger.print(pprint.pformat(client.list_jobs()), no_format=True) diff --git a/dashboard/modules/job/cli_utils.py b/dashboard/modules/job/cli_utils.py new file mode 100644 index 000000000000..0e2afe41d652 --- /dev/null +++ b/dashboard/modules/job/cli_utils.py @@ -0,0 +1,46 @@ +from typing import Union + +import click +import functools + + +def bool_cast(string: str) -> Union[bool, str]: + """Cast a string to a boolean if possible, otherwise return the string.""" + if string.lower() == "true" or string == "1": + return True + elif string.lower() == "false" or string == "0": + return False + else: + return string + + +class BoolOrStringParam(click.ParamType): + """A click parameter that can be either a boolean or a string.""" + + name = "BOOL | TEXT" + + def convert(self, value, param, ctx): + if isinstance(value, bool): + return value + else: + return bool_cast(value) + + +def add_common_job_options(func): + """Decorator for adding CLI flags shared by all `ray job` commands.""" + + @click.option( + "--verify", + default=True, + show_default=True, + type=BoolOrStringParam(), + help=( + "Boolean indication to verify the server's TLS certificate or a path to" + " a file or directory of trusted certificates." + ), + ) + @functools.wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper diff --git a/dashboard/modules/job/common.py b/dashboard/modules/job/common.py index 0bc1d0c8f26f..e77dfb659e45 100644 --- a/dashboard/modules/job/common.py +++ b/dashboard/modules/job/common.py @@ -25,6 +25,7 @@ # In order to get information about SupervisorActors launched by different jobs, # they must be set to the same namespace. 
SUPERVISOR_ACTOR_RAY_NAMESPACE = "SUPERVISOR_ACTOR_RAY_NAMESPACE" +JOB_LOGS_PATH_TEMPLATE = "job-driver-{submission_id}.log" @PublicAPI(stability="stable") diff --git a/dashboard/modules/job/job_head.py b/dashboard/modules/job/job_head.py index 801fa1079923..acedb61fde4d 100644 --- a/dashboard/modules/job/job_head.py +++ b/dashboard/modules/job/job_head.py @@ -68,13 +68,11 @@ async def _raise_error(self, resp: ClientResponse): raise RuntimeError(f"Request failed with status code {status}: {error_text}.") async def submit_job_internal(self, req: JobSubmitRequest) -> JobSubmitResponse: - logger.debug(f"Submitting job with submission_id={req.submission_id}.") async with self._session.post( f"{self._agent_address}/api/job_agent/jobs/", json=dataclasses.asdict(req) ) as resp: - if resp.status == 200: result_json = await resp.json() return JobSubmitResponse(**result_json) @@ -82,13 +80,11 @@ async def submit_job_internal(self, req: JobSubmitRequest) -> JobSubmitResponse: await self._raise_error(resp) async def stop_job_internal(self, job_id: str) -> JobStopResponse: - logger.debug(f"Stopping job with job_id={job_id}.") async with self._session.post( f"{self._agent_address}/api/job_agent/jobs/{job_id}/stop" ) as resp: - if resp.status == 200: result_json = await resp.json() return JobStopResponse(**result_json) @@ -96,7 +92,6 @@ async def stop_job_internal(self, job_id: str) -> JobStopResponse: await self._raise_error(resp) async def delete_job_internal(self, job_id: str) -> JobDeleteResponse: - logger.debug(f"Deleting job with job_id={job_id}.") async with self._session.delete( @@ -401,6 +396,9 @@ async def get_job_info(self, req: Request) -> Response: content_type="application/json", ) + # TODO(rickyx): This endpoint's logic is also mirrored in state API's endpoint. + # We should eventually unify the backend logic (and keep the logic in sync before + # that). 
@routes.get("/api/jobs/") async def list_jobs(self, req: Request) -> Response: driver_jobs, submission_job_drivers = await get_driver_jobs( diff --git a/dashboard/modules/job/job_manager.py b/dashboard/modules/job/job_manager.py index 6e5043c12c6c..eb1e2a2f0dfa 100644 --- a/dashboard/modules/job/job_manager.py +++ b/dashboard/modules/job/job_manager.py @@ -33,6 +33,7 @@ JOB_ID_METADATA_KEY, JOB_NAME_METADATA_KEY, JOB_ACTOR_NAME_TEMPLATE, + JOB_LOGS_PATH_TEMPLATE, SUPERVISOR_ACTOR_RAY_NAMESPACE, JobInfo, JobInfoStorageClient, @@ -86,7 +87,6 @@ class JobLogStorageClient: Disk storage for stdout / stderr of driver script logs. """ - JOB_LOGS_PATH = "job-driver-{job_id}.log" # Number of last N lines to put in job message upon failure. NUM_LOG_LINES_ON_ERROR = 10 # Maximum number of characters to print out of the logs to avoid @@ -133,7 +133,7 @@ def get_log_file_path(self, job_id: str) -> Tuple[str, str]: """ return os.path.join( ray._private.worker._global_node.get_logs_dir_path(), - self.JOB_LOGS_PATH.format(job_id=job_id), + JOB_LOGS_PATH_TEMPLATE.format(submission_id=job_id), ) @@ -197,6 +197,7 @@ def _get_driver_runtime_env( # & actors. env_vars = curr_runtime_env.get("env_vars", {}) env_vars.pop(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR) + env_vars.pop(ray_constants.RAY_WORKER_NICENESS) curr_runtime_env["env_vars"] = env_vars return curr_runtime_env @@ -756,6 +757,8 @@ def _get_supervisor_runtime_env( if env_vars is None: env_vars = {} + env_vars[ray_constants.RAY_WORKER_NICENESS] = "0" + if not resources_specified: # Don't set CUDA_VISIBLE_DEVICES for the supervisor actor so the # driver can use GPUs if it wants to. 
This will be removed from diff --git a/dashboard/modules/job/pydantic_models.py b/dashboard/modules/job/pydantic_models.py index b7c4404a6c00..5b7edd9f6f23 100644 --- a/dashboard/modules/job/pydantic_models.py +++ b/dashboard/modules/job/pydantic_models.py @@ -22,7 +22,11 @@ class DriverInfo(BaseModel): @PublicAPI(stability="beta") class JobType(str, Enum): - """An enumeration for describing the different job types.""" + """An enumeration for describing the different job types. + + NOTE: + This field is still experimental and may change in the future. + """ #: A job that was initiated by the Ray Jobs API. SUBMISSION = "SUBMISSION" @@ -37,9 +41,6 @@ class JobDetails(BaseModel): """ type: JobType = Field(..., description="The type of job.") - entrypoint: Optional[str] = Field( - None, description="The entrypoint command for this job." - ) job_id: Optional[str] = Field( None, description="The job ID. An ID that is created for every job that is " diff --git a/dashboard/modules/job/sdk.py b/dashboard/modules/job/sdk.py index e8a8e00443da..9c7896ce3cee 100644 --- a/dashboard/modules/job/sdk.py +++ b/dashboard/modules/job/sdk.py @@ -60,6 +60,8 @@ class JobSubmissionClient(SubmissionClient): via a simple dict update. headers: Headers to use when sending requests to the HTTP job server, used for cases like authentication to a remote cluster. + verify: Boolean indication to verify the server's TLS certificate or a path to + a file or directory of trusted certificates. Default: True. 
""" def __init__( @@ -69,6 +71,7 @@ def __init__( cookies: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, Any]] = None, + verify: Optional[Union[str, bool]] = True, ): self._client_ray_version = ray.__version__ """Initialize a JobSubmissionClient and check the connection to the cluster.""" @@ -77,7 +80,6 @@ def __init__( "The Ray jobs CLI & SDK require the ray[default] " "installation: `pip install 'ray[default]'`" ) - # Check types of arguments if address is not None and not isinstance(address, str): raise TypeError(f"address must be a string, got {type(address)}") @@ -92,6 +94,8 @@ def __init__( raise TypeError(f"metadata must be a dict, got {type(metadata)}") if headers is not None and not isinstance(headers, dict): raise TypeError(f"headers must be a dict, got {type(headers)}") + if not (isinstance(verify, str) or isinstance(verify, bool)): + raise TypeError(f"verify must be a str or bool, got {type(verify)}") api_server_url = get_address_for_submission_client(address) @@ -101,6 +105,7 @@ def __init__( cookies=cookies, metadata=metadata, headers=headers, + verify=verify, ) self._check_connection_and_version( min_version="1.9", @@ -454,7 +459,7 @@ async def tail_job_logs(self, job_id: str) -> Iterator[str]: cookies=self._cookies, headers=self._headers ) as session: ws = await session.ws_connect( - f"{self._address}/api/jobs/{job_id}/logs/tail" + f"{self._address}/api/jobs/{job_id}/logs/tail", ssl=self._ssl_context ) while True: diff --git a/dashboard/modules/job/tests/subprocess_driver_scripts/check_niceness.py b/dashboard/modules/job/tests/subprocess_driver_scripts/check_niceness.py new file mode 100644 index 000000000000..1fbf3d3985df --- /dev/null +++ b/dashboard/modules/job/tests/subprocess_driver_scripts/check_niceness.py @@ -0,0 +1,22 @@ +""" +A dummy ray driver script that executes in subprocess. +Checks that job manager's environment variable is different. 
+""" + +import ray +import os + + +def run(): + ray.init() + + @ray.remote + def foo(): + print("worker", os.nice(0)) + + ray.get(foo.remote()) + + +if __name__ == "__main__": + print("driver", os.nice(0)) + run() diff --git a/dashboard/modules/job/tests/test_cli.py b/dashboard/modules/job/tests/test_cli.py index 4d030c0997a6..c25367421086 100644 --- a/dashboard/modules/job/tests/test_cli.py +++ b/dashboard/modules/job/tests/test_cli.py @@ -89,20 +89,22 @@ def _job_cli_group_test_address(mock_sdk_client, cmd, *args): create_cluster_if_needed = True if cmd == "submit" else False # Test passing address via command line. result = runner.invoke(job_cli_group, [cmd, "--address=arg_addr", *args]) - mock_sdk_client.assert_called_with("arg_addr", create_cluster_if_needed) + mock_sdk_client.assert_called_with( + "arg_addr", create_cluster_if_needed, verify=True + ) with pytest.raises(AssertionError): - mock_sdk_client.assert_called_with("some_other_addr", True) + mock_sdk_client.assert_called_with("some_other_addr", True, verify=True) check_exit_code(result, 0) # Test passing address via env var. with set_env_var("RAY_ADDRESS", "env_addr"): result = runner.invoke(job_cli_group, [cmd, *args]) check_exit_code(result, 0) # RAY_ADDRESS is read inside the SDK client. - mock_sdk_client.assert_called_with(None, create_cluster_if_needed) + mock_sdk_client.assert_called_with(None, create_cluster_if_needed, verify=True) # Test passing no address. 
result = runner.invoke(job_cli_group, [cmd, *args]) check_exit_code(result, 0) - mock_sdk_client.assert_called_with(None, create_cluster_if_needed) + mock_sdk_client.assert_called_with(None, create_cluster_if_needed, verify=True) class TestList: @@ -390,6 +392,29 @@ def test_entrypoint_resources_invalid_json(self, mock_sdk_client): assert result.exit_code == 1 assert "not a valid JSON string" in result.output + @pytest.mark.parametrize( + "cli_val, verify_param", + [ + ("True", True), + ("true", True), + ("1", True), + ("False", False), + ("false", False), + ("0", False), + ("a/rel/path", "a/rel/path"), + ("/an/abs/path", "/an/abs/path"), + ], + ) + def test_entrypoint_verify(self, mock_sdk_client, cli_val, verify_param): + runner = CliRunner() + with set_env_var("RAY_ADDRESS", "env_addr"): + result = runner.invoke( + job_cli_group, + ["submit", f"--verify={cli_val}", "--", "echo hello"], + ) + assert result.exit_code == 0 + mock_sdk_client.assert_called_with(None, True, verify=verify_param) + class TestDelete: def test_address(self, mock_sdk_client): diff --git a/dashboard/modules/job/tests/test_http_job_server.py b/dashboard/modules/job/tests/test_http_job_server.py index 4b4c1b5378a5..eff548dd2c19 100644 --- a/dashboard/modules/job/tests/test_http_job_server.py +++ b/dashboard/modules/job/tests/test_http_job_server.py @@ -610,7 +610,6 @@ def test_version_endpoint(job_sdk_client): def test_request_headers(job_sdk_client): client = job_sdk_client - with patch("requests.request") as mock_request: _ = client._do_request( "POST", @@ -624,6 +623,7 @@ def test_request_headers(job_sdk_client): data=None, json={"entrypoint": "ls"}, headers={"Connection": "keep-alive", "Authorization": "TOK:"}, + verify=True, ) diff --git a/dashboard/modules/job/tests/test_https_connection.py b/dashboard/modules/job/tests/test_https_connection.py new file mode 100644 index 000000000000..90e1ed3e9cb1 --- /dev/null +++ b/dashboard/modules/job/tests/test_https_connection.py @@ -0,0 +1,46 @@ 
+import pytest +import ssl +import sys +import trustme + +import ray +from ray.job_submission import JobSubmissionClient + + +@pytest.fixture(scope="session") +def ca(): + return trustme.CA() + + +@pytest.fixture(scope="session") +def httpserver_ssl_context(ca): + context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + localhost_cert = ca.issue_cert("localhost") + localhost_cert.configure_cert(context) + return context + + +@pytest.fixture(scope="session") +def httpclient_ssl_context(ca): + with ca.cert_pem.tempfile() as ca_temp_path: + return ssl.create_default_context(cafile=ca_temp_path) + + +def test_mock_https_connection(httpserver, ca): + """Test connections to a mock HTTPS job submission server.""" + httpserver.expect_request("/api/version").respond_with_json( + {"ray_version": ray.__version__} + ) + mock_url = httpserver.url_for("/") + # Connection without SSL certificate should fail + with pytest.raises(ConnectionError): + JobSubmissionClient(mock_url) + # Connecton with SSL verification skipped should succeed + JobSubmissionClient(mock_url, verify=False) + # Connection with SSL verification should succeed + with ca.cert_pem.tempfile() as ca_temp_path: + JobSubmissionClient(mock_url, verify=ca_temp_path) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/dashboard/modules/job/tests/test_job_agent.py b/dashboard/modules/job/tests/test_job_agent.py index 159577d02544..910e740923a5 100644 --- a/dashboard/modules/job/tests/test_job_agent.py +++ b/dashboard/modules/job/tests/test_job_agent.py @@ -33,7 +33,7 @@ ) from ray.dashboard.tests.conftest import * # noqa from ray.runtime_env.runtime_env import RuntimeEnv, RuntimeEnvConfig -from ray.experimental.state.api import list_nodes +from ray.util.state import list_nodes from ray.job_submission import JobStatus, JobSubmissionClient from ray.tests.conftest import _ray_start from ray.dashboard.modules.job.job_head import JobAgentSubmissionClient diff --git 
a/dashboard/modules/job/tests/test_job_manager.py b/dashboard/modules/job/tests/test_job_manager.py index 56cf0af866dc..17255f4a39cc 100644 --- a/dashboard/modules/job/tests/test_job_manager.py +++ b/dashboard/modules/job/tests/test_job_manager.py @@ -465,6 +465,19 @@ async def test_pass_env_var(self, job_manager): ) assert job_manager.get_job_logs(job_id) == "233\n" + async def test_niceness(self, job_manager): + job_id = await job_manager.submit_job( + entrypoint=f"python {_driver_script_path('check_niceness.py')}", + ) + + await async_wait_for_condition_async_predicate( + check_job_succeeded, job_manager=job_manager, job_id=job_id + ) + + logs = job_manager.get_job_logs(job_id) + assert "driver 0" in logs + assert "worker 15" in logs + async def test_multiple_runtime_envs(self, job_manager): # Test that you can run two jobs in different envs without conflict. job_id_1 = await job_manager.submit_job( diff --git a/dashboard/modules/job/tests/test_sdk.py b/dashboard/modules/job/tests/test_sdk.py index 516b87468edd..ae30c2be7f42 100644 --- a/dashboard/modules/job/tests/test_sdk.py +++ b/dashboard/modules/job/tests/test_sdk.py @@ -27,7 +27,7 @@ from ray.tests.conftest import _ray_start import ray import ray.experimental.internal_kv as kv -from ray.experimental.state.api import list_nodes +from ray.util.state import list_nodes def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool: diff --git a/dashboard/modules/job/utils.py b/dashboard/modules/job/utils.py index b232c7372791..572e5f67ae60 100644 --- a/dashboard/modules/job/utils.py +++ b/dashboard/modules/job/utils.py @@ -142,7 +142,7 @@ async def parse_and_validate_request( async def get_driver_jobs( - gcs_aio_client: GcsAioClient, + gcs_aio_client: GcsAioClient, timeout: Optional[int] = None ) -> Tuple[Dict[str, JobDetails], Dict[str, DriverInfo]]: """Returns a tuple of dictionaries related to drivers. 
@@ -151,7 +151,7 @@ async def get_driver_jobs( It's keyed by the submission job's submission id. Only the last driver of a submission job is returned. """ - reply = await gcs_aio_client.get_all_job_info() + reply = await gcs_aio_client.get_all_job_info(timeout=timeout) jobs = {} submission_job_drivers = {} diff --git a/dashboard/modules/log/log_agent.py b/dashboard/modules/log/log_agent.py index a3abeeb54cdf..8df5417f062f 100644 --- a/dashboard/modules/log/log_agent.py +++ b/dashboard/modules/log/log_agent.py @@ -1,21 +1,252 @@ import logging +from typing import Tuple +import concurrent.futures import ray.dashboard.modules.log.log_utils as log_utils import ray.dashboard.modules.log.log_consts as log_consts import ray.dashboard.utils as dashboard_utils import ray.dashboard.optional_utils as dashboard_optional_utils +from ray._private.ray_constants import env_integer import asyncio +import grpc import io import os + from pathlib import Path from ray.core.generated import reporter_pb2 from ray.core.generated import reporter_pb2_grpc +from ray._private.ray_constants import ( + LOG_PREFIX_TASK_ATTEMPT_START, + LOG_PREFIX_TASK_ATTEMPT_END, +) logger = logging.getLogger(__name__) routes = dashboard_optional_utils.ClassMethodRouteTable +# 64 KB +BLOCK_SIZE = 1 << 16 + +# Keep-alive interval for reading the file +DEFAULT_KEEP_ALIVE_INTERVAL_SEC = 1 + +RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT = env_integer( + "RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT", default=2 +) + + +def find_offset_of_content_in_file( + file: io.BufferedIOBase, content: bytes, start_offset: int = 0 +) -> int: + """Find the offset of the first occurrence of content in a file. + + Args: + file: File object + content: Content to find + start_offset: Start offset to read from, inclusive. + + Returns: + Offset of the first occurrence of content in a file. 
+ """ + logger.debug(f"Finding offset of content {content} in file") + file.seek(start_offset, io.SEEK_SET) # move file pointer to start of file + offset = start_offset + while True: + # Read in block + block_data = file.read(BLOCK_SIZE) + if block_data == b"": + # Stop reading + return -1 + # Find the offset of the first occurrence of content in the block + block_offset = block_data.find(content) + if block_offset != -1: + # Found the offset in the block + return offset + block_offset + # Continue reading + offset += len(block_data) + + +def find_end_offset_file(file: io.BufferedIOBase) -> int: + """ + Find the offset of the end of a file without changing the file pointer. + + Args: + file: File object + + Returns: + Offset of the end of a file. + """ + old_pos = file.tell() # store old position + file.seek(0, io.SEEK_END) # move file pointer to end of file + end = file.tell() # return end of file offset + file.seek(old_pos, io.SEEK_SET) + return end + + +def find_end_offset_next_n_lines_from_offset( + file: io.BufferedIOBase, start_offset: int, n: int +) -> int: + """ + Find the offsets of next n lines from a start offset. + + Args: + file: File object + start_offset: Start offset to read from, inclusive. + n: Number of lines to find. + + Returns: + Offset of the end of the next n line (exclusive) + """ + file.seek(start_offset) # move file pointer to start offset + end_offset = None + for _ in range(n): # loop until we find n lines or reach end of file + line = file.readline() # read a line and consume new line character + if not line: # end of file + break + end_offset = file.tell() # end offset. 
+ + logger.debug(f"Found next {n} lines from {start_offset} offset") + return ( + end_offset if end_offset is not None else file.seek(0, io.SEEK_END) + ) # return last line offset or end of file offset if no lines found + + +def find_start_offset_last_n_lines_from_offset( + file: io.BufferedIOBase, offset: int, n: int, block_size: int = BLOCK_SIZE +) -> int: + """ + Find the offset of the beginning of the line of the last X lines from an offset. + + Args: + file: File object + offset: Start offset from which to find last X lines, -1 means end of file. + The offset is exclusive, i.e. data at the offset is not included + in the result. + n: Number of lines to find + block_size: Block size to read from file + + Returns: + Offset of the beginning of the line of the last X lines from a start offset. + """ + logger.debug(f"Finding last {n} lines from {offset} offset") + if offset == -1: + offset = file.seek(0, io.SEEK_END) # move file pointer to end of file + else: + file.seek(offset, io.SEEK_SET) # move file pointer to start offset + + if n == 0: + return offset + nbytes_from_end = ( + 0 # Number of bytes that should be tailed from the end of the file + ) + # Non new line terminating offset, adjust the line count and treat the non-newline + # terminated line as the last line. e.g. line 1\nline 2 + file.seek(max(0, offset - 1), os.SEEK_SET) + if file.read(1) != b"\n": + n -= 1 + + # Remaining number of lines to tail + lines_more = n + read_offset = max(0, offset - block_size) + # So that we know how much to read on the last block (the block 0) + prev_offset = offset + + while lines_more >= 0 and read_offset >= 0: + # Seek to the current block start + file.seek(read_offset, 0) + # Read the current block (or less than block) data + block_data = file.read(min(block_size, prev_offset - read_offset)) + num_lines = block_data.count(b"\n") + if num_lines > lines_more: + # This is the last block to read. 
+ # Need to find the offset of exact number of lines to tail + # in the block. + # Use `split` here to split away the extra lines, i.e. + # first `num_lines - lines_more` lines. + lines = block_data.split(b"\n", num_lines - lines_more) + # Added the len of those lines that at the end of the block. + nbytes_from_end += len(lines[-1]) + break + + # Need to read more blocks. + lines_more -= num_lines + nbytes_from_end += len(block_data) + + if read_offset == 0: + # We have read all blocks (since the start) + break + # Continuing with the previous block + prev_offset = read_offset + read_offset = max(0, read_offset - block_size) + + offset_read_start = offset - nbytes_from_end + assert ( + offset_read_start >= 0 + ), f"Read start offset({offset_read_start}) should be non-negative" + return offset_read_start + + +async def _stream_log_in_chunk( + context: grpc.aio.ServicerContext, + file: io.BufferedIOBase, + start_offset: int, + end_offset: int = -1, + keep_alive_interval_sec: int = -1, + block_size: int = BLOCK_SIZE, +): + """Streaming log in chunk from start to end offset. + + Stream binary file content in chunks from start offset to an end + offset if provided, else to the end of the file. + + Args: + context: gRPC server side context + file: Binary file to stream + start_offset: File offset where streaming starts + end_offset: If -1, implying streaming til the EOF. + keep_alive_interval_sec: Duration for which streaming will be + retried when reaching the file end, -1 means no retry. + block_size: Number of bytes per chunk, exposed for testing + + Return: + Async generator of StreamReply + """ + assert "b" in file.mode, "Only binary file is supported." 
+ assert not ( + keep_alive_interval_sec >= 0 and end_offset is not -1 + ), "Keep-alive is not allowed when specifying an end offset" + + file.seek(start_offset, 0) + cur_offset = start_offset + + # Until gRPC is done + while not context.done(): + # Read in block + if end_offset != -1: + to_read = min(end_offset - cur_offset, block_size) + else: + to_read = block_size + + bytes = file.read(to_read) + + if bytes == b"": + # Stop reading + if keep_alive_interval_sec >= 0: + await asyncio.sleep(keep_alive_interval_sec) + # Try reading again + continue + + # Have read the entire file, done + break + logger.debug(f"Sending {len(bytes)} bytes at {cur_offset}") + yield reporter_pb2.StreamLogReply(data=bytes) + + # Have read the requested section [start_offset, end_offset), done + cur_offset += len(bytes) + if end_offset != -1 and cur_offset >= end_offset: + break + class LogAgent(dashboard_utils.DashboardAgentModule): def __init__(self, dashboard_agent): @@ -31,13 +262,12 @@ def is_minimal_module(): return False -# 64 KB -BLOCK_SIZE = 1 << 16 +_task_log_search_worker_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT +) -class LogAgentV1Grpc( - dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer -): +class LogAgentV1Grpc(dashboard_utils.DashboardAgentModule): def __init__(self, dashboard_agent): super().__init__(dashboard_agent) @@ -45,15 +275,17 @@ async def run(self, server): if server: reporter_pb2_grpc.add_LogServiceServicer_to_server(self, server) - # TODO: should this return True @staticmethod def is_minimal_module(): + # Dashboard is only available with non-minimal install now. return False async def ListLogs(self, request, context): """ Lists all files in the active Ray logs directory. + Part of `LogService` gRPC. 
+ NOTE: These RPCs are used by state_head.py, not log_head.py """ path = Path(self._dashboard_agent.log_dir) @@ -64,91 +296,151 @@ async def ListLogs(self, request, context): ) log_files = [] for p in path.glob(request.glob_filter): - log_files.append(p.name) + log_files.append(str(p.relative_to(path))) return reporter_pb2.ListLogsReply(log_files=log_files) + @classmethod + async def _find_task_log_offsets( + cls, task_id: str, attempt_number: int, lines: int, f: io.BufferedIOBase + ) -> Tuple[int, int]: + """Find the start and end offsets in the log file for a task attempt + Current task log is in the format of below: + + :job_id:xxx + :task_name:xxx + :task_attempt_start:- + ... + actual user logs + ... + :task_attempt_end:- + ... (other tasks) + + + For async actor tasks, task logs from multiple tasks might however + be interleaved. + """ + + # Find start + task_attempt_start_magic_line = ( + f"{LOG_PREFIX_TASK_ATTEMPT_START}{task_id}-{attempt_number}\n" + ) + + # Offload the heavy IO CPU work to a thread pool to avoid blocking the + # event loop for concurrent requests. + task_attempt_magic_line_offset = ( + await asyncio.get_running_loop().run_in_executor( + _task_log_search_worker_pool, + find_offset_of_content_in_file, + f, + task_attempt_start_magic_line.encode(), + ) + ) + + if task_attempt_magic_line_offset == -1: + raise FileNotFoundError( + f"Log for task attempt({task_id},{attempt_number}) not found" + ) + start_offset = task_attempt_magic_line_offset + len( + task_attempt_start_magic_line + ) + + # Find the end of the task log, which is the start of the next task log if any + # with the LOG_PREFIX_TASK_ATTEMPT_END magic line. 
+ task_attempt_end_magic_line = ( + f"{LOG_PREFIX_TASK_ATTEMPT_END}{task_id}-{attempt_number}\n" + ) + end_offset = await asyncio.get_running_loop().run_in_executor( + _task_log_search_worker_pool, + find_offset_of_content_in_file, + f, + task_attempt_end_magic_line.encode(), + start_offset, + ) + + if end_offset == -1: + # No other tasks (might still be running), stream til the end. + end_offset = find_end_offset_file(f) + + if lines != -1: + # Tail lines specified, find end_offset - lines offsets. + start_offset = max( + find_start_offset_last_n_lines_from_offset(f, end_offset, lines), + start_offset, + ) + + return start_offset, end_offset + async def StreamLog(self, request, context): """ Streams the log in real time starting from `request.lines` number of lines from the end of the file if `request.keep_alive == True`. Else, it terminates the stream once there are no more bytes to read from the log file. + Part of `LogService` gRPC. + NOTE: These RPCs are used by state_head.py, not log_head.py """ # NOTE: If the client side connection is closed, this handler will # be automatically terminated. lines = request.lines if request.lines else 1000 + task_id = request.task_id if request.HasField("task_id") else None - filepath = f"{self._dashboard_agent.log_dir}/{request.log_file_name}" - if "/" in request.log_file_name or not os.path.isfile(filepath): + filepath = Path(self._dashboard_agent.log_dir) / request.log_file_name + if not filepath.is_file(): await context.send_initial_metadata( [[log_consts.LOG_GRPC_ERROR, log_consts.FILE_NOT_FOUND]] ) else: with open(filepath, "rb") as f: await context.send_initial_metadata([]) - # If requesting the whole file, we stream it since it may be large. 
- if lines == -1: - while not context.done(): - bytes = f.read(BLOCK_SIZE) - if bytes == b"": - end = f.tell() - break - yield reporter_pb2.StreamLogReply(data=bytes) - else: - bytes, end = tail(f, lines) - yield reporter_pb2.StreamLogReply(data=bytes + b"\n") - if request.keep_alive: - interval = request.interval if request.interval else 1 - f.seek(end) - while not context.done(): - await asyncio.sleep(interval) - bytes = f.read() - if bytes != b"": - yield reporter_pb2.StreamLogReply(data=bytes) + # Default stream entire file + start_offset = 0 + end_offset = find_end_offset_file(f) -def tail(f: io.TextIOBase, lines: int): - """Tails the given file (in 'rb' mode) + if task_id is not None: # Stream from task log. + attempt_number = ( + request.attempt_number + if request.HasField("attempt_number") + else 0 + ) + start_offset, end_offset = await self._find_task_log_offsets( + task_id, attempt_number, lines, f + ) + logger.info( + f"Tailing task logs from {start_offset} to {end_offset} for" + f"task attempt({task_id}, {attempt_number}) in {f.name}" + ) + elif lines != -1: # Default tailing files + # If specified tail line number, + # look for the file offset with the line count + start_offset = find_start_offset_last_n_lines_from_offset( + f, offset=end_offset, n=lines + ) - We assume that any "lines" parameter is not significant (<100,000 lines) - and will result in a buffer with a small memory profile (<1MB) - - Taken from: https://stackoverflow.com/a/136368/8299684 - - Examples: - Args: - f: text file in 'rb' mode - lines: The number of lines to read from the end of the file. 
- Returns: - string containing the lines of the file, - the position of the last byte read in units of bytes - """ - - total_lines_wanted = lines + # If keep alive: following the log every 'interval' + keep_alive_interval_sec = -1 + if request.keep_alive: + keep_alive_interval_sec = ( + request.interval + if request.interval + else DEFAULT_KEEP_ALIVE_INTERVAL_SEC + ) - # Seek to the end of the file - f.seek(0, 2) - block_end_byte = f.tell() + # When following (keep_alive), it will read beyond the end + end_offset = -1 - last_byte_read = block_end_byte - lines_to_go = total_lines_wanted - block_number = -1 - blocks = [] + logger.info( + f"Tailing logs from {start_offset} to {end_offset} for " + f"lines={lines}, with keep_alive={keep_alive_interval_sec}" + ) - # Read blocks into memory until we have seen at least - # `total_lines_wanted` number of lines. Then, return a string - # containing the last `total_lines_wanted` number of lines - while lines_to_go > 0 and block_end_byte > 0: - if block_end_byte - BLOCK_SIZE > 0: - f.seek(block_number * BLOCK_SIZE, 2) - blocks.append(f.read(BLOCK_SIZE)) - else: - f.seek(0, 0) - blocks.append(f.read(block_end_byte)) - lines_found = blocks[-1].count(b"\n") - lines_to_go -= lines_found - block_end_byte -= BLOCK_SIZE - block_number -= 1 - all_read_text = b"".join(reversed(blocks)) - return b"\n".join(all_read_text.splitlines()[-total_lines_wanted:]), last_byte_read + # Read and send the file data in chunk + async for chunk_res in _stream_log_in_chunk( + context=context, + file=f, + start_offset=start_offset, + end_offset=end_offset, + keep_alive_interval_sec=keep_alive_interval_sec, + ): + yield chunk_res diff --git a/dashboard/modules/log/log_manager.py b/dashboard/modules/log/log_manager.py index d3971b6e780a..cfdea8f25840 100644 --- a/dashboard/modules/log/log_manager.py +++ b/dashboard/modules/log/log_manager.py @@ -4,9 +4,14 @@ from collections import defaultdict from typing import List, Optional, Dict, AsyncIterable, Tuple, 
Callable -from ray.experimental.state.common import GetLogOptions -from ray.experimental.state.exception import DataSourceUnavailable -from ray.experimental.state.state_manager import StateDataSourceClient +from ray.dashboard.modules.job.common import JOB_LOGS_PATH_TEMPLATE +from ray.util.state.common import ( + GetLogOptions, + protobuf_to_task_state_dict, + DEFAULT_RPC_TIMEOUT, +) +from ray.util.state.exception import DataSourceUnavailable +from ray.util.state.state_manager import StateDataSourceClient # TODO(sang): Remove the usage of this class. from ray.dashboard.datacenter import DataSource @@ -76,10 +81,12 @@ async def stream_logs( log_filename=options.filename, actor_id=options.actor_id, task_id=options.task_id, + attempt_number=options.attempt_number, pid=options.pid, get_actor_fn=DataSource.actors.get, timeout=options.timeout, suffix=options.suffix, + submission_id=options.submission_id, ) keep_alive = options.media_type == "stream" @@ -93,6 +100,8 @@ async def stream_logs( # otherwise the stream will be terminated forcefully # after the deadline is expired. timeout=options.timeout if not keep_alive else None, + task_id=options.task_id, + attempt_number=options.attempt_number, ) async for streamed_log in stream: @@ -110,17 +119,86 @@ def _verify_node_registered(self, node_id: str): ) assert node_id is not None - async def resolve_filename( + async def _resolve_job_filename(self, sub_job_id: str) -> Tuple[str, str]: + """Return the log file name and node id for a given job submission id. + + Args: + sub_job_id: The job submission id. + + Returns: + The log file name and node id. 
+ """ + job_infos = await self.client.get_job_info(timeout=DEFAULT_RPC_TIMEOUT) + target_job = None + for job_info in job_infos: + if job_info.submission_id == sub_job_id: + target_job = job_info + break + if target_job is None: + logger.info(f"Submission job ID {sub_job_id} not found.") + return None, None + + node_id = job_info.driver_node_id + if node_id is None: + raise ValueError( + f"Job {sub_job_id} has no driver node id info. " + "This is likely a bug. Please file an issue." + ) + + log_filename = JOB_LOGS_PATH_TEMPLATE.format(submission_id=sub_job_id) + return node_id, log_filename + + async def _resolve_worker_file( self, - *, node_id: str, - log_filename: Optional[str], - actor_id: Optional[str], - task_id: Optional[str], - pid: Optional[str], - get_actor_fn: Callable[[str], Dict], + worker_id: Optional[str], + pid: Optional[int], + suffix: str, timeout: int, + ) -> Optional[str]: + """Resolve worker log file.""" + if worker_id is not None and pid is not None: + raise ValueError( + f"Only one of worker id({worker_id}) or pid({pid}) should be provided." + ) + + if worker_id is not None: + log_files = await self.list_logs( + node_id, timeout, glob_filter=f"*{worker_id}*{suffix}" + ) + else: + log_files = await self.list_logs( + node_id, timeout, glob_filter=f"*{pid}*{suffix}" + ) + + # Find matching worker logs. 
+ for filename in [*log_files["worker_out"], *log_files["worker_err"]]: + # Worker logs look like worker-[worker_id]-[job_id]-[pid].out + if worker_id is not None: + worker_id_from_filename = WORKER_LOG_PATTERN.match(filename).group(1) + if worker_id_from_filename == worker_id: + return filename + else: + worker_pid_from_filename = int( + WORKER_LOG_PATTERN.match(filename).group(3) + ) + if worker_pid_from_filename == pid: + return filename + return None + + async def resolve_filename( + self, + *, + node_id: Optional[str] = None, + log_filename: Optional[str] = None, + actor_id: Optional[str] = None, + task_id: Optional[str] = None, + attempt_number: Optional[int] = None, + pid: Optional[str] = None, + get_actor_fn: Optional[Callable[[str], Dict]] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, suffix: str = "out", + submission_id: Optional[str] = None, ) -> Tuple[str, str]: """Return the file name given all options. @@ -135,8 +213,12 @@ async def resolve_filename( specified by `node_id`. suffix: Log suffix if no `log_filename` is provided, when resolving by other ids'. Default to "out". + submission_id: The submission id for a submission job. """ if actor_id: + if get_actor_fn is None: + raise ValueError("get_actor_fn needs to be specified for actor_id") + actor_data = get_actor_fn(actor_id) if actor_data is None: raise ValueError(f"Actor ID {actor_id} not found.") @@ -158,34 +240,77 @@ async def resolve_filename( ) self._verify_node_registered(node_id) - # List all worker logs that match actor's worker id. - log_files = await self.list_logs( - node_id, timeout, glob_filter=f"*{worker_id}*{suffix}" + log_filename = await self._resolve_worker_file( + node_id=node_id, + worker_id=worker_id, + pid=None, + suffix=suffix, + timeout=timeout, ) - - # Find matching worker logs. 
- for filename in [*log_files["worker_out"], *log_files["worker_err"]]: - # Worker logs look like worker-[worker_id]-[job_id]-[pid].out - worker_id_from_filename = WORKER_LOG_PATTERN.match(filename).group(1) - if worker_id_from_filename == worker_id: - log_filename = filename - break elif task_id: - raise NotImplementedError("task_id is not supported yet.") - elif pid: - self._verify_node_registered(node_id) - log_files = await self.list_logs( - node_id, timeout, glob_filter=f"*{pid}*{suffix}" + reply = await self.client.get_all_task_info( + filters=[("task_id", "=", task_id)], timeout=timeout ) - for filename in [*log_files["worker_out"], *log_files["worker_err"]]: - # worker-[worker_id]-[job_id]-[pid].out - worker_pid_from_filename = int( - WORKER_LOG_PATTERN.match(filename).group(3) + # Check if the task is found. + if len(reply.events_by_task) == 0: + raise FileNotFoundError( + f"Could not find log file for task: {task_id}" + f" (attempt {attempt_number}) with suffix: {suffix}" ) - if worker_pid_from_filename == pid: - log_filename = filename + task_event = None + for t in reply.events_by_task: + if t.attempt_number == attempt_number: + task_event = t break + if task_event is None: + raise FileNotFoundError( + "Could not find log file for task attempt:" + f"{task_id}({attempt_number})" + ) + # Get the worker id and node id. + task = protobuf_to_task_state_dict(task_event) + + worker_id = task.get("worker_id", None) + node_id = task.get("node_id", None) + + if worker_id is None or node_id is None: + raise FileNotFoundError( + "Could not find log file for task attempt:" + f"{task_id}({attempt_number})." 
+ f"Worker id = {worker_id}, node id = {node_id}" + ) + + log_filename = await self._resolve_worker_file( + node_id=node_id, + worker_id=worker_id, + pid=None, + suffix=suffix, + timeout=timeout, + ) + elif submission_id: + node_id, log_filename = await self._resolve_job_filename(submission_id) + + logger.info( + f"Resolving job {submission_id} on node {node_id} with " + f"filename {log_filename}" + ) + + elif pid: + if node_id is None: + raise ValueError( + "Node id needs to be specified for resolving" + f" filenames of pid {pid}" + ) + self._verify_node_registered(node_id) + log_filename = await self._resolve_worker_file( + node_id=node_id, + worker_id=None, + pid=pid, + suffix=suffix, + timeout=timeout, + ) + if log_filename is None: raise FileNotFoundError( "Could not find a log file. Please make sure the given " @@ -196,8 +321,9 @@ async def resolve_filename( f"\task_id: {task_id}\n" f"\tpid: {pid}\n" f"\tsuffix: {suffix}\n" + f"\tsubmission_id: {submission_id}\n" ) - + logger.info(f"Resolved log file: {log_filename} on node {node_id}") return log_filename, node_id def _categorize_log_files(self, log_files: List[str]) -> Dict[str, List[str]]: diff --git a/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json b/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json index c66c41e5d50e..ed72492a58da 100644 --- a/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json +++ b/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json @@ -36,7 +36,7 @@ ] }, "datasource": "Prometheus", - "definition": "label_values(ray_serve_deployment_request_counter{{{global_filters}}}, deployment)", + "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, deployment)", "description": null, "error": null, "hide": 0, @@ -46,7 +46,7 @@ "name": "Deployment", "options": [], "query": { - "query": 
"label_values(ray_serve_deployment_request_counter{{{global_filters}}}, deployment)", + "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, deployment)", "refId": "Prometheus-Instance-Variable-Query" }, "refresh": 2, @@ -71,7 +71,7 @@ ] }, "datasource": "Prometheus", - "definition": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, replica)", + "definition": "label_values(ray_serve_deployment_replica_healthy{{deployment=~\"$Deployment\",{global_filters}}}, replica)", "description": null, "error": null, "hide": 0, @@ -81,7 +81,7 @@ "name": "Replica", "options": [], "query": { - "query": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, replica)", + "query": "label_values(ray_serve_deployment_replica_healthy{{deployment=~\"$Deployment\",{global_filters}}}, replica)", "refId": "Prometheus-Instance-Variable-Query" }, "refresh": 2, @@ -131,6 +131,7 @@ } ] }, + "rayMeta": ["excludesSystemRoutes"], "time": { "from": "now-30m", "to": "now" diff --git a/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json b/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json index 4a1d66bc311a..14fb6f46404b 100644 --- a/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json +++ b/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json @@ -61,6 +61,7 @@ } ] }, + "rayMeta": ["excludesSystemRoutes"], "time": { "from": "now-30m", "to": "now" diff --git a/dashboard/modules/serve/tests/test_serve_agent.py b/dashboard/modules/serve/tests/test_serve_agent.py index 1ea921687c1d..6f2409e71b57 100644 --- a/dashboard/modules/serve/tests/test_serve_agent.py +++ b/dashboard/modules/serve/tests/test_serve_agent.py @@ -1,4 +1,5 @@ import copy +import os import sys from typing import Dict @@ -9,11 +10,20 @@ from ray import serve from ray._private.test_utils import wait_for_condition import 
ray._private.ray_constants as ray_constants -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray.serve._private.constants import SERVE_NAMESPACE, MULTI_APP_MIGRATION_MESSAGE from ray.serve.tests.conftest import * # noqa: F401 F403 from ray.serve.schema import ServeInstanceDetails -from ray.serve._private.common import ApplicationStatus, DeploymentStatus, ReplicaState +from ray.serve._private.common import ( + ApplicationStatus, + DeploymentStatus, + ReplicaState, + HTTPProxyStatus, +) +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) GET_OR_PUT_URL = "http://localhost:52365/api/serve/deployments/" STATUS_URL = "http://localhost:52365/api/serve/deployments/status" @@ -444,7 +454,10 @@ def test_get_status(ray_start_stop): deployment_statuses = serve_status["deployment_statuses"] assert len(deployment_statuses) == 2 - expected_deployment_names = {"f", "BasicDriver"} + expected_deployment_names = { + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}BasicDriver", + } for deployment_status in deployment_statuses: assert deployment_status["name"] in expected_deployment_names expected_deployment_names.remove(deployment_status["name"]) @@ -459,6 +472,12 @@ def test_get_status(ray_start_stop): print("Serve app status is correct.") +@pytest.mark.skipif(sys.platform == "darwin", reason="Flaky on OSX.") +def test_get_serve_instance_details_not_started(ray_start_stop): + """Test rest api when serve isn't started yet.""" + ServeInstanceDetails(**requests.get(GET_OR_PUT_URL_V2).json()) + + @pytest.mark.skipif(sys.platform == "darwin", reason="Flaky on OSX.") @pytest.mark.parametrize( "f_deployment_options", @@ -477,7 +496,7 @@ def test_get_status(ray_start_stop): def test_get_serve_instance_details(ray_start_stop, f_deployment_options): world_import_path = 
"ray.serve.tests.test_config_files.world.DagNode" fastapi_import_path = "ray.serve.tests.test_config_files.fastapi_deployment.node" - config1 = { + config = { "proxy_location": "HeadOnly", "http_options": { "host": "127.0.0.1", @@ -486,19 +505,31 @@ def test_get_serve_instance_details(ray_start_stop, f_deployment_options): "applications": [ { "name": "app1", - "route_prefix": "/app1", + "route_prefix": "/apple", "import_path": world_import_path, "deployments": [f_deployment_options], }, { "name": "app2", - "route_prefix": "/app2", + "route_prefix": "/banana", "import_path": fastapi_import_path, }, ], } + expected_values = { + "app1": { + "route_prefix": "/apple", + "docs_path": None, + "deployments": {"app1_f", "app1_BasicDriver"}, + }, + "app2": { + "route_prefix": "/banana", + "docs_path": "/my_docs", + "deployments": {"app2_FastAPIDeployment"}, + }, + } - deploy_config_multi_app(config1) + deploy_config_multi_app(config) def applications_running(): response = requests.get(GET_OR_PUT_URL_V2, timeout=15) @@ -519,43 +550,36 @@ def applications_running(): assert serve_details.http_options.host == "127.0.0.1" assert serve_details.http_options.port == 8005 print("Confirmed fetched proxy location, host and port metadata correct.") + # Check HTTP Proxy statuses + for proxy in serve_details.http_proxies.values(): + assert proxy.status == HTTPProxyStatus.HEALTHY + assert os.path.exists("/tmp/ray/session_latest/logs" + proxy.log_file_path) + print("Checked HTTP Proxy details.") + # Check controller info + assert serve_details.controller_info.actor_id + assert serve_details.controller_info.actor_name + assert serve_details.controller_info.node_id + assert serve_details.controller_info.node_ip + assert os.path.exists( + "/tmp/ray/session_latest/logs" + serve_details.controller_info.log_file_path + ) app_details = serve_details.applications + # CHECK: application details + for i, app in enumerate(["app1", "app2"]): + assert ( + 
app_details[app].deployed_app_config.dict(exclude_unset=True) + == config["applications"][i] + ) + assert app_details[app].last_deployed_time_s > 0 + assert app_details[app].route_prefix == expected_values[app]["route_prefix"] + assert app_details[app].docs_path == expected_values[app]["docs_path"] - # CHECK: app configs are equal - assert ( - app_details["app1"].deployed_app_config.dict(exclude_unset=True) - == config1["applications"][0] - ) - assert ( - app_details["app2"].deployed_app_config.dict(exclude_unset=True) - == config1["applications"][1] - ) - print("Confirmed the deployed app configs from the fetched metadata is correct.") - - # CHECK: deployment timestamp - assert app_details["app1"].last_deployed_time_s > 0 - assert app_details["app2"].last_deployed_time_s > 0 - print("Confirmed deployment timestamps are nonzero.") - - # CHECK: docs path - assert app_details["app1"].docs_path is None - assert app_details["app2"].docs_path == "/my_docs" - print("Confirmed docs paths are correct.") - - # CHECK: all deployments are present - assert app_details["app1"].deployments.keys() == { - "app1_f", - "app1_BasicDriver", - } - assert app_details["app2"].deployments.keys() == { - "app2_FastAPIDeployment", - } - print("Metadata for all deployed deployments are present.") + # CHECK: all deployments are present + assert ( + app_details[app].deployments.keys() == expected_values[app]["deployments"] + ) - # CHECK: application details - for app in ["app1", "app2"]: - assert app_details[app].route_prefix == f"/{app}" for deployment in app_details[app].deployments.values(): assert deployment.status == DeploymentStatus.HEALTHY # Route prefix should be app level options eventually @@ -576,6 +600,8 @@ def applications_running(): ) assert replica.actor_id and replica.node_id and replica.node_ip assert replica.start_time_s > app_details[app].last_deployed_time_s + file_path = "/tmp/ray/session_latest/logs" + replica.log_file_path + assert os.path.exists(file_path) 
print("Finished checking application details.") @@ -705,7 +731,10 @@ def test_serve_namespace(ray_start_stop): serve_status = client.get_serve_status() assert ( len(serve_status.deployment_statuses) == 2 - and serve_status.get_deployment_status("f") is not None + and serve_status.get_deployment_status( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + ) + is not None ) print("Successfully retrieved deployment statuses with Python API.") print("Shutting down Python API.") diff --git a/dashboard/modules/state/state_head.py b/dashboard/modules/state/state_head.py index 93b9592ca867..8ac6bd8c9da5 100644 --- a/dashboard/modules/state/state_head.py +++ b/dashboard/modules/state/state_head.py @@ -20,7 +20,7 @@ from ray.dashboard.optional_utils import rest_response from ray.dashboard.state_aggregator import StateAPIManager from ray.dashboard.utils import Change -from ray.experimental.state.common import ( +from ray.util.state.common import ( RAY_MAX_LIMIT_FROM_API_SERVER, ListApiOptions, GetLogOptions, @@ -32,9 +32,9 @@ DEFAULT_LIMIT, DEFAULT_LOG_LIMIT, ) -from ray.experimental.state.exception import DataSourceUnavailable -from ray.experimental.state.state_manager import StateDataSourceClient -from ray.experimental.state.util import convert_string_to_type +from ray.util.state.exception import DataSourceUnavailable +from ray.util.state.state_manager import StateDataSourceClient +from ray.util.state.util import convert_string_to_type logger = logging.getLogger(__name__) @@ -406,16 +406,20 @@ async def get_logs(self, req: aiohttp.web.Request): filename=req.query.get("filename", None), actor_id=req.query.get("actor_id", None), task_id=req.query.get("task_id", None), + submission_id=req.query.get("submission_id", None), pid=req.query.get("pid", None), lines=req.query.get("lines", DEFAULT_LOG_LIMIT), interval=req.query.get("interval", None), suffix=req.query.get("suffix", "out"), + attempt_number=req.query.get("attempt_number", 0), ) response = 
aiohttp.web.StreamResponse() response.content_type = "text/plain" await response.prepare(req) + logger.info(f"Streaming logs with options: {options}") + # NOTE: The first byte indicates the success / failure of individual # stream. If the first byte is b"1", it means the stream was successful. # If it is b"0", it means it is failed. diff --git a/dashboard/state_aggregator.py b/dashboard/state_aggregator.py index e4e38c9f323b..d3e978f87b62 100644 --- a/dashboard/state_aggregator.py +++ b/dashboard/state_aggregator.py @@ -1,7 +1,8 @@ import asyncio import logging -from dataclasses import asdict, fields +from dataclasses import fields +import dataclasses from itertools import islice from typing import List, Tuple, Optional from datetime import datetime @@ -11,9 +12,10 @@ import ray.dashboard.memory_utils as memory_utils -from ray.experimental.state.common import ( +from ray.util.state.common import ( protobuf_message_to_dict, ActorState, + JobState, ListApiOptions, ListApiResponse, NodeState, @@ -36,12 +38,12 @@ PredicateType, protobuf_to_task_state_dict, ) -from ray.experimental.state.state_manager import ( +from ray.util.state.state_manager import ( DataSourceUnavailable, StateDataSourceClient, ) from ray.runtime_env import RuntimeEnv -from ray.experimental.state.util import convert_string_to_type +from ray.util.state.util import convert_string_to_type logger = logging.getLogger(__name__) @@ -80,7 +82,10 @@ def _convert_filters_type( A new list of filters with correct types that match the schema. """ new_filter = [] - schema = {field.name: field.type for field in fields(schema)} + if dataclasses.is_dataclass(schema): + schema = {field.name: field.type for field in fields(schema)} + else: + schema = schema.schema_dict() for col, predicate, val in filter: if col in schema: @@ -95,7 +100,7 @@ def _convert_filters_type( if isinstance(val, column_type): # Do nothing. 
pass - elif column_type is int: + elif column_type is int or column_type == "integer": try: val = convert_string_to_type(val, int) except ValueError: @@ -104,16 +109,19 @@ def _convert_filters_type( "column. Please provide an integer filter " f"`--filter {col} [int]`" ) - elif column_type is float: + elif column_type is float or column_type == "number": try: - val = convert_string_to_type(val, float) + val = convert_string_to_type( + val, + float, + ) except ValueError: raise ValueError( f"Invalid filter `--filter {col} {val}` for a float " "type column. Please provide an integer filter " f"`--filter {col} [float]`" ) - elif column_type is bool: + elif column_type is bool or column_type == "boolean": try: val = convert_string_to_type(val, bool) except ValueError: @@ -251,7 +259,6 @@ async def list_placement_groups(self, *, option: ListApiOptions) -> ListApiRespo result = [] for message in reply.placement_group_table_data: - data = protobuf_message_to_dict( message=message, fields_to_decode=["placement_group_id", "creator_job_id", "node_id"], @@ -352,22 +359,21 @@ async def list_workers(self, *, option: ListApiOptions) -> ListApiResponse: ) async def list_jobs(self, *, option: ListApiOptions) -> ListApiResponse: - # TODO(sang): Support limit & timeout & async calls. try: - result = [] - job_info = await self._client.get_job_info() - for job_id, data in job_info.items(): - data = asdict(data) - data["job_id"] = job_id - result.append(data) + result = await self._client.get_job_info(timeout=option.timeout) + result = [job.dict() for job in result] + total = len(result) + result = self._filter(result, option.filters, JobState, option.detail) + num_filtered = len(result) + result.sort(key=lambda entry: entry["job_id"] or "") + result = list(islice(result, option.limit)) except DataSourceUnavailable: raise DataSourceUnavailable(GCS_QUERY_FAILURE_WARNING) return ListApiResponse( result=result, - # TODO(sang): Support this. 
- total=len(result), - num_after_truncation=len(result), - num_filtered=len(result), + total=total, + num_after_truncation=total, + num_filtered=num_filtered, ) async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse: @@ -377,16 +383,10 @@ async def list_tasks(self, *, option: ListApiOptions) -> ListApiResponse: {task_id -> task_data_in_dict} task_data_in_dict's schema is in TaskState """ - job_id = None - for filter in option.filters: - if filter[0] == "job_id" and filter[1] == "=": - # Filtering by job_id == xxxx, pass it to source side filtering. - # tuple consists of (job_id, predicate, value) - job_id = filter[2] try: reply = await self._client.get_all_task_info( timeout=option.timeout, - job_id=job_id, + filters=option.filters, exclude_driver=option.exclude_driver, ) except DataSourceUnavailable: diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index 19d71ffddf43..50496c32ab43 100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -39,9 +39,9 @@ import ray.scripts.scripts as scripts from ray.dashboard import dashboard from ray.dashboard.head import DashboardHead -from ray.experimental.state.api import StateApiClient -from ray.experimental.state.common import ListApiOptions, StateResource -from ray.experimental.state.exception import ServerUnavailable +from ray.util.state import StateApiClient +from ray.util.state.common import ListApiOptions, StateResource +from ray.util.state.exception import ServerUnavailable from ray.experimental.internal_kv import _initialize_internal_kv from unittest.mock import MagicMock from ray.dashboard.utils import DashboardHeadModule @@ -277,22 +277,38 @@ def test_agent_report_unexpected_raylet_death_large_file(shutdown_only): "ray_start_with_dashboard", [ {"dashboard_host": "127.0.0.1"}, + {"dashboard_host": "localhost"}, + ], + indirect=True, +) +def test_dashboard_address_local(ray_start_with_dashboard): + webui_url = 
ray_start_with_dashboard["webui_url"] + if os.environ.get("RAY_MINIMAL") == "1": + # In the minimal installation, webui url shouldn't be configured. + assert webui_url == "" + else: + webui_ip = webui_url.split(":")[0] + assert not ipaddress.ip_address(webui_ip).is_unspecified + assert webui_ip == "127.0.0.1" + + +@pytest.mark.parametrize( + "ray_start_with_dashboard", + [ {"dashboard_host": "0.0.0.0"}, {"dashboard_host": "::"}, ], indirect=True, ) -def test_dashboard_address(ray_start_with_dashboard): +def test_dashboard_address_global(ray_start_with_dashboard): webui_url = ray_start_with_dashboard["webui_url"] if os.environ.get("RAY_MINIMAL") == "1": # In the minimal installation, webui url shouldn't be configured. assert webui_url == "" else: webui_ip = webui_url.split(":")[0] - print(ipaddress.ip_address(webui_ip)) - print(webui_ip) assert not ipaddress.ip_address(webui_ip).is_unspecified - assert webui_ip in ["127.0.0.1", ray_start_with_dashboard["node_ip_address"]] + assert webui_ip == ray_start_with_dashboard["node_ip_address"] @pytest.mark.skipif( @@ -766,13 +782,14 @@ def test_dashboard_port_conflict(ray_start_with_dashboard): f"--log-dir={log_dir}", f"--gcs-address={address_info['gcs_address']}", f"--session-dir={session_dir}", + "--node-ip-address=127.0.0.1", ] logger.info("The dashboard should be exit: %s", dashboard_cmd) - p = subprocess.Popen(dashboard_cmd) - p.wait(5) + dashboard_process = subprocess.Popen(dashboard_cmd) + dashboard_process.wait(5) dashboard_cmd.append("--port-retries=10") - subprocess.Popen(dashboard_cmd) + conflicting_dashboard_process = subprocess.Popen(dashboard_cmd) timeout_seconds = 10 start_time = time.time() @@ -792,6 +809,10 @@ def test_dashboard_port_conflict(ray_start_with_dashboard): finally: if time.time() > start_time + timeout_seconds: raise Exception("Timed out while testing.") + dashboard_process.kill() + conflicting_dashboard_process.kill() + dashboard_process.wait() + conflicting_dashboard_process.wait() 
@pytest.mark.skipif( @@ -984,15 +1005,17 @@ def test_dashboard_requests_fail_on_missing_deps(ray_start_with_dashboard): def test_dashboard_module_load(tmpdir): """Verify if the head module can load only selected modules.""" head = DashboardHead( - "127.0.0.1", - 8265, - 1, - "127.0.0.1:6379", - str(tmpdir), - str(tmpdir), - str(tmpdir), - False, - True, + http_host="127.0.0.1", + http_port=8265, + http_port_retries=1, + node_ip_address="127.0.0.1", + gcs_address="127.0.0.1:6379", + grpc_port=0, + log_dir=str(tmpdir), + temp_dir=str(tmpdir), + session_dir=str(tmpdir), + minimal=False, + serve_frontend=True, ) # Test basic. diff --git a/dashboard/utils.py b/dashboard/utils.py index 9ef0ad986ed4..6434ce5c1b60 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -58,7 +58,7 @@ async def run(self, server): def is_minimal_module(): """ Return True if the module is minimal, meaning it - should work with `pip install ray` that doesn't requires additonal + should work with `pip install ray` that doesn't requires additional dependencies. """ @@ -87,7 +87,7 @@ async def run(self, server): def is_minimal_module(): """ Return True if the module is minimal, meaning it - should work with `pip install ray` that doesn't requires additonal + should work with `pip install ray` that doesn't requires additional dependencies. 
""" diff --git a/doc/BUILD b/doc/BUILD index af4044126013..13f8beed62aa 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -223,10 +223,63 @@ py_test_run_all_subdirectory( include = ["source/data/doc_code/*.py"], exclude = [ "source/ray-air/doc_code/predictors.py", - "source/data/doc_code/creating_datasets_untested.py" + "source/data/doc_code/loading_data_untested.py", + "source/data/doc_code/torch_image_batch_trained.py" ], extra_srcs = [], - tags = ["exclusive", "team:core"], + tags = ["exclusive", "team:data"], +) + +# -------------------------------------------------------------------- +# Test all Workspace template notebooks in doc/source/templates +# as smoke tests. +# -------------------------------------------------------------------- + +filegroup( + name = "workspace_templates", + srcs = glob([ + "source/templates/tests/**/*.ipynb", + "source/templates/tests/**/requirements.txt" + ]), + visibility = ["//doc:__subpackages__"] +) + +# Validate that all the paths and yamls within the templates.yaml file are valid. + +py_test( + name = "templates_directory_validation", + size = "small", + main = "source/templates/validate.py", + srcs = ["source/templates/validate.py"], + data = glob(["source/templates/**/*"]), + tags = ["exclusive", "team:ml"] +) + +# Templates that only require CPU + +py_test_run_all_notebooks( + size = "large", + # TODO(justinvyu): Merge tests/ with the regular versions of the templates. + include = ["source/templates/tests/02_many_model_training/many_model_training.ipynb"], + exclude = [], + data = ["//doc:workspace_templates"], + tags = ["exclusive", "team:ml", "ray_air"], + env = {"SMOKE_TEST": "1"}, +) + +# Templates that require GPU + +py_test_run_all_notebooks( + size = "large", + include = [ + # TODO(justinvyu): Merge tests/ with the regular versions of the templates. 
+ "source/templates/tests/01_batch_inference/batch_inference.ipynb", + "source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb" + ], + exclude = [], + data = ["//doc:workspace_templates"], + tags = ["exclusive", "team:ml", "ray_air", "gpu"], + env = {"SMOKE_TEST": "1"}, ) # -------------- diff --git a/doc/Makefile b/doc/Makefile index 8e819ce54b7b..98bc7e086207 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -50,6 +50,12 @@ help: clean: rm -rf $(BUILDDIR)/* + rm -rf ./source/*/api/doc/* + rm -rf ./source/ray-references/api/*/doc/* + rm -rf ./source/cluster/running_applications/doc/* + rm -rf ./source/cluster/running_applications/job-submission/doc/* + rm -rf ./source/ray-observability/api/state/doc* + rm -rf ./source/rllib/package_ref/doc* html: $(SPHINXBUILD) -W --keep-going -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html diff --git a/doc/README.md b/doc/README.md index bd131b119273..f707e059c63e 100644 --- a/doc/README.md +++ b/doc/README.md @@ -21,17 +21,20 @@ make develop && open _build/html/index.html > **_NOTE:_** The above command is for development. To reproduce build failures from the > CI, you should use `make html` which is the same as `make develop` but treats warnings as errors. +> Additionally, note that `make develop` uses the `FAST` environment variable to skip some +> expensive parts of the build process. In particular, it will aggressively prune the +> left-hand side navigation, but leave the documents itself intact. -## Building just one sub-project +## Building just one subproject -Often your changes in documentation just concern one sub-project, such as Tune or Train. -To build just this one sub-project, and ignore the rest +Often your changes in documentation just concern one subproject, such as Tune or Train. 
+To build just this one subproject, and ignore the rest (leading to build warnings due to broken references etc.), run the following command: ```shell DOC_LIB= sphinx-build -b html -d _build/doctrees source _build/html ``` -where `` is the name of the sub-project and can be any of the docs projects in the `source/` +where `` is the name of the subproject and can be any of the docs projects in the `source/` directory either called `tune`, `rllib`, `train`, `cluster`, `serve`, `data` or the ones starting with `ray-`, e.g. `ray-observability`. diff --git a/doc/external/external_code.txt b/doc/external/external_code.txt new file mode 100644 index 000000000000..7eec59ed3a7a --- /dev/null +++ b/doc/external/external_code.txt @@ -0,0 +1,6 @@ +# Mapping from file to external URI. +# If the file is touched in a PR, a comment is posted +# by a bot to remind the user to update the contents at +# the external URI + +# Please keep this as the last line. diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 4172bc287d96..8ea8d767da4e 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -5,6 +5,7 @@ accelerate>=0.17.0 click colorama colorful +datasets # Newer versions of fairscale do not support Python 3.6 even though they still have wheels for it. 
# Have to manually pin it: https://github.com/facebookresearch/fairscale/issues/962 fairscale; python_version >= '3.7' @@ -26,7 +27,6 @@ pyyaml scikit-optimize redis starlette -tabulate uvicorn==0.16.0 werkzeug wandb @@ -47,7 +47,7 @@ git+https://github.com/ray-project/lightgbm_ray@main#lightgbm_ray git+https://github.com/ray-project/ray_lightning@main#ray_lightning # Syntax highlighting -Pygments==2.11.2 +Pygments==2.13.0 # Sphinx sphinx==4.3.2 @@ -55,16 +55,15 @@ sphinx-click==3.0.2 sphinx-copybutton==0.4.0 sphinxemoji==0.2.0 sphinx-jsonschema==1.17.2 -sphinx-panels==0.6.0 sphinx-version-warning==1.1.2 sphinx-book-theme==0.3.3 -sphinx-external-toc==0.2.3 -sphinxcontrib.yt==0.2.2 +sphinx-external-toc==0.2.4 sphinx-sitemap==2.2.0 sphinxcontrib-redoc==1.6.0 sphinx-tabs==3.4.0 sphinx-remove-toctrees==0.0.3 autodoc_pydantic==1.6.1 +sphinx_design==0.4.1 # MyST myst-parser==0.15.2 @@ -72,3 +71,6 @@ myst-nb==0.13.1 # Jupyter conversion jupytext==1.13.6 + +# Pin urllib to avoid downstream ssl incompatibility issues +urllib3 < 1.27 \ No newline at end of file diff --git a/doc/requirements-rtd.txt b/doc/requirements-rtd.txt deleted file mode 100644 index 5d5b4e713754..000000000000 --- a/doc/requirements-rtd.txt +++ /dev/null @@ -1,2 +0,0 @@ -# CI requirements: this is the file buildkite needs. 
--r requirements-doc.txt \ No newline at end of file diff --git a/doc/source/_static/css/custom.css b/doc/source/_static/css/custom.css index d2e920761432..7586bdfae7ce 100644 --- a/doc/source/_static/css/custom.css +++ b/doc/source/_static/css/custom.css @@ -109,6 +109,12 @@ div.navbar-brand-box { display: flex; flex-direction: column; } + +.bd-sidebar li { + position: relative; + word-wrap: break-word; +} + nav.bd-links { overflow-y: auto; flex: 1; diff --git a/doc/source/_static/js/custom.js b/doc/source/_static/js/custom.js index ff4f883b65a3..2ade6e53c97d 100644 --- a/doc/source/_static/js/custom.js +++ b/doc/source/_static/js/custom.js @@ -35,12 +35,12 @@ document.addEventListener("DOMContentLoaded", function() { for (let i = 0; i < navItems.length; i++) { let navItem = navItems[i]; const stringList = [ - "User Guide", "Examples", + "User Guides", "Examples", "Ray Core", "Ray Core API", "Ray Clusters", "Deploying on Kubernetes", "Deploying on VMs", "Applications Guide", "Ray Cluster Management API", "Ray AI Runtime (AIR)", "Ray AIR API", - "Ray Data", "Ray Datasets API", "Integrations", + "Ray Data", "Ray Data API", "Integrations", "Ray Train", "Ray Train API", "Ray Tune", "Ray Tune Examples", "Ray Tune API", "Ray Serve", "Ray Serve API", @@ -102,3 +102,22 @@ window.onload = function() { localStorage.removeItem("scroll"); } }; + +// When the document is fully loaded +document.addEventListener("DOMContentLoaded", function() { + // find all the code blocks' copy buttons + let codeButtons = document.querySelectorAll(".copybtn"); + for (let i = 0; i < codeButtons.length; i++) { + const button = codeButtons[i]; + // and add a click event listener to each one for Google Analytics. 
+ button.addEventListener("click", function() { + gtag("event", "code_copy_click", { + "send_to": "UA-110413294-1", + "event_category": "ray_docs_copy_code", + "event_label": "URL: " + document.URL + + " Button: " + button.getAttribute("data-clipboard-target"), + "value": 1, + }); + }); + } +}); diff --git a/doc/source/_static/js/top-navigation.js b/doc/source/_static/js/top-navigation.js index 0a0a5adf5e57..bbb7a60ece8b 100644 --- a/doc/source/_static/js/top-navigation.js +++ b/doc/source/_static/js/top-navigation.js @@ -77,7 +77,7 @@ librariesMenu.innerHTML = "Libraries" + downCaret + "" librariesList = document.createElement("ul") librariesList.innerHTML += "
  • Ray CoreScale general Python applications
  • " librariesList.innerHTML += "
  • Ray AIRScale AI applications
  • " -librariesList.innerHTML += "
  • Ray DatasetsScale data ingest and preprocessing
  • " +librariesList.innerHTML += "
  • Ray DataScale data ingest and preprocessing
  • " librariesList.innerHTML += "
  • Ray TrainScale machine learning training
  • " librariesList.innerHTML += "
  • Ray TuneScale hyperparameter tuning
  • " librariesList.innerHTML += "
  • Ray ServeScale model serving
  • " diff --git a/doc/source/_templates/autosummary/class_without_init_args.rst b/doc/source/_templates/autosummary/class_without_init_args.rst new file mode 100644 index 000000000000..643b939c2eb4 --- /dev/null +++ b/doc/source/_templates/autosummary/class_without_init_args.rst @@ -0,0 +1,6 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }}() + :members: diff --git a/doc/source/_templates/layout.html b/doc/source/_templates/layout.html index 50de03599bde..f9a687d27080 100644 --- a/doc/source/_templates/layout.html +++ b/doc/source/_templates/layout.html @@ -15,5 +15,25 @@ gtag('config', 'UA-110413294-1'); + + + {% endblock %} diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 0c87cc8cf76e..3fe175ec0204 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -53,8 +53,10 @@ parts: - file: ray-air/computer-vision - file: ray-air/examples/serving_guide - file: ray-air/deployment + - file: ray-air/experimental-features - file: ray-air/examples/index sections: + - file: ray-air/examples/opt_deepspeed_batch_inference - file: ray-air/examples/torch_image_example - file: ray-air/examples/torch_detection - file: ray-air/examples/convert_existing_pytorch_code_to_ray_air @@ -81,10 +83,11 @@ parts: - file: ray-air/examples/gptj_batch_prediction - file: ray-air/examples/gptj_serving - file: ray-air/examples/dreambooth_finetuning + - file: ray-air/examples/dolly_lightning_fsdp_finetuning - file: ray-air/api/api - file: ray-air/benchmarks - - file: data/dataset + - file: data/data title: Ray Data sections: - file: data/getting-started @@ -95,10 +98,9 @@ parts: - file: data/examples/nyc_taxi_basic_processing title: Processing the NYC taxi dataset - file: data/examples/batch_training - title: Batch Training with Ray Datasets + title: Batch Training with Ray Data - file: data/examples/ocr_example - title: Scaling OCR with Ray Datasets - - file: data/examples/advanced-pipelines + title: Scaling OCR with Ray Data 
- file: data/examples/random-access - file: data/faq - file: data/api/api @@ -265,6 +267,7 @@ parts: - file: serve/scaling-and-resource-allocation - file: serve/model_composition - file: serve/dev-workflow + - file: serve/app-builder-guide - file: serve/multi-app - file: serve/production-guide/index sections: @@ -383,6 +386,19 @@ parts: - file: ray-observability/monitoring-debugging/monitoring-debugging title: "Monitoring and Debugging" + sections: + - file: ray-observability/user-guides/index + title: User Guides + sections: + - file: ray-observability/user-guides/troubleshoot-apps/index + title: Troubleshooting Applications + sections: + - file: ray-observability/user-guides/troubleshoot-apps/troubleshoot-failures + - file: ray-observability/user-guides/troubleshoot-apps/troubleshoot-hangs + - file: ray-observability/user-guides/troubleshoot-apps/optimize-performance + - file: ray-observability/user-guides/troubleshoot-apps/ray-debugging + - file: ray-observability/user-guides/troubleshoot-apps/ray-core-profiling + - file: ray-observability/user-guides/ray-tracing - file: ray-references/api title: References @@ -397,7 +413,10 @@ parts: sections: - file: ray-contribute/development - file: ray-contribute/docs + - file: ray-contribute/writing-code-snippets - file: ray-contribute/fake-autoscaler - file: ray-core/examples/testing-tips + - file: ray-contribute/debugging.rst + - file: ray-contribute/profiling.rst - file: ray-core/configure - file: ray-contribute/whitepaper diff --git a/doc/source/cluster/getting-started.rst b/doc/source/cluster/getting-started.rst index 8024ef751a33..e0054beea1b8 100644 --- a/doc/source/cluster/getting-started.rst +++ b/doc/source/cluster/getting-started.rst @@ -31,57 +31,66 @@ or onto :ref:`platforms not listed here `. What's next? ------------ -.. 
panels:: - :container: text-center - :column: col-lg-6 px-3 py-2 - :card: - - **I want to learn key Ray cluster concepts** - ^^^ - Understand the key concepts and main ways of interacting with a Ray cluster. - - +++ - .. link-button:: cluster-key-concepts - :type: ref - :text: Learn Key Concepts - :classes: btn-outline-info btn-block - - --- - - **I want to run Ray on Kubernetes** - ^^^ - Deploy a Ray application to a Kubernetes cluster. You can run the tutorial on a - Kubernetes cluster or on your laptop via KinD. - - +++ - .. link-button:: kuberay-quickstart - :type: ref - :text: Get Started with Ray on Kubernetes - :classes: btn-outline-info btn-block - - --- - - **I want to run Ray on a cloud provider** - ^^^ - Take a sample application designed to run on a laptop and scale it up in the - cloud. Access to an AWS or GCP account is required. - - +++ - .. link-button:: vm-cluster-quick-start - :type: ref - :text: Get Started with Ray on VMs - :classes: btn-outline-info btn-block - - --- - - **I want to run my application on an existing Ray cluster** - ^^^ - Guide to submitting applications as Jobs to existing Ray clusters. - - +++ - .. link-button:: jobs-quickstart - :type: ref - :text: Job Submission - :classes: btn-outline-info btn-block +.. grid:: 1 2 2 2 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + + **I want to learn key Ray cluster concepts** + ^^^ + Understand the key concepts and main ways of interacting with a Ray cluster. + + +++ + .. button-ref:: cluster-key-concepts + :color: primary + :outline: + :expand: + + Learn Key Concepts + + .. grid-item-card:: + + **I want to run Ray on Kubernetes** + ^^^ + Deploy a Ray application to a Kubernetes cluster. You can run the tutorial on a + Kubernetes cluster or on your laptop via KinD. + + +++ + .. button-ref:: kuberay-quickstart + :color: primary + :outline: + :expand: + + Get Started with Ray on Kubernetes + + .. 
grid-item-card:: + + **I want to run Ray on a cloud provider** + ^^^ + Take a sample application designed to run on a laptop and scale it up in the + cloud. Access to an AWS or GCP account is required. + + +++ + .. button-ref:: vm-cluster-quick-start + :color: primary + :outline: + :expand: + + Get Started with Ray on VMs + + .. grid-item-card:: + + **I want to run my application on an existing Ray cluster** + ^^^ + Guide to submitting applications as Jobs to existing Ray clusters. + + +++ + .. button-ref:: jobs-quickstart + :color: primary + :outline: + :expand: + + Job Submission .. include:: /_includes/clusters/announcement_bottom.rst diff --git a/doc/source/cluster/kubernetes/configs/static-ray-cluster.tls.yaml b/doc/source/cluster/kubernetes/configs/static-ray-cluster.tls.yaml new file mode 100644 index 000000000000..70e437f508cc --- /dev/null +++ b/doc/source/cluster/kubernetes/configs/static-ray-cluster.tls.yaml @@ -0,0 +1,383 @@ +apiVersion: v1 +kind: Secret +metadata: + name: ca-tls +data: + # output from cat ca.crt | base64 + ca.crt: | + 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM3RENDQWRRQ0NRQ05Yck8zQTAwbWRqQU5CZ2txaGtpRzl3MEJBUXNGQURBNE1SRXdEd1lEVlFRRERBZ3EKTG5KaGVTNXBiekVMTUFrR0ExVUVCaE1DVlZNeEZqQVVCZ05WQkFjTURWTmhiaUJHY21GdVkybHpZMjh3SGhjTgpNak13TXpJM01EZ3dNVFF4V2hjTk16TXdNekkwTURnd01UUXhXakE0TVJFd0R3WURWUVFEREFncUxuSmhlUzVwCmJ6RUxNQWtHQTFVRUJoTUNWVk14RmpBVUJnTlZCQWNNRFZOaGJpQkdjbUZ1WTJselkyOHdnZ0VpTUEwR0NTcUcKU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLQW9JQkFRQ3ZJbGNGSmZxaFNidWowQ3ZpalA0c2xXN3I3Qk1kYVJOeAp5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUa0Exb3FxVHhGdTZMSm5LOGJHN012Cm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRc0tVRDNJN3U3QjF5bVpxTjQwWEgKWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2hHRzA5RmNtZlF5REFlM2VvTm1IWQpVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLM2tXbTZnUytwQTIvdkZIaU93RHNaClNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3RTRVSG4wN0pBZ01CQUFFd0RRWUoKS29aSWh2Y05BUUVMQlFBRGdnRUJBQWhSY3g2NzVJbjJVaERhMzArTkZ0UlNTcUJwK1E2WTl3VGNTL0NqM1J3MgpLSnkzUVhBU0xJUW1ESWdrVlBJeEY0V1VYUFdGdmxUL0taQ2JRejRvN2M3ck9DWEVEWnVhbExUSHRrTHVSZFNWClVHSTVSWTJXNUx6UXM2MnNtUG13OWVQYnNLek5kOEpjWkwvNndHZnNsZVQyY1RLTjliZVE2ZWdiQmdEcy91d0sKeVdOREtnaE4vaE16YmRSaFh2SFNiTW8rUkgvRG1Va1VhTXZZc3NNbzFYQkwzRXZwbmpnZXI1ZWQ5ZDVjQWYvUQpuU0VCMk13Z08rWHEwKy9sWmpiUFNWOVdWQnY1YjZlc1ZPcnZrV2o2TUFKcjUwb3BwT09KUy9TbTNEU3F5aDRBClR5c1BOblQxYStxWDRVZXljZ05VbXRoOXdONFBnc3B6ZEpORWtVdTVSSmM9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + # output from cat ca.key | base64 + ca.key: | + 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktnd2dnU2tBZ0VBQW9JQkFRQ3ZJbGNGSmZxaFNidWoKMEN2aWpQNHNsVzdyN0JNZGFSTnh5aDhJMGNaSU5QcjQ5Rjg1dXNrY0pxbnFHNC9LeThBYnlacURBUUxsalFUawpBMW9xcVR4RnU2TEpuSzhiRzdNdm90dStjVlZLWW5SeDlLWVoyWi90THRPdzhjZHFzOURuNXVERVh0L0loZzBRCnNLVUQzSTd1N0IxeW1acU40MFhIWDVMRUJkN1llSm5XZExqOStLOTl6ZVR0aHlUMWtsRGsySVp2ZjVsa2xjT2gKR0cwOUZjbWZReURBZTNlb05tSFlVaUhVU0NORGtnWTV3U3A4V3R6RXEydHBhZEQ2eTVCNVRMS2kvV1l4ZTJLMwprV202Z1MrcEEyL3ZGSGlPd0RzWlNqb1ZncUtMZ0lNSnZMOGR0bitaWjNLbDlMRkZNY0JiMWJ1NCtKN2U1bno3CkU0VUhuMDdKQWdNQkFBRUNnZ0VCQUpYbG9XK2hveE83UlNRZmdBQkhSeUdud1NtaWhIWE93cnJKRWFqOXkyVncKRzBOTC9ka3Vld1ZpUGxwR3Z0c0hhMlVkTitkYXpUem1aMEkxY0U1RlRYWXQ5RlgxaXBaOExmRGV4cEFJOXNSVQo0bS9Ld3dRckZVdnZvWGE0YWtOMHBxQm1Kd2xNWHVPRmdOZEJLZXZWTW0xaW9JMisxTjhPb0dIVjlvdGFydks5ClUzY09CbmVBSjZmamF6ODd4RG1NY0dBcG82ZWdMOG0xaWJ1NUNwcFo2L2J2YVZYbHhFdXRtUjZYR2VKczdBRzMKVEtFYVhzTU1qdFdaM3ZXUDArMFJIMGpzRVI2a0ZMeEI3KzRHRWdPSk1WblZqbjlzT3FhVW41KzJ1REkrdkFkbAo0K2Fya3dwQnpzbGlaUVJLVW95aGwvMTRRZW9pcXpwVk9oVVpheFJOTnpVQ2dZRUE0RC83TmRudGFxa0JSdEdiClZUQTE0clA3Vy90THZSQnpBNWtLc3Q4V3crWFVGeTcvVEY2NkxpMVVtKzhhK2ttY1pMTm9mamNZc1pTMExkVXMKMlR4dk1IRWplcmdNUm5oWmUrOGJBZlZkd1RTcCtpdUpMKzRZWW1JRUZWUWlsSXhtRURzMkZQSnVZMHRDZW9ETAprVEFSeUNtMENPYUR1VXdLZjlMY3h3SFR4M3NDZ1lFQXgrNGpyOHV3aXh3WmwwUFBJb1Z1by9wSHZMT0ZxNXNBCmIrVEZnMEhFTVdIK1JKclhLRjA1YTRGNS9zc3pLZ09ZMGFZVUxlWnp3V1dJZElId0pzQnhGWktOdHRYTkhRbS8KOEFlVGRENnZ1OXlmN0tFZjhRNnFmaDRPRExvVDg0UTFWbGs4ek5ZN0FNUWZwN2p5RnpFOStvSm9tdlM0Snc1SApCZUNLZGZGR1RZc0NnWUVBaitkL0JhZTd1MTZJK3pFM1JRdVRDTkFHMVpnRm1tWWI2SXNsV25QZTRBZDBld3dsCnVKUnhWWUN4Y3YrVmlGZ0VqSHEwNjRuZnh0VnVhcHNLRkwyN2ZKS2QrZnB4cGlkRkJVc0RRZFo3TzZqWUN6bzAKNXhVYmdNYjFaOXA5OW1YQ2VWZ0Y5SnMrUzJuWVYxU2ZUYVJUUk9lK0tKZ0VuN3cwWUtLb0d1MEpRbEVDZ1lCZApZdXJnYm5Ca1NoZmFCQjU0cllMa3JUOWM4UzM2M2tmeC9CWVdIVjRiQXY3VjVNMmpXUWc5SXhsczNsVmp4cEpYCk94QXA4SDhaVXVmT0kvT2M1ajdzS0t4eFBxUzBiNTFyN04zL2FsaURrNlpQeldNeUlmdVpOVWl5d1NnWWt5U20KMU1BRm5mdXBlL0tkVVZJamF5amNIcFhsNjNFcExRNFh2SzV3TU9iNXlR
S0JnR0kzSTAwSTlnbURzS1JrOFkxdQpId1l0dVdrNjFvWEhUTHorR3d6RUNCQ0VnNkZxMjZVeDZmVzBySlVwV3pOVURCNkRRRGxCTGx3S1M4Z1R3eGtGCkRkY3VrbzFHekdlQWYvazEwWktTZmFXNVcwVlloVGVjSDhyZXpReWxwUk5YT3ZNZkFwWUplcnhBZ09yK3hFajUKK2wwalU0MDBTMUx0cWhLVzZMK3kxRVd5Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tls +data: + gencert_head.sh: | + #!/bin/sh + ## Create tls.key + openssl genrsa -out /etc/ray/tls/tls.key 2048 + + ## Write CSR Config + cat > /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf < /etc/ray/tls/csr.conf < /etc/ray/tls/cert.conf <`_) or proxying from your local machine to the cluster (on `K8s `_). -.. tabbed:: AWS - - With the Ray cluster launcher, you can configure the security group - to allow inbound access by defining :ref:`cluster-configuration-security-group` - in your `cluster.yaml`. - - .. code-block:: yaml - - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal_security_group - - # Cloud-provider specific configuration. - provider: - type: aws - region: us-west-2 - security_group: - GroupName: ray_client_security_group - IpPermissions: - - FromPort: 10001 - ToPort: 10001 - IpProtocol: TCP - IpRanges: - # This will enable inbound access from ALL IPv4 addresses. - - CidrIp: 0.0.0.0/0 +.. tab-set:: + + .. tab-item:: AWS + + With the Ray cluster launcher, you can configure the security group + to allow inbound access by defining :ref:`cluster-configuration-security-group` + in your `cluster.yaml`. + + .. code-block:: yaml + + # An unique identifier for the head node and workers of this cluster. + cluster_name: minimal_security_group + + # Cloud-provider specific configuration. + provider: + type: aws + region: us-west-2 + security_group: + GroupName: ray_client_security_group + IpPermissions: + - FromPort: 10001 + ToPort: 10001 + IpProtocol: TCP + IpRanges: + # This will enable inbound access from ALL IPv4 addresses. 
+ - CidrIp: 0.0.0.0/0 Step 3: Run Ray code ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/cluster/running-applications/job-submission/sdk.rst b/doc/source/cluster/running-applications/job-submission/sdk.rst index 681275b5c597..c59e9241a690 100644 --- a/doc/source/cluster/running-applications/job-submission/sdk.rst +++ b/doc/source/cluster/running-applications/job-submission/sdk.rst @@ -223,4 +223,23 @@ To be precise, the environment variable ``CUDA_VISIBLE_DEVICES`` will not be set .. note:: - By default, 0 CPUs and 0 GPUs are reserved for the entrypoint script. \ No newline at end of file + By default, 0 CPUs and 0 GPUs are reserved for the entrypoint script. + + +Client Configuration +-------------------------------- + +Additional client connection options, such as custom HTTP headers and cookies, can be passed to the ``JobSubmissionClient`` class. +A full list of options can be found in the :ref:`API Reference `. + +TLS Verification +~~~~~~~~~ +By default, any HTTPS client connections will be verified using system certificates found by the underlying ``requests`` and ``aiohttp`` libraries. +The ``verify`` parameter can be set to override this behavior. For example: + +.. code-block:: python + + client = JobSubmissionClient("https://", verify="/path/to/cert.pem") + +will use the certificate found at ``/path/to/cert.pem`` to verify the job server's certificate. +Certificate verification can be disabled by setting the ``verify`` parameter to ``False``. \ No newline at end of file diff --git a/doc/source/cluster/running-applications/monitoring-and-observability.rst b/doc/source/cluster/running-applications/monitoring-and-observability.rst index 8d41eafa2ed6..c9dd3e39ee9a 100644 --- a/doc/source/cluster/running-applications/monitoring-and-observability.rst +++ b/doc/source/cluster/running-applications/monitoring-and-observability.rst @@ -21,32 +21,34 @@ including the running jobs, actors, workers, nodes, etc. 
By default, the :ref:`cluster launcher ` and :ref:`KubeRay operator ` will launch the dashboard, but will not publicly expose the port. -.. tabbed:: If using the VM cluster launcher +.. tab-set:: - You can securely port-forward local traffic to the dashboard via the ``ray - dashboard`` command. + .. tab-item:: If using the VM cluster launcher - .. code-block:: shell + You can securely port-forward local traffic to the dashboard via the ``ray + dashboard`` command. - $ ray dashboard [-p ] + .. code-block:: shell - The dashboard will now be visible at ``http://localhost:8265``. + $ ray dashboard [-p ] -.. tabbed:: If using Kubernetes + The dashboard will now be visible at ``http://localhost:8265``. - The KubeRay operator makes the dashboard available via a Service targeting - the Ray head pod, named ``-head-svc``. You can access the - dashboard from within the Kubernetes cluster at ``http://-head-svc:8265``. + .. tab-item:: If using Kubernetes - You can also view the dashboard from outside the Kubernetes cluster by - using port-forwarding: + The KubeRay operator makes the dashboard available via a Service targeting + the Ray head pod, named ``-head-svc``. You can access the + dashboard from within the Kubernetes cluster at ``http://-head-svc:8265``. - .. code-block:: shell + You can also view the dashboard from outside the Kubernetes cluster by + using port-forwarding: - $ kubectl port-forward service/raycluster-autoscaler-head-svc 8265:8265 + .. code-block:: shell - For more information about configuring network access to a Ray cluster on - Kubernetes, see the :ref:`networking notes `. + $ kubectl port-forward service/raycluster-autoscaler-head-svc 8265:8265 + + For more information about configuring network access to a Ray cluster on + Kubernetes, see the :ref:`networking notes `. Using Ray Cluster CLI tools @@ -63,29 +65,31 @@ These CLI commands can be run on any node in a Ray Cluster. 
Examples for executing these commands from a machine outside the Ray Cluster are provided below. -.. tabbed:: If using the VM cluster launcher +.. tab-set:: + + .. tab-item:: If using the VM cluster launcher - Execute a command on the cluster using ``ray exec``: + Execute a command on the cluster using ``ray exec``: - .. code-block:: shell + .. code-block:: shell - $ ray exec "ray status" + $ ray exec "ray status" -.. tabbed:: If using Kubernetes + .. tab-item:: If using Kubernetes - Execute a command on the cluster using ``kubectl exec`` and the configured - RayCluster name. We will use the Service targeting the Ray head pod to - execute a CLI command on the cluster. + Execute a command on the cluster using ``kubectl exec`` and the configured + RayCluster name. We will use the Service targeting the Ray head pod to + execute a CLI command on the cluster. - .. code-block:: shell + .. code-block:: shell - # First, find the name of the Ray head service. - $ kubectl get pod | grep -head - # NAME READY STATUS RESTARTS AGE - # -head-xxxxx 2/2 Running 0 XXs + # First, find the name of the Ray head service. + $ kubectl get pod | grep -head + # NAME READY STATUS RESTARTS AGE + # -head-xxxxx 2/2 Running 0 XXs - # Then, use the name of the Ray head service to run `ray status`. - $ kubectl exec -head-xxxxx -- ray status + # Then, use the name of the Ray head service to run `ray status`. + $ kubectl exec -head-xxxxx -- ray status .. _multi-node-metrics: @@ -120,7 +124,7 @@ The service discovery file is generated on the :ref:`head node `. +You can choose to use this config or modify your own to enable this behavior. The details of the config can be seen below and full documentation can be found `here `_. With this config, Prometheus will automatically update the addresses that it scrapes based on the contents of Ray's service discovery file. 
diff --git a/doc/source/cluster/vms/getting-started.rst b/doc/source/cluster/vms/getting-started.rst index 97e553f1f62b..b323c29bd92c 100644 --- a/doc/source/cluster/vms/getting-started.rst +++ b/doc/source/cluster/vms/getting-started.rst @@ -31,37 +31,41 @@ Setup Before we start, you will need to install some Python dependencies as follows: -.. tabbed:: AWS +.. tab-set:: - .. code-block:: shell + .. tab-item:: AWS - $ pip install -U "ray[default]" boto3 + .. code-block:: shell -.. tabbed:: Azure + $ pip install -U "ray[default]" boto3 - .. code-block:: shell + .. tab-item:: Azure - $ pip install -U "ray[default]" azure-cli azure-core + .. code-block:: shell -.. tabbed:: GCP + $ pip install -U "ray[default]" azure-cli azure-core - .. code-block:: shell + .. tab-item:: GCP - $ pip install -U "ray[default]" google-api-python-client + .. code-block:: shell + + $ pip install -U "ray[default]" google-api-python-client Next, if you're not set up to use your cloud provider from the command line, you'll have to configure your credentials: -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - Configure your credentials in ``~/.aws/credentials`` as described in `the AWS docs `_. + Configure your credentials in ``~/.aws/credentials`` as described in `the AWS docs `_. -.. tabbed:: Azure + .. tab-item:: Azure - Log in using ``az login``, then configure your credentials with ``az account set -s ``. + Log in using ``az login``, then configure your credentials with ``az account set -s ``. -.. tabbed:: GCP + .. tab-item:: GCP - Set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable as described in `the GCP docs `_. + Set the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable as described in `the GCP docs `_. Create a (basic) Python application ----------------------------------- @@ -154,45 +158,47 @@ To start a Ray Cluster, first we need to define the cluster configuration. The c A minimal sample cluster configuration file looks as follows: -.. tabbed:: AWS +.. 
tab-set:: + + .. tab-item:: AWS - .. literalinclude:: ../../../../python/ray/autoscaler/aws/example-minimal.yaml - :language: yaml + .. literalinclude:: ../../../../python/ray/autoscaler/aws/example-minimal.yaml + :language: yaml -.. tabbed:: Azure + .. tab-item:: Azure - .. code-block:: yaml + .. code-block:: yaml - # An unique identifier for the head node and workers of this cluster. - cluster_name: minimal + # An unique identifier for the head node and workers of this cluster. + cluster_name: minimal - # Cloud-provider specific configuration. - provider: - type: azure - location: westus2 - resource_group: ray-cluster + # Cloud-provider specific configuration. + provider: + type: azure + location: westus2 + resource_group: ray-cluster - # How Ray will authenticate with newly launched nodes. - auth: - ssh_user: ubuntu - # you must specify paths to matching private and public key pair files - # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair - ssh_private_key: ~/.ssh/id_rsa - # changes to this should match what is specified in file_mounts - ssh_public_key: ~/.ssh/id_rsa.pub + # How Ray will authenticate with newly launched nodes. + auth: + ssh_user: ubuntu + # you must specify paths to matching private and public key pair files + # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair + ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts + ssh_public_key: ~/.ssh/id_rsa.pub -.. tabbed:: GCP + .. tab-item:: GCP - .. code-block:: yaml + .. code-block:: yaml - # A unique identifier for the head node and workers of this cluster. - cluster_name: minimal + # A unique identifier for the head node and workers of this cluster. + cluster_name: minimal - # Cloud-provider specific configuration. - provider: - type: gcp - region: us-west1 + # Cloud-provider specific configuration. + provider: + type: gcp + region: us-west1 Save this configuration file as ``config.yaml``. 
You can specify a lot more details in the configuration file: instance types to use, minimum and maximum number of workers to start, autoscaling strategy, files to sync, and more. For a full reference on the available configuration properties, please refer to the :ref:`cluster YAML configuration options reference `. diff --git a/doc/source/cluster/vms/index.md b/doc/source/cluster/vms/index.md index 2ef1bcd2f898..b61a894ba311 100644 --- a/doc/source/cluster/vms/index.md +++ b/doc/source/cluster/vms/index.md @@ -8,7 +8,7 @@ for launching AWS and GCP clusters, and also has community-maintained integratio Each Ray cluster consists of a head node and a collection of worker nodes. Optional [autoscaling](vms-autoscaling) support allows the Ray cluster to be sized according to the requirements of your Ray workload, adding and removing worker nodes as needed. Ray supports -clusters composed of multiple heterogenous compute nodes (including GPU nodes). +clusters composed of multiple heterogeneous compute nodes (including GPU nodes). Concretely, you will learn how to: @@ -20,52 +20,67 @@ Concretely, you will learn how to: The Ray docs present all the information you need to start running Ray workloads on VMs. ```{eval-rst} -.. panels:: - :container: text-center - :column: col-lg-6 px-2 py-2 - :card: - - **Getting Started** - ^^^ - - Learn how to start a Ray cluster and deploy Ray applications in the cloud. - - +++ - .. link-button:: vm-cluster-quick-start - :type: ref - :text: Get Started with Ray on Cloud VMs - :classes: btn-outline-info btn-block - --- - **Examples** - ^^^ - - Try example Ray workloads in the Cloud - - +++ - .. link-button:: vm-cluster-examples - :type: ref - :text: Try example workloads - :classes: btn-outline-info btn-block - --- - **User Guides** - ^^^ - - Learn best practices for configuring cloud clusters - - +++ - .. 
link-button:: vm-cluster-guides - :type: ref - :text: Read the User Guides - :classes: btn-outline-info btn-block - --- - **API Reference** - ^^^ - - Find API references for cloud clusters - - +++ - .. link-button:: vm-cluster-api-references - :type: ref - :text: Check API references - :classes: btn-outline-info btn-block +.. grid:: 1 2 2 2 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + + **Getting Started** + ^^^ + + Learn how to start a Ray cluster and deploy Ray applications in the cloud. + + +++ + .. button-ref:: vm-cluster-quick-start + :color: primary + :outline: + :expand: + + Get Started with Ray on Cloud VMs + + .. grid-item-card:: + + **Examples** + ^^^ + + Try example Ray workloads in the Cloud + + +++ + .. button-ref:: vm-cluster-examples + :color: primary + :outline: + :expand: + + Try example workloads + + .. grid-item-card:: + + **User Guides** + ^^^ + + Learn best practices for configuring cloud clusters + + +++ + .. button-ref:: vm-cluster-guides + :color: primary + :outline: + :expand: + + Read the User Guides + + .. grid-item-card:: + + **API Reference** + ^^^ + + Find API references for cloud clusters + + +++ + .. button-ref:: vm-cluster-api-references + :color: primary + :outline: + :expand: + + Check API references ``` diff --git a/doc/source/cluster/vms/references/ray-cluster-configuration.rst b/doc/source/cluster/vms/references/ray-cluster-configuration.rst index b12bb7a76047..5e5a154b666e 100644 --- a/doc/source/cluster/vms/references/ray-cluster-configuration.rst +++ b/doc/source/cluster/vms/references/ray-cluster-configuration.rst @@ -72,76 +72,82 @@ Docker Auth ~~~~ -.. tabbed:: AWS +.. tab-set:: - .. parsed-literal:: + .. tab-item:: AWS - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str + .. parsed-literal:: -.. tabbed:: Azure + :ref:`ssh_user `: str + :ref:`ssh_private_key `: str - .. parsed-literal:: + .. 
tab-item:: Azure - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str - :ref:`ssh_public_key `: str + .. parsed-literal:: -.. tabbed:: GCP + :ref:`ssh_user `: str + :ref:`ssh_private_key `: str + :ref:`ssh_public_key `: str - .. parsed-literal:: + .. tab-item:: GCP - :ref:`ssh_user `: str - :ref:`ssh_private_key `: str + .. parsed-literal:: + + :ref:`ssh_user `: str + :ref:`ssh_private_key `: str .. _cluster-configuration-provider-type: Provider ~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - .. parsed-literal:: + .. parsed-literal:: - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`cache_stopped_nodes `: bool - :ref:`security_group `: - :ref:`Security Group ` + :ref:`type `: str + :ref:`region `: str + :ref:`availability_zone `: str + :ref:`cache_stopped_nodes `: bool + :ref:`security_group `: + :ref:`Security Group ` -.. tabbed:: Azure + .. tab-item:: Azure - .. parsed-literal:: + .. parsed-literal:: - :ref:`type `: str - :ref:`location `: str - :ref:`resource_group `: str - :ref:`subscription_id `: str - :ref:`cache_stopped_nodes `: bool + :ref:`type `: str + :ref:`location `: str + :ref:`resource_group `: str + :ref:`subscription_id `: str + :ref:`cache_stopped_nodes `: bool -.. tabbed:: GCP + .. tab-item:: GCP - .. parsed-literal:: + .. parsed-literal:: - :ref:`type `: str - :ref:`region `: str - :ref:`availability_zone `: str - :ref:`project_id `: str - :ref:`cache_stopped_nodes `: bool + :ref:`type `: str + :ref:`region `: str + :ref:`availability_zone `: str + :ref:`project_id `: str + :ref:`cache_stopped_nodes `: bool .. _cluster-configuration-security-group-type: Security Group ~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - .. parsed-literal:: + .. parsed-literal:: - :ref:`GroupName `: str - :ref:`IpPermissions `: - - `IpPermission `_ + :ref:`GroupName `: str + :ref:`IpPermissions `: + - `IpPermission `_ .. 
_cluster-configuration-node-types-type: @@ -181,17 +187,19 @@ Cloud-specific configuration for nodes of a given node type. Modifying the ``node_config`` and updating with :ref:`ray up ` will cause the autoscaler to scale down all existing nodes of the node type; nodes with the newly applied ``node_config`` will then be created according to cluster configuration and Ray resource demands. -.. tabbed:: AWS +.. tab-set:: - A YAML object which conforms to the EC2 ``create_instances`` API in `the AWS docs `_. + .. tab-item:: AWS -.. tabbed:: Azure + A YAML object which conforms to the EC2 ``create_instances`` API in `the AWS docs `_. - A YAML object as defined in `the deployment template `_ whose resources are defined in `the Azure docs `_. + .. tab-item:: Azure -.. tabbed:: GCP + A YAML object as defined in `the deployment template `_ whose resources are defined in `the Azure docs `_. - A YAML object as defined in `the GCP docs `_. + .. tab-item:: GCP + + A YAML object as defined in `the GCP docs `_. .. _cluster-configuration-node-docker-type: @@ -347,26 +355,28 @@ Each node type is identified by a user-specified key. * **Type:** :ref:`Node types ` * **Default:** -.. tabbed:: AWS - - .. code-block:: yaml - - available_node_types: - ray.head.default: - node_config: - InstanceType: m5.large - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 140 - resources: {"CPU": 2} - ray.worker.default: - node_config: - InstanceType: m5.large - InstanceMarketOptions: - MarketType: spot - resources: {"CPU": 2} - min_workers: 0 +.. tab-set:: + + .. tab-item:: AWS + + .. code-block:: yaml + + available_node_types: + ray.head.default: + node_config: + InstanceType: m5.large + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 140 + resources: {"CPU": 2} + ray.worker.default: + node_config: + InstanceType: m5.large + InstanceMarketOptions: + MarketType: spot + resources: {"CPU": 2} + min_workers: 0 .. 
_cluster-configuration-head-node-type: @@ -462,14 +472,16 @@ A list of commands to run to set up nodes. These commands will always run on the * **Type:** List of String * **Default:** -.. tabbed:: AWS +.. tab-set:: - .. code-block:: yaml + .. tab-item:: AWS - # Default setup_commands: - setup_commands: - - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl + .. code-block:: yaml + + # Default setup_commands: + setup_commands: + - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - Setup commands should ideally be *idempotent* (i.e., can be run multiple times without changing the result); this allows Ray to safely update nodes after they have been created. You can usually make commands idempotent with small modifications, e.g. ``git clone foo`` can be rewritten as ``test -e foo || git clone foo`` which checks if the repo is already cloned first. @@ -522,13 +534,15 @@ Commands to start ray on the head node. You don't need to change this. * **Type:** List of String * **Default:** -.. tabbed:: AWS +.. tab-set:: - .. code-block:: yaml + .. tab-item:: AWS - head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + .. code-block:: yaml + + head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml .. _cluster-configuration-worker-start-ray-commands: @@ -542,13 +556,15 @@ Command to start ray on worker nodes. You don't need to change this. * **Type:** List of String * **Default:** -.. tabbed:: AWS +.. tab-set:: - .. code-block:: yaml + .. 
tab-item:: AWS - worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + .. code-block:: yaml + + worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 .. _cluster-configuration-image: @@ -691,225 +707,243 @@ The user that Ray will authenticate with when launching new nodes. ``auth.ssh_private_key`` ~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. + The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - * **Required:** No - * **Importance:** Low - * **Type:** String + * **Required:** No + * **Importance:** Low + * **Type:** String -.. tabbed:: Azure + .. tab-item:: Azure - The path to an existing private key for Ray to use. + The path to an existing private key for Ray to use. - * **Required:** Yes - * **Importance:** High - * **Type:** String + * **Required:** Yes + * **Importance:** High + * **Type:** String - You may use ``ssh-keygen -t rsa -b 4096`` to generate a new ssh keypair. + You may use ``ssh-keygen -t rsa -b 4096`` to generate a new ssh keypair. -.. tabbed:: GCP + .. tab-item:: GCP - The path to an existing private key for Ray to use. If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. + The path to an existing private key for Ray to use. 
If not configured, Ray will create a new private keypair (default behavior). If configured, the key must be added to the project-wide metadata and ``KeyName`` has to be defined in the :ref:`node configuration `. - * **Required:** No - * **Importance:** Low - * **Type:** String + * **Required:** No + * **Importance:** Low + * **Type:** String .. _cluster-configuration-ssh-public-key: ``auth.ssh_public_key`` ~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - Not available. + .. tab-item:: AWS -.. tabbed:: Azure + Not available. - The path to an existing public key for Ray to use. + .. tab-item:: Azure - * **Required:** Yes - * **Importance:** High - * **Type:** String + The path to an existing public key for Ray to use. -.. tabbed:: GCP + * **Required:** Yes + * **Importance:** High + * **Type:** String - Not available. + .. tab-item:: GCP + + Not available. .. _cluster-configuration-type: ``provider.type`` ~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - The cloud service provider. For AWS, this must be set to ``aws``. + The cloud service provider. For AWS, this must be set to ``aws``. - * **Required:** Yes - * **Importance:** High - * **Type:** String + * **Required:** Yes + * **Importance:** High + * **Type:** String -.. tabbed:: Azure + .. tab-item:: Azure - The cloud service provider. For Azure, this must be set to ``azure``. + The cloud service provider. For Azure, this must be set to ``azure``. - * **Required:** Yes - * **Importance:** High - * **Type:** String + * **Required:** Yes + * **Importance:** High + * **Type:** String -.. tabbed:: GCP + .. tab-item:: GCP - The cloud service provider. For GCP, this must be set to ``gcp``. + The cloud service provider. For GCP, this must be set to ``gcp``. - * **Required:** Yes - * **Importance:** High - * **Type:** String + * **Required:** Yes + * **Importance:** High + * **Type:** String .. _cluster-configuration-region: ``provider.region`` ~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. 
tab-set:: - The region to use for deployment of the Ray cluster. + .. tab-item:: AWS - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west-2 + The region to use for deployment of the Ray cluster. -.. tabbed:: Azure + * **Required:** Yes + * **Importance:** High + * **Type:** String + * **Default:** us-west-2 - Not available. + .. tab-item:: Azure -.. tabbed:: GCP + Not available. - The region to use for deployment of the Ray cluster. + .. tab-item:: GCP - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** us-west1 + The region to use for deployment of the Ray cluster. + + * **Required:** Yes + * **Importance:** High + * **Type:** String + * **Default:** us-west1 .. _cluster-configuration-availability-zone: ``provider.availability_zone`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. - Nodes will be launched in the first listed availability zone and will be tried in the following availability - zones if launching fails. + A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. + Nodes will be launched in the first listed availability zone and will be tried in the following availability + zones if launching fails. - * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west-2a,us-west-2b + * **Required:** No + * **Importance:** Low + * **Type:** String + * **Default:** us-west-2a,us-west-2b -.. tabbed:: Azure + .. tab-item:: Azure - Not available. + Not available. -.. tabbed:: GCP + .. tab-item:: GCP - A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. + A string specifying a comma-separated list of availability zone(s) that nodes may be launched in. 
- * **Required:** No - * **Importance:** Low - * **Type:** String - * **Default:** us-west1-a + * **Required:** No + * **Importance:** Low + * **Type:** String + * **Default:** us-west1-a .. _cluster-configuration-location: ``provider.location`` ~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - Not available. + .. tab-item:: AWS -.. tabbed:: Azure + Not available. - The location to use for deployment of the Ray cluster. + .. tab-item:: Azure - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** westus2 + The location to use for deployment of the Ray cluster. -.. tabbed:: GCP + * **Required:** Yes + * **Importance:** High + * **Type:** String + * **Default:** westus2 - Not available. + .. tab-item:: GCP + + Not available. .. _cluster-configuration-resource-group: ``provider.resource_group`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - Not available. + Not available. -.. tabbed:: Azure + .. tab-item:: Azure - The resource group to use for deployment of the Ray cluster. + The resource group to use for deployment of the Ray cluster. - * **Required:** Yes - * **Importance:** High - * **Type:** String - * **Default:** ray-cluster + * **Required:** Yes + * **Importance:** High + * **Type:** String + * **Default:** ray-cluster -.. tabbed:: GCP + .. tab-item:: GCP - Not available. + Not available. .. _cluster-configuration-subscription-id: ``provider.subscription_id`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - Not available. + .. tab-item:: AWS -.. tabbed:: Azure + Not available. - The subscription ID to use for deployment of the Ray cluster. If not specified, Ray will use the default from the Azure CLI. + .. tab-item:: Azure - * **Required:** No - * **Importance:** High - * **Type:** String - * **Default:** ``""`` + The subscription ID to use for deployment of the Ray cluster. If not specified, Ray will use the default from the Azure CLI. -.. 
tabbed:: GCP + * **Required:** No + * **Importance:** High + * **Type:** String + * **Default:** ``""`` - Not available. + .. tab-item:: GCP + + Not available. .. _cluster-configuration-project-id: ``provider.project_id`` ~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - Not available. + Not available. -.. tabbed:: Azure + .. tab-item:: Azure - Not available. + Not available. -.. tabbed:: GCP + .. tab-item:: GCP - The globally unique project ID to use for deployment of the Ray cluster. + The globally unique project ID to use for deployment of the Ray cluster. - * **Required:** Yes - * **Importance:** Low - * **Type:** String - * **Default:** ``null`` + * **Required:** Yes + * **Importance:** Low + * **Type:** String + * **Default:** ``null`` .. _cluster-configuration-cache-stopped-nodes: @@ -929,21 +963,23 @@ If enabled, nodes will be *stopped* when the cluster scales down. If disabled, n ``provider.security_group`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - A security group that can be used to specify custom inbound rules. + .. tab-item:: AWS - * **Required:** No - * **Importance:** Medium - * **Type:** :ref:`Security Group ` + A security group that can be used to specify custom inbound rules. -.. tabbed:: Azure + * **Required:** No + * **Importance:** Medium + * **Type:** :ref:`Security Group ` - Not available. + .. tab-item:: Azure -.. tabbed:: GCP + Not available. - Not available. + .. tab-item:: GCP + + Not available. .. _cluster-configuration-group-name: @@ -1041,29 +1077,31 @@ A list of commands to run to set up worker nodes of this type. These commands wi ``available_node_types..node_type.resources.CPU`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - The number of CPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. 
+ The number of CPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. - * **Required:** Yes (except for AWS/K8s) - * **Importance:** High - * **Type:** Integer + * **Required:** Yes (except for AWS/K8s) + * **Importance:** High + * **Type:** Integer -.. tabbed:: Azure + .. tab-item:: Azure - The number of CPUs made available by this node. + The number of CPUs made available by this node. - * **Required:** Yes - * **Importance:** High - * **Type:** Integer + * **Required:** Yes + * **Importance:** High + * **Type:** Integer -.. tabbed:: GCP + .. tab-item:: GCP - The number of CPUs made available by this node. + The number of CPUs made available by this node. - * **Required:** No - * **Importance:** High - * **Type:** Integer + * **Required:** No + * **Importance:** High + * **Type:** Integer .. _cluster-configuration-gpu: @@ -1071,87 +1109,97 @@ A list of commands to run to set up worker nodes of this type. These commands wi ``available_node_types..node_type.resources.GPU`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - The number of GPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. + .. tab-item:: AWS - * **Required:** No - * **Importance:** Low - * **Type:** Integer + The number of GPUs made available by this node. If not configured, Autoscaler can automatically detect them only for AWS/Kubernetes cloud providers. -.. tabbed:: Azure + * **Required:** No + * **Importance:** Low + * **Type:** Integer - The number of GPUs made available by this node. + .. tab-item:: Azure - * **Required:** No - * **Importance:** High - * **Type:** Integer + The number of GPUs made available by this node. -.. tabbed:: GCP + * **Required:** No + * **Importance:** High + * **Type:** Integer - The number of GPUs made available by this node. + .. 
tab-item:: GCP - * **Required:** No - * **Importance:** High - * **Type:** Integer + The number of GPUs made available by this node. + + * **Required:** No + * **Importance:** High + * **Type:** Integer .. _cluster-configuration-memory: ``available_node_types..node_type.resources.memory`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS - The memory in bytes allocated for python worker heap memory on the node. If not configured, Autoscaler will automatically detect the amount of RAM on the node for AWS/Kubernetes and allocate 70% of it for the heap. +.. tab-set:: + + .. tab-item:: AWS + + The memory in bytes allocated for python worker heap memory on the node. + If not configured, Autoscaler will automatically detect the amount of RAM on + the node for AWS/Kubernetes and allocate 70% of it for the heap. + + * **Required:** No + * **Importance:** Low + * **Type:** Integer - * **Required:** No - * **Importance:** Low - * **Type:** Integer + .. tab-item:: Azure -.. tabbed:: Azure + The memory in bytes allocated for python worker heap memory on the node. - The memory in bytes allocated for python worker heap memory on the node. + * **Required:** No + * **Importance:** High + * **Type:** Integer - * **Required:** No - * **Importance:** High - * **Type:** Integer + .. tab-item:: GCP -.. tabbed:: GCP + The memory in bytes allocated for python worker heap memory on the node. - The memory in bytes allocated for python worker heap memory on the node. + * **Required:** No + * **Importance:** High + * **Type:** Integer - * **Required:** No - * **Importance:** High - * **Type:** Integer - .. _cluster-configuration-object-store-memory: +.. _cluster-configuration-object-store-memory: ``available_node_types..node_type.resources.object-store-memory`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - The memory in bytes allocated for the object store on the node. 
If not configured, Autoscaler will automatically detect the amount of RAM on the node for AWS/Kubernetes and allocate 30% of it for the object store. + .. tab-item:: AWS - * **Required:** No - * **Importance:** Low - * **Type:** Integer + The memory in bytes allocated for the object store on the node. If not configured, Autoscaler will automatically detect the amount of RAM on the node for AWS/Kubernetes and allocate 30% of it for the object store. -.. tabbed:: Azure + * **Required:** No + * **Importance:** Low + * **Type:** Integer - The memory in bytes allocated for the object store on the node. + .. tab-item:: Azure - * **Required:** No - * **Importance:** High - * **Type:** Integer + The memory in bytes allocated for the object store on the node. -.. tabbed:: GCP + * **Required:** No + * **Importance:** High + * **Type:** Integer - The memory in bytes allocated for the object store on the node. + .. tab-item:: GCP - * **Required:** No - * **Importance:** High - * **Type:** Integer + The memory in bytes allocated for the object store on the node. + + * **Required:** No + * **Importance:** High + * **Type:** Integer .. _cluster-configuration-node-docker: @@ -1171,38 +1219,42 @@ Examples Minimal configuration ~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: + + .. tab-item:: AWS - .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-minimal.yaml - :language: yaml + .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-minimal.yaml + :language: yaml -.. tabbed:: Azure + .. tab-item:: Azure - .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-minimal.yaml - :language: yaml + .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-minimal.yaml + :language: yaml -.. tabbed:: GCP + .. tab-item:: GCP - .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-minimal.yaml - :language: yaml + .. 
literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-minimal.yaml + :language: yaml Full configuration ~~~~~~~~~~~~~~~~~~ -.. tabbed:: AWS +.. tab-set:: - .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-full.yaml - :language: yaml + .. tab-item:: AWS -.. tabbed:: Azure + .. literalinclude:: ../../../../../python/ray/autoscaler/aws/example-full.yaml + :language: yaml - .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-full.yaml - :language: yaml + .. tab-item:: Azure -.. tabbed:: GCP + .. literalinclude:: ../../../../../python/ray/autoscaler/azure/example-full.yaml + :language: yaml - .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-full.yaml - :language: yaml + .. tab-item:: GCP + + .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/example-full.yaml + :language: yaml TPU Configuration ~~~~~~~~~~~~~~~~~ @@ -1211,7 +1263,9 @@ It is possible to use `TPU VMs `_. -.. tabbed:: GCP +.. tab-set:: + + .. tab-item:: GCP - .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/tpu.yaml - :language: yaml + .. literalinclude:: ../../../../../python/ray/autoscaler/gcp/tpu.yaml + :language: yaml diff --git a/doc/source/conf.py b/doc/source/conf.py index 28c50db32e28..7aa0c31db9f9 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -38,7 +38,6 @@ extensions = [ "callouts", # custom extension from _ext folder - "sphinx_panels", "sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.napoleon", @@ -46,7 +45,6 @@ "sphinx-jsonschema", "sphinxemoji.sphinxemoji", "sphinx_copybutton", - "sphinxcontrib.yt", "versionwarning.extension", "sphinx_sitemap", "myst_nb", @@ -58,6 +56,7 @@ "sphinxcontrib.redoc", "sphinx_tabs.tabs", "sphinx_remove_toctrees", + "sphinx_design", ] # Prune deep toc-trees on demand for smaller html and faster builds. 
@@ -236,6 +235,7 @@ "https://www.datanami.com/2018/02/01/rays-new-library-targets-high-speed-reinforcement-learning/", # 403 Client Error: Forbidden for url. # They ratelimit bots. + "https://www.researchgate.net/publication/222573328_Stochastic_Gradient_Boosting", "https://www.datanami.com/2019/11/05/why-every-python-developer-will-love-ray/", "https://dev.mysql.com/doc/connector-python/en/", # Returning 522s intermittently. @@ -432,3 +432,8 @@ def setup(app): ] redoc_uri = "https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js" + +autosummary_filename_map = { + "ray.serve.deployment": "ray.serve.deployment_decorator", + "ray.serve.Deployment": "ray.serve.Deployment", +} diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py index 39636421e51a..6f9e057fd38c 100644 --- a/doc/source/custom_directives.py +++ b/doc/source/custom_directives.py @@ -286,16 +286,15 @@ def build_gallery(app): source = yaml.safe_load((Path(app.srcdir) / gallery).read_text()) meta = source["meta"] - is_titled = True if meta.get("section-titles") else False - meta.pop("section-titles") + grid = meta.pop("grid") projects = source["projects"] - buttons = source["buttons"] + classes = source["classes"] for item in projects: - ref = ":type: url" + ref = "button-link" website = item["website"] if "://" not in website: # if it has no http/s protocol, it's a "ref" - ref = ref.replace("url", "ref") + ref = ref.replace("link", "ref") if not item.get("image"): item["image"] = "https://docs.ray.io/_images/ray_logo.png" @@ -308,40 +307,37 @@ def build_gallery(app): gh_stars = ( f".. image:: https://img.shields.io/github/" f"stars/{org}/{repo}?style=social)]\n" - f"\t\t:target: {item['repo']}" + f"\t\t\t:target: {item['repo']}" ) except Exception: pass item = f""" - --- + .. grid-item-card:: :img-top: {item["image"]} + :class-img-top: {classes["class-img-top"]} {gh_stars} {item["description"]} +++ - .. 
link-button:: {item["website"]} - {ref} - :text: {item["name"]} - :classes: {buttons["classes"]} + .. {ref}:: {item["website"]} + :color: primary + :outline: + :expand: + + {item["name"]} """ + panel_items.append(item) - panel_header = ".. panels::\n" + panel_header = f".. grid:: {grid}\n" for k, v in meta.items(): - panel_header += f"\t:{k}: {v}\n" - - if is_titled: - panels = "" - for item, panel in zip(projects, panel_items): - title = item["section_title"] - underline_title = "-" * len(title) - panels += f"{title}\n{underline_title}\n\n{panel_header}{panel}\n\n" - else: - panel_items = "\n".join(panel_items) - panels = panel_header + panel_items + panel_header += f" :{k}: {v}\n" + + panel_items = "\n".join(panel_items) + panels = panel_header + panel_items gallery_out = gallery.replace(".yml", ".txt") (Path(app.srcdir) / gallery_out).write_text(panels) diff --git a/doc/source/data/api/api.rst b/doc/source/data/api/api.rst index 3d0c571222e4..a5ce1410d1d9 100644 --- a/doc/source/data/api/api.rst +++ b/doc/source/data/api/api.rst @@ -1,6 +1,6 @@ .. _data-api: -Ray Datasets API +Ray Data API ================ .. toctree:: @@ -8,11 +8,10 @@ Ray Datasets API input_output.rst dataset.rst - dataset_iterator.rst - dataset_pipeline.rst + data_iterator.rst execution_options.rst - grouped_dataset.rst - dataset_context.rst + grouped_data.rst + data_context.rst data_representations.rst random_access_dataset.rst utility.rst diff --git a/doc/source/data/api/dataset_context.rst b/doc/source/data/api/data_context.rst similarity index 90% rename from doc/source/data/api/dataset_context.rst rename to doc/source/data/api/data_context.rst index 8d1ae2c112e4..2a006b4a2619 100644 --- a/doc/source/data/api/dataset_context.rst +++ b/doc/source/data/api/data_context.rst @@ -1,4 +1,4 @@ -.. _dataset-context-api: +.. 
_data-context-api: DataContext API =============== diff --git a/doc/source/data/api/dataset_iterator.rst b/doc/source/data/api/data_iterator.rst similarity index 100% rename from doc/source/data/api/dataset_iterator.rst rename to doc/source/data/api/data_iterator.rst diff --git a/doc/source/data/api/data_representations.rst b/doc/source/data/api/data_representations.rst index 6d731af919bf..c6fa60beb563 100644 --- a/doc/source/data/api/data_representations.rst +++ b/doc/source/data/api/data_representations.rst @@ -1,7 +1,7 @@ .. _data-representations: -Data Representations -==================== +Data Representations (internal) +=============================== .. currentmodule:: ray.data diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index af6b0dbcebe8..27755db4c03a 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -1,7 +1,7 @@ .. _dataset-api: Dataset API -=========== +============== .. currentmodule:: ray.data @@ -41,7 +41,7 @@ Sorting, Shuffling, Repartitioning Dataset.repartition Splitting and Merging Datasets ------------------------------- +--------------------------------- .. autosummary:: :toctree: doc/ @@ -77,8 +77,8 @@ Converting to Pipeline Dataset.repeat Dataset.window -Consuming Datasets ------------------- +Consuming Data +--------------------- .. autosummary:: :toctree: doc/ @@ -126,6 +126,7 @@ Inspecting Metadata :toctree: doc/ Dataset.count + Dataset.columns Dataset.schema Dataset.default_batch_format Dataset.num_blocks @@ -138,9 +139,10 @@ Execution --------- .. autosummary:: - :toctree: doc/ + :toctree: doc/ - Dataset.materialize + Dataset.materialize + ActorPoolStrategy Serialization ------------- diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst deleted file mode 100644 index 70919f562253..000000000000 --- a/doc/source/data/api/dataset_pipeline.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. 
_dataset-pipeline-api: - -DatasetPipeline API -=================== - -.. currentmodule:: ray.data - -Constructor ------------ - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline - -Basic Transformations ---------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.map - DatasetPipeline.map_batches - DatasetPipeline.flat_map - DatasetPipeline.foreach_window - DatasetPipeline.filter - DatasetPipeline.add_column - DatasetPipeline.drop_columns - DatasetPipeline.select_columns - -Sorting, Shuffling, Repartitioning ----------------------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.sort_each_window - DatasetPipeline.random_shuffle_each_window - DatasetPipeline.randomize_block_order_each_window - DatasetPipeline.repartition_each_window - -Splitting DatasetPipelines --------------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.split - DatasetPipeline.split_at_indices - -Creating DatasetPipelines -------------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.repeat - DatasetPipeline.rewindow - DatasetPipeline.from_iterable - -Consuming DatasetPipelines --------------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.show - DatasetPipeline.show_windows - DatasetPipeline.take - DatasetPipeline.take_all - DatasetPipeline.iterator - DatasetPipeline.iter_rows - DatasetPipeline.iter_batches - DatasetPipeline.iter_torch_batches - DatasetPipeline.iter_tf_batches - -I/O and Conversion ------------------- - -.. autosummary:: - :toctree: doc/ - - DatasetPipeline.write_json - DatasetPipeline.write_csv - DatasetPipeline.write_parquet - DatasetPipeline.write_datasource - DatasetPipeline.to_tf - DatasetPipeline.to_torch - -Inspecting Metadata -------------------- - -.. 
autosummary:: - :toctree: doc/ - - DatasetPipeline.schema - DatasetPipeline.count - DatasetPipeline.stats - DatasetPipeline.sum diff --git a/doc/source/data/api/from_other_data_libs.rst b/doc/source/data/api/from_other_data_libs.rst index 5fb1d540074b..f3fe2e448b05 100644 --- a/doc/source/data/api/from_other_data_libs.rst +++ b/doc/source/data/api/from_other_data_libs.rst @@ -3,16 +3,16 @@ API Guide for Users from Other Data Libraries ============================================= -Ray Datasets is a data loading and preprocessing library for ML. It shares certain +Ray Data is a data loading and preprocessing library for ML. It shares certain similarities with other ETL data processing libraries, but also has its own focus. In this API guide, we will provide API mappings for users who come from those data -libraries, so you can quickly map what you may already know to Ray Datasets APIs. +libraries, so you can quickly map what you may already know to Ray Data APIs. .. note:: - This is meant to map APIs that perform comparable but not necessarily identical operations. Please check the API reference for exact semantics and usage. - - This list may not be exhaustive: Ray Datasets is not a traditional ETL data processing library, so not all data processing APIs can map to Datasets. + - This list may not be exhaustive: Ray Data is not a traditional ETL data processing library, so not all data processing APIs can map to Datasets. In addition, we try to focus on common APIs or APIs that are less obvious to see a connection. .. _api-guide-for-pandas-users: @@ -20,11 +20,11 @@ libraries, so you can quickly map what you may already know to Ray Datasets APIs For Pandas Users ---------------- -.. list-table:: Pandas DataFrame vs. Ray Datasets APIs +.. list-table:: Pandas DataFrame vs. 
Ray Data APIs :header-rows: 1 * - Pandas DataFrame API - - Ray Datasets API + - Ray Data API * - df.head() - :meth:`ds.show() `, :meth:`ds.take() `, or :meth:`ds.take_batch() ` * - df.dtypes @@ -42,7 +42,7 @@ For Pandas Users * - df.groupby() - :meth:`ds.groupby() ` * - df.groupby().apply() - - :meth:`ds.groupby().map_groups() ` + - :meth:`ds.groupby().map_groups() ` * - df.sample() - :meth:`ds.random_sample() ` * - df.sort_values() @@ -67,11 +67,11 @@ For Pandas Users For PyArrow Users ----------------- -.. list-table:: PyArrow Table vs. Ray Datasets APIs +.. list-table:: PyArrow Table vs. Ray Data APIs :header-rows: 1 * - PyArrow Table API - - Ray Datasets API + - Ray Data API * - pa.Table.schema - :meth:`ds.schema() ` * - pa.Table.num_rows diff --git a/doc/source/data/api/grouped_dataset.rst b/doc/source/data/api/grouped_data.rst similarity index 65% rename from doc/source/data/api/grouped_dataset.rst rename to doc/source/data/api/grouped_data.rst index afcfb498d7f6..fce6a8d9705e 100644 --- a/doc/source/data/api/grouped_dataset.rst +++ b/doc/source/data/api/grouped_data.rst @@ -13,7 +13,7 @@ Constructor .. autosummary:: :toctree: doc/ - grouped_dataset.GroupedData + grouped_data.GroupedData Computations / Descriptive Stats -------------------------------- @@ -21,12 +21,12 @@ Computations / Descriptive Stats .. autosummary:: :toctree: doc/ - grouped_dataset.GroupedData.count - grouped_dataset.GroupedData.sum - grouped_dataset.GroupedData.min - grouped_dataset.GroupedData.max - grouped_dataset.GroupedData.mean - grouped_dataset.GroupedData.std + grouped_data.GroupedData.count + grouped_data.GroupedData.sum + grouped_data.GroupedData.min + grouped_data.GroupedData.max + grouped_data.GroupedData.mean + grouped_data.GroupedData.std Function Application -------------------- @@ -34,8 +34,8 @@ Function Application .. 
autosummary:: :toctree: doc/ - grouped_dataset.GroupedData.aggregate - grouped_dataset.GroupedData.map_groups + grouped_data.GroupedData.aggregate + grouped_data.GroupedData.map_groups Aggregate Function ------------------ diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index 281d486911bb..e5cef0988e8a 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -12,7 +12,6 @@ Synthetic Data :toctree: doc/ range - range_table range_tensor Python Objects diff --git a/doc/source/data/api/random_access_dataset.rst b/doc/source/data/api/random_access_dataset.rst index e3a171af18d2..6bfbdba1585c 100644 --- a/doc/source/data/api/random_access_dataset.rst +++ b/doc/source/data/api/random_access_dataset.rst @@ -1,7 +1,7 @@ .. _random-access-dataset-api: -(Experimental) RandomAccessDataset API -====================================== +RandomAccessDataset (experimental) +================================== .. currentmodule:: ray.data diff --git a/doc/source/data/batch_inference.rst b/doc/source/data/batch_inference.rst new file mode 100644 index 000000000000..cc15baf7fcde --- /dev/null +++ b/doc/source/data/batch_inference.rst @@ -0,0 +1,815 @@ +.. _batch_inference_home: + +Running Batch Inference +======================= + +.. note:: + + In this tutorial you'll learn what batch inference is, why you might want to use + Ray for it, and how to use Ray Data effectively for this task. + If you are familiar with the basics of inference tasks, jump straight to + code in the :ref:`quickstart section `, our detailed + :ref:`walk-through`, + or our :ref:`in-depth guide for PyTorch models`. + +Batch inference refers to generating model predictions on a set of input data. +The model can range from a simple Python function to a complex neural network. +In batch inference, also known as offline inference, your model is run on a large +batch of data on demand. 
+This is in contrast to online inference, where the model is run immediately on a +data point when it becomes available. + +Here's a simple schematic of batch inference for the computer vision task classifying +images as cats or dogs, by "mapping" batches of input data to predictions +via ML model inference: + +.. figure:: images/batch_inference.png + + Evaluating a batch of input data with a model to get predictions. + +Batch inference is a foundational workload for many AI companies, especially since +more and more pre-trained models become available. +And while batch inference looks simple on the surface, it can be challenging to do right in production. +For instance, your data batches can be excessively large, too slow to process sequentially, +or might need custom preprocessing before being fed into your models. +To run inference workloads effectively at scale, you need to: + +- manage your compute infrastructure and cloud clusters +- parallelize data processing and utilize all your cluster resources (CPUs and GPUs) +- efficiently transfer data between cloud storage, CPUs for preprocessing, and GPUs for model inference + +Here's a realistic view of batch inference for modern AI applications: + +.. figure:: images/batch_inference_overview.png + + Evaluating a batch of input data with a model to get predictions. + +Why use Ray Data for batch inference? +------------------------------------- + +There are reasons to use Ray for batch inference, even if your current +use case does not require scaling yet: + +1. **Faster and Cheaper for modern Deep Learning Applications**: + Ray is built for + complex workloads and supports loading and preprocessing data with CPUs and model inference on GPUs. +2. **Cloud, framework, and data format agnostic**: + Ray Data works on any cloud provider or + any ML framework (like PyTorch and TensorFlow) and does not require a particular file format. +3. 
**Out of the box scaling**: + The same code that works on one machine also runs on a + large cluster without any changes. +4. **Python first**: + You can express your inference job directly in Python instead of + YAML files or other formats. + +.. _batch_inference_quickstart: + +Quick Start +----------- + +If you're impatient and want to see a copy-paste example right away, +here are a few simple examples. +Just pick one of the frameworks you like and run the code in your terminal. +If you want a more detailed rundown of the same examples, skip ahead to the +:ref:`following batch inference walk-through with Ray`. + + +.. tabs:: + + .. group-tab:: HuggingFace + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_super_quick_start__ + :end-before: __hf_super_quick_end__ + + .. group-tab:: PyTorch + + .. literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_super_quick_start__ + :end-before: __pt_super_quick_end__ + + .. group-tab:: TensorFlow + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_super_quick_start__ + :end-before: __tf_super_quick_end__ + + +.. _batch_inference_walk_through: + +Walk-through: Batch Inference with Ray +-------------------------------------- + +Running batch inference is conceptually easy and requires three steps: + +1. Load your data and apply any preprocessing you need. +2. Define your model and define a transformation that applies your model to your data. +3. Run the transformation on your data. + + +Let's take a look at a simple example of this process without using Ray. +In each example we load ``batches`` of data, load a ``model``, define a ``transform`` +function and apply the model to the data to get ``results``. + +.. tabs:: + + .. group-tab:: HuggingFace + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_no_ray_start__ + :end-before: __hf_no_ray_end__ + + .. 
group-tab:: PyTorch + + .. literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_no_ray_start__ + :end-before: __pt_no_ray_end__ + + .. group-tab:: TensorFlow + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_no_ray_start__ + :end-before: __tf_no_ray_end__ + +.. note:: + + As a Python user, this should all look familiar to you. + The only part that you might be wondering about is that we're using + ``Dict[str, np.ndarray]`` as input type to our ``transform`` functions. + We do this to ease the transition to Ray, given that Ray Data uses + ``Dict[str, np.ndarray]`` as the default format for its batches. + + +If you can follow the above examples conceptually, you should have no trouble scaling your batch +inference workload to a compute cluster with Ray Data. +If you're using Ray, the three steps for running batch inference read as follows: + +1. Load a Ray Data dataset and apply any preprocessing you need. This will distribute your data + across the cluster. +2. Define your model in a class and define a transformation that applies your model to + your data batches (of format ``Dict[str, np.ndarray]`` by default). +3. Run inference on your data by using the :meth:`ds.map_batches() ` + method from Ray Data. In this step you also define how your batch processing job + gets distributed across your cluster. + +.. note:: + + All advanced use cases ultimately boil down to extensions of the above three steps, + like loading and storing data from cloud storage, using complex preprocessing functions, + demanding model setups, additional postprocessing, or other customizations. + We'll cover these advanced use cases in the next sections. + +Let's scale out the above examples to a Ray cluster. +To start, install Ray with the data processing library, Ray Data: + +.. code-block:: bash + + pip install ray[data] + + +1. 
Loading and preprocessing data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For this quick start guide we use very small, in-memory datasets by +leveraging common Python libraries like NumPy and Pandas. + +In fact, we're using the exact same datasets as in the previous section, but load +them into Ray Data. +The result of this step is a Dataset ``ds`` that we can use to run inference on. + + +.. tabs:: + + .. group-tab:: HuggingFace + + Create a Pandas DataFrame with text data and convert it to a Dataset + with the :meth:`ray.data.from_pandas() ` method. + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_quickstart_load_start__ + :end-before: __hf_quickstart_load_end__ + + .. group-tab:: PyTorch + + Create a NumPy array with 100 + entries and convert it to a Dataset with the + :meth:`ray.data.from_numpy() ` method. + + .. literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_quickstart_load_start__ + :end-before: __pt_quickstart_load_end__ + + .. group-tab:: TensorFlow + + Create a NumPy array with 100 + entries and convert it to a Dataset with the + :meth:`ray.data.from_numpy() ` method. + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_quickstart_load_start__ + :end-before: __tf_quickstart_load_end__ + +2. Setting up your model +~~~~~~~~~~~~~~~~~~~~~~~~ + +Next, you want to set up your model for inference, by defining a predictor. +The core idea is to define a class that loads your model in its ``__init__`` method and +implements a ``__call__`` method that takes a batch of data and returns a batch of predictions. +The ``__call__`` method is essentially the same as the ``transform`` function from the previous section. + +Below you find examples for PyTorch, TensorFlow, and HuggingFace. + +.. tabs:: + + .. group-tab:: HuggingFace + + .. callout:: + + ..
literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_quickstart_model_start__ + :end-before: __hf_quickstart_model_end__ + + .. annotations:: + <1> Use the constructor (``__init__``) to initialize your model. + + <2> The ``__call__`` method runs inference on a batch of data. + + .. group-tab:: PyTorch + + .. callout:: + + .. literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_quickstart_model_start__ + :end-before: __pt_quickstart_model_end__ + + .. annotations:: + <1> Use the constructor (``__init__``) to initialize your model. + + <2> The ``__call__`` method runs inference on a batch of data. + + + .. group-tab:: TensorFlow + + .. callout:: + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_quickstart_model_start__ + :end-before: __tf_quickstart_model_end__ + + .. annotations:: + <1> Use the constructor (``__init__``) to initialize your model. + + <2> The ``__call__`` method runs inference on a batch of data. + + +3. Getting predictions with Ray Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once you have your Dataset ``ds`` and your predictor class, you can use +:meth:`ds.map_batches() ` to get predictions. +``map_batches`` takes your predictor class as an argument and allows you to specify +``compute`` resources by defining the :class:`ActorPoolStrategy `. +In the example below, we use two CPUs to run inference in parallel and then print the results. +We cover resource allocation in more detail in :ref:`the configuration section of this guide `. + +.. tabs:: + + .. group-tab:: HuggingFace + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_quickstart_prediction_start__ + :end-before: __hf_quickstart_prediction_end__ + + .. group-tab:: PyTorch + + .. 
literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_quickstart_prediction_start__ + :end-before: __pt_quickstart_prediction_end__ + + .. group-tab:: TensorFlow + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_quickstart_prediction_start__ + :end-before: __tf_quickstart_prediction_end__ + + +Note how defining your :meth:`ds.map_batches() ` function requires +you to write a Python method that takes a batch of data and returns a batch of predictions. +An easy way to do this and validate it is to use :meth:`ds.take_batch(N) ` to get a batch of data +first, and then locally test your predictor method on that batch, without using Ray. +Once you are happy with the results, you can use the same function in ``map_batches`` +on the full dataset. Below you see how to do that in our running examples. + +.. tabs:: + + .. group-tab:: HuggingFace + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_quickstart_prediction_test_start__ + :end-before: __hf_quickstart_prediction_test_end__ + + .. group-tab:: PyTorch + + .. literalinclude:: ./doc_code/pytorch_quick_start.py + :language: python + :start-after: __pt_quickstart_prediction_test_start__ + :end-before: __pt_quickstart_prediction_test_end__ + + .. group-tab:: TensorFlow + + .. literalinclude:: ./doc_code/tf_quick_start.py + :language: python + :start-after: __tf_quickstart_prediction_test_start__ + :end-before: __tf_quickstart_prediction_test_end__ + + +.. _batch_inference_advanced_pytorch_example: + +Advanced Guide to Batch Inference with PyTorch +---------------------------------------------- + +Let's use batch inference on a pre-trained PyTorch model for image classification +to illustrate advanced concepts of batch processing with Ray. + +.. important:: + + If you want to dive right into example use cases next, consider reading the following + tutorials next: + + .. 
grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/examples/ocr_example + + Batch OCR processing using Ray Data + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/torch_detection + + Fine-tuning an Object Detection Model and using it for Batch Inference + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/torch_image_example + + Training an Image Classifier and using it for Batch Inference + + +Loading data with Ray Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the quick start guide we glossed over the details of loading data with Ray Data. +Your data might be stored in a variety of formats, and you might want to load it from different sources. +Ray Data supports multiple formats and sources out of the box. +The :ref:`guide to loading data ` is the ultimate resource +to learn more about loading data with Ray Data, but we'll cover the basics here, too. + +.. hint:: + + With Ray Data, you can :ref:`create synthetic data in Python `, + :ref:`load data from various storage solutions ` such as S3, + HDFS, or GCS, using common formats such as CSV, JSON, Text, Images, Binary, + TFRecords, Parquet, and more. Ray Data also supports reading from common SQL and NoSQL + databases, and allows you to define your own, custom data sources. + + You can also read :ref:`common Python library formats ` + such as Pandas, NumPy, Arrow, or plain Python objects, as well as from + :ref:`distributed data processing frameworks ` + such as Spark, Dask, Modin, or Mars. + + Of course, Ray Data also supports :ref:`reading data from common ML frameworks ` + like PyTorch, TensorFlow or HuggingFace. + +.. callout:: + + .. 
literalinclude:: ./doc_code/torch_image_batch_trained.py + :language: python + :start-after: __pt_load_start__ + :end-before: __pt_load_end__ + + .. annotations:: + <1> We use one gigabyte of image data from the Imagenet dataset from S3. + + <2> We use :func:`read_images ` from Ray Data and limit the number of images to 1000. + +The process of loading data with Ray Data is as diverse as the data you have. +For instance, in the example above we didn't load the text labels for our images, +which would require a different data source and loading function. +For any advanced use cases, we recommend you read the +:ref:`guide to loading data `. + +Preprocessing with Ray Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After loading your data, it often needs to be preprocessed prior to inference. +This may include cropping or resizing images, or tokenizing raw text. + +To introduce common terminology, with :ref:`Ray Data ` you can define +user-defined functions that transform batches of your data. +As you've seen before, applying these functions via +:meth:`ds.map_batches() ` outputs a new, transformed dataset. + +.. note:: + + The way we do preprocessing here is conceptually close to how we do batch + inference, and we use the same :meth:`ds.map_batches() ` + call from Ray Data to run this task. + The main difference is that we don't use a machine learning model to transform our data, + which has some practical consequences. For instance, in the example below we simply + define a map function that we pass into ``map_batches``, and not a class. + +To transform our raw images loaded from S3 in the last step, we use functionality from +the ``torchvision`` package to define a function called ``preprocess_images``. + +.. callout:: + + .. literalinclude:: ./doc_code/torch_image_batch_trained.py + :language: python + :start-after: __pt_preprocess_start__ + :end-before: __pt_preprocess_end__ + + .. 
annotations:: + <1> We compose PyTorch tensor creation with image preprocessing, so that our processed images "fit" into a ``ResNet18`` PyTorch model. + + <2> We then define a simple function to transform batches of raw data accordingly. Note that these batches come as dictionaries of NumPy images stored in the ``"images"`` key. + + <3> Finally, we apply the function to our dataset using ``map_batches``. + +.. tip:: + + For the full suite of transformations available in Ray Data, read + :ref:`the data transformation guide `. + +Defining predictors as stateful classes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the key value adds of Ray over other distributed systems is the support for +distributed stateful operations. These stateful operations are especially useful +for inference since the model only needs to be initialized once, instead of per batch. + +.. margin:: + + In short, running model inference means applying + :meth:`ds.map_batches() ` + to a dataset with a trained model as a class. + +You've already seen how to do this in the quickstart section of this guide, but now +that you're equipped with more knowledge, let's have a look at how to define a +stateful class with Ray for our pretrained ResNet model: + +.. callout:: + + .. literalinclude:: ./doc_code/torch_image_batch_trained.py + :language: python + :start-after: __pt_model_start__ + :end-before: __pt_model_end__ + + .. annotations:: + <1> The ``__init__`` method is used to initialize the model once. Ray takes care of distributing and managing this state for our batch processing task. + + <2> The ``__call__`` method is used to apply the model to a batch of data. + + <3> We're free to use any custom code in a stateful class. + + <4> Finally, we return the ``"class"`` key of the model predictions as Numpy array. + +.. note:: + + Of course, you can also use GPUs for inference with Ray. + Jump ahead to the :ref:`GPU usage section ` to see how + to modify the current example to use GPUs. 
+ + +Scalable inference with Ray Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To get predictions, we call :meth:`ds.map_batches() `, +by making sure to specify a :class:`ActorPoolStrategy ` +which defines how many workers to use for inference. + +.. callout:: + + .. literalinclude:: ./doc_code/torch_image_batch_trained.py + :language: python + :start-after: __pt_prediction_start__ + :end-before: __pt_prediction_end__ + + .. annotations:: + <1> In this example we use a total of four Ray Actors to run inference on our dataset. + + <2> Each actor should use one GPU. + +To summarize, mapping a function over batches is the simplest transform for Datasets. +The function defines the logic for transforming individual batches of data of the dataset. +Performing operations over batches of data is more performant than single element +operations as it can leverage the underlying vectorization capabilities of Pandas or NumPy. + + +.. note:: + + You can use :meth:`ds.map_batches() ` on functions, too. + This is mostly useful for quick transformations of your data that don't require + an ML model or other stateful objects. + To handle state, using classes like we did above is the recommended way. + In the dropdown below you find an example of mapping data with a simple Python + function. + + .. dropdown:: Example using ``map_batches`` with functions + + This example transforms example data using a simple Python function. + The ``map_function`` uses the fact that our ``data`` batches in this particular + example are Pandas dataframes. + Note that by using a map function instead of a class, we don't have to define + :class:`ActorPoolStrategy ` to specify compute resources. + + .. literalinclude:: ./doc_code/batch_formats.py + :language: python + :start-after: __simple_map_function_start__ + :end-before: __simple_map_function_end__ + +..
_batch_inference_formats: + +Working with batch formats +-------------------------- + +Now that you've seen examples of batch inference with Ray, let's have a closer look +at how to deal with different data formats for batches. +First of all, you need to distinguish between two types of batch formats: + +- Input batch formats: This is the format of the input to your transformation function. You will often have to + refer to the right format name to run batch inference on your data. +- Output batch formats: This is the format your function returns. + +In many standard cases, the input batch format is the same as the output batch format, +but it's good to be aware of the differences. + +.. margin:: + + We refer to batch formats by name in Ray Data (using strings). + For instance, the batch format used to represent Pandas dataframes is called ``"pandas"``. + We often use batch format names and the libraries they represent interchangeably. + +Let's focus on the three available input batch formats first, +namely NumPy, Pandas and Arrow, and how they're used in Ray Data. +By default, the batch format will be ``"numpy"``, but you can specify other formats +as you see fit. + +.. tab-set:: + + .. tab-item:: NumPy (default) + + The ``"numpy"`` batch format presents batches as a dictionary of + `numpy.ndarray `__ (``Dict[str, np.ndarray]``), with each key-value pair representing one column. + + .. literalinclude:: ./doc_code/batch_formats.py + :language: python + :start-after: __simple_numpy_start__ + :end-before: __simple_numpy_end__ + + .. tab-item:: Pandas + + The ``"pandas"`` batch format presents batches in + `pandas.DataFrame `__ + format. + + .. literalinclude:: ./doc_code/batch_formats.py + :language: python + :start-after: __simple_pandas_start__ + :end-before: __simple_pandas_end__ + + .. tab-item:: Arrow + + The ``"pyarrow"`` batch format presents batches in ``pyarrow.Table`` format. + + ..
literalinclude:: ./doc_code/batch_formats.py + :language: python + :start-after: __simple_pyarrow_start__ + :end-before: __simple_pyarrow_end__ + +When defining the return value of your function, you can choose between +dictionaries of NumPy arrays (``Dict[str, np.ndarray]``), Pandas dataframes +(``pandas.DataFrame``), and Arrow tables (``pyarrow.Table``). + +You can learn more about output formats in :ref:`the transforming data guide `. + +.. important:: + + No matter which batch format you use, you will always have to be familiar with + the underlying APIs used to represent your data. For instance, if you use the + ``"pandas"`` batch format, you will need to know the basics of interacting with + dataframes to make your batch inference jobs work. + +.. seealso:: + + As we've discussed in this guide, using :meth:`ds.map_batches() ` + on a class defining your model + should be your default choice for running inference with Ray. + For instance, if you're already using the Ray AIR framework for running your ML workflows, + you may want to use the + :ref:`framework-specific batch predictor implementations`. + + To see an extension of the quick start example using an AIR + ``HuggingFacePredictor``, see the following example: + + .. dropdown:: Batch inference example with HuggingFace and Ray AIR + + .. literalinclude:: ./doc_code/hf_quick_start.py + :language: python + :start-after: __hf_quickstart_air_start__ + :end-before: __hf_quickstart_air_end__ + +.. _batch_inference_config: + +Configuration & Troubleshooting +------------------------------- + +Configuring Batch Size +~~~~~~~~~~~~~~~~~~~~~~ + +An important parameter to set for :meth:`ds.map_batches() ` +is ``batch_size``, which controls the size of the batches provided to the function. +Here's a simple example of loading the IRIS dataset (which has Pandas format by default) +and processing it with a batch size of `10`: + +.. 
literalinclude:: ./doc_code/batch_formats.py + :language: python + :start-after: __simple_map_function_start__ + :end-before: __simple_map_function_end__ + +Increasing ``batch_size`` can result in faster execution by better leveraging vectorized +operations and hardware, reducing batch slicing and concatenation overhead, and overall +saturation of CPUs or GPUs. +On the other hand, this will also result in higher memory utilization, which can +lead to out-of-memory (OOM) failures. +If encountering OOMs, decreasing your ``batch_size`` may help. + +.. caution:: + The default ``batch_size`` of ``4096`` may be too large for datasets with large rows + (e.g. tables with many columns or a collection of large images). + + +.. _batch_inference_gpu: + +Using GPUs in batch inference +----------------------------- + +To use GPUs for inference, first update the callable class implementation to +move the model and data to and from the CUDA device. +Here's a quick example for a PyTorch model: + +.. code-block:: diff + + from torchvision.models import resnet18 + + class TorchModel: + def __init__(self): + self.model = resnet18(pretrained=True) + + self.model = self.model.cuda() + self.model.eval() + + def __call__(self, batch: Dict[str, np.ndarray]): + torch_batch = torch.stack(batch["data"]) + + torch_batch = torch_batch.cuda() + with torch.inference_mode(): + prediction = self.model(torch_batch) + - return {"class": prediction.argmax(dim=1).detach().numpy()} + + return {"class": prediction.argmax(dim=1).detach().cpu().numpy()} + + +Next, specify ``num_gpus=N`` in :meth:`ds.map_batches() ` +to indicate that each inference worker should use ``N`` GPUs. + +.. code-block:: diff + + predictions = ds.map_batches( + TorchModel, + compute=ray.data.ActorPoolStrategy(size=2), + + num_gpus=1 + ) + +**How should I configure num_cpus and num_gpus for my model?** + +By default, Ray will assign 1 CPU per task or actor.
For example, on a machine +with 16 CPUs, this will result in 16 tasks or actors running concurrently for inference. +To change this, you can specify ``num_cpus=N``, which will tell Ray to reserve more CPUs +for the task or actor, or ``num_gpus=N``, which will tell Ray to reserve/assign GPUs +(GPUs will be assigned via `CUDA_VISIBLE_DEVICES` env var). + +.. code-block:: python + + # Use 16 actors, each of which is assigned 1 GPU (16 GPUs total). + ds = ds.map_batches( + MyFn, + compute=ActorPoolStrategy(size=16), + num_gpus=1 + ) + + # Use 16 actors, each of which is reserved 8 CPUs (128 CPUs total). + ds = ds.map_batches( + MyFn, + compute=ActorPoolStrategy(size=16), + num_cpus=8) + + +**How should I deal with OOM errors due to heavy model memory usage?** + +It's common for models to consume a large amount of heap memory. For example, if a model +uses 5GB of RAM when created / run, and a machine has 16GB of RAM total, then no more +than three of these models can be run at the same time. The default resource assignments +of one CPU per task/actor will likely lead to OutOfMemoryErrors from Ray in this situation. + +Let's suppose our machine has 16GiB of RAM and 8 GPUs. To tell Ray to construct at most +3 of these actors per node, we can override the CPU or memory: + +.. code-block:: python + + # Require 5 CPUs per actor (so at most 3 can fit per 16 CPU node). + ds = ds.map_batches( + MyFn, + compute=ActorPoolStrategy(size=16), + num_cpus=5) + +Learn more +---------- + + +Batch inference is just one small part of the Machine Learning workflow, and only +a fraction of what Ray can do. + +.. figure:: images/train_predict_pipeline.png + + How batch inference fits into the bigger picture of training and prediction AI models. + +To learn more about Ray and batch inference, check out the following +tutorials and examples: + +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. 
grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/examples/nyc_taxi_basic_processing + + Batch Inference on NYC taxi data using Ray Data + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/examples/ocr_example + + Batch OCR processing using Ray Data + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/torch_detection + + Fine-tuning an Object Detection Model and using it for Batch Inference + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/torch_image_example + + Training an Image Classifier and using it for Batch Inference + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/stablediffusion_batch_prediction + + Stable Diffusion Batch Prediction with Ray AIR diff --git a/doc/source/data/consuming-datasets.rst b/doc/source/data/consuming-data.rst similarity index 53% rename from doc/source/data/consuming-datasets.rst rename to doc/source/data/consuming-data.rst index bd67e078eb4b..83af18c68f2b 100644 --- a/doc/source/data/consuming-datasets.rst +++ b/doc/source/data/consuming-data.rst @@ -1,8 +1,8 @@ -.. _consuming_datasets: +.. _consuming_data: -================== -Consuming Datasets -================== +===================== +Consuming Data +===================== The data underlying a ``Dataset`` can be consumed in several ways: @@ -16,43 +16,38 @@ Retrieving a limited set of rows A limited set of rows can be retrieved from a ``Dataset`` via the :meth:`ds.take() ` or :meth:`ds.take_batch() ` APIs, and :meth:`ds.show() `, for printing a limited set of rows. 
These -methods are convenient for quickly inspecting a subset (prefix) of rows. They have the -benefit that, if used right after reading, they will only trigger more files to be -read if needed to retrieve rows from that file; if inspecting a small prefix of rows, -often only the first file will need to be read. +methods are convenient for quickly inspecting a subset (prefix) of rows. -.. literalinclude:: ./doc_code/consuming_datasets.py +.. literalinclude:: ./doc_code/consuming_data.py :language: python :start-after: __take_begin__ :end-before: __take_end__ Iterating over Datasets -======================= +========================== Datasets can be consumed a row at a time using the :meth:`ds.iter_rows() ` API -.. literalinclude:: ./doc_code/consuming_datasets.py +.. literalinclude:: ./doc_code/consuming_data.py :language: python :start-after: __iter_rows_begin__ :end-before: __iter_rows_end__ or a batch at a time using the :meth:`ds.iter_batches() ` API, where you can specify -batch size as well as the desired batch format. By default, the batch format is -``"default"``. For tabular data, the default format is a Pandas DataFrame; for Python -objects, it's a list. +batch size as well as the desired batch format. By default, the batches have type +``Dict[str, np.ndarray]`` (NumPy format). -.. literalinclude:: ./doc_code/consuming_datasets.py +.. literalinclude:: ./doc_code/consuming_data.py :language: python :start-after: __iter_batches_begin__ :end-before: __iter_batches_end__ - Datasets can be passed to Ray tasks or actors and accessed by these iteration methods. This does not incur a copy, since the blocks of the Dataset are passed by reference as Ray objects: -.. literalinclude:: ./doc_code/consuming_datasets.py +.. 
literalinclude:: ./doc_code/consuming_data.py :language: python :start-after: __remote_iterators_begin__ :end-before: __remote_iterators_end__ @@ -61,62 +56,62 @@ This does not incur a copy, since the blocks of the Dataset are passed by refere Splitting Into and Consuming Shards =================================== -Datasets can be split up into disjoint sub-datasets, or shards. -Locality-aware splitting is supported if you pass in a list of actor handles to the -:meth:`ds.split() ` function along with the number of desired splits. +Datasets can be split up into disjoint iterators, or shards. This is a common pattern useful for loading and sharding data between distributed training actors: .. note:: If using :ref:`Ray Train ` for distributed training, you do not need to split the dataset; Ray - Train will automatically do locality-aware splitting into per-trainer shards for you! + Train will automatically do locality-aware splitting into per-trainer shards for you. -.. literalinclude:: ./doc_code/consuming_datasets.py +.. literalinclude:: ./doc_code/consuming_data.py :language: python :start-after: __split_begin__ :end-before: __split_end__ -.. _saving_datasets: +.. _saving_data: -Saving Datasets -=============== +Saving Data +================== Datasets can be written to local or remote storage in the desired data format. The supported formats include Parquet, CSV, JSON, NumPy. To control the number of output files, you may use :meth:`ds.repartition() ` to repartition the Dataset before writing out. -.. tabbed:: Parquet +.. tab-set:: + + .. tab-item:: Parquet - .. literalinclude:: ./doc_code/saving_datasets.py - :language: python - :start-after: __write_parquet_begin__ - :end-before: __write_parquet_end__ + .. literalinclude:: ./doc_code/saving_data.py + :language: python + :start-after: __write_parquet_begin__ + :end-before: __write_parquet_end__ -.. tabbed:: CSV + .. tab-item:: CSV - .. 
literalinclude:: ./doc_code/saving_datasets.py - :language: python - :start-after: __write_csv_begin__ - :end-before: __write_csv_end__ + .. literalinclude:: ./doc_code/saving_data.py + :language: python + :start-after: __write_csv_begin__ + :end-before: __write_csv_end__ -.. tabbed:: JSON + .. tab-item:: JSON - .. literalinclude:: ./doc_code/saving_datasets.py - :language: python - :start-after: __write_json_begin__ - :end-before: __write_json_end__ + .. literalinclude:: ./doc_code/saving_data.py + :language: python + :start-after: __write_json_begin__ + :end-before: __write_json_end__ -.. tabbed:: NumPy + .. tab-item:: NumPy - .. literalinclude:: ./doc_code/saving_datasets.py - :language: python - :start-after: __write_numpy_begin__ - :end-before: __write_numpy_end__ + .. literalinclude:: ./doc_code/saving_data.py + :language: python + :start-after: __write_numpy_begin__ + :end-before: __write_numpy_end__ -.. tabbed:: TFRecords + .. tab-item:: TFRecords - .. literalinclude:: ./doc_code/saving_datasets.py - :language: python - :start-after: __write_tfrecords_begin__ - :end-before: __write_tfrecords_end__ + .. literalinclude:: ./doc_code/saving_data.py + :language: python + :start-after: __write_tfrecords_begin__ + :end-before: __write_tfrecords_end__ diff --git a/doc/source/data/creating-datasets.rst b/doc/source/data/creating-datasets.rst deleted file mode 100644 index ff17277fd400..000000000000 --- a/doc/source/data/creating-datasets.rst +++ /dev/null @@ -1,887 +0,0 @@ -.. _creating_datasets: - -================= -Creating Datasets -================= - -Ray :class:`Datasets ` can be created from: - -* generated synthetic data, -* local and distributed in-memory data, and -* local and external storage systems (local disk, cloud storage, HDFS, etc.). - -This guide surveys the many ways to create a ``Dataset``. 
If none of these meet your -needs, please reach out to us on `Discourse `__ or open a feature -request on the `Ray GitHub repo `__, and check out -our :ref:`guide for implementing a custom Datasets datasource ` -if you're interested in rolling your own integration! - -.. _dataset_generate_data: - -------------------------- -Generating Synthetic Data -------------------------- - -.. tabbed:: Int Range - - Create a ``Dataset`` from a range of integers. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __gen_synth_int_range_begin__ - :end-before: __gen_synth_int_range_end__ - -.. tabbed:: Tabular Range - - Create an Arrow (tabular) ``Dataset`` from a range of integers, - with a single column containing this integer range. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __gen_synth_tabular_range_begin__ - :end-before: __gen_synth_tabular_range_end__ - -.. tabbed:: Tensor Range - - Create a tensor dataset from a range of integers, packing this integer range into - tensors of the provided shape. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __gen_synth_tensor_range_begin__ - :end-before: __gen_synth_tensor_range_end__ - -.. _dataset_reading_from_storage: - --------------------------- -Reading Files From Storage --------------------------- - -Using the ``ray.data.read_*()`` APIs, Datasets can be created from files on local disk -or remote storage system such as S3, GCS, Azure Blob Storage, or HDFS. Any filesystem -`supported by pyarrow `__ -can be used to specify file locations, and many common file formats are supported: -Parquet, CSV, JSON, NPY, text, binary. - -Each of these APIs take a path or list of paths to files or directories. Any directories -provided will be walked in order to obtain concrete file paths, at which point all files -will be read in parallel. - -.. 
_dataset_supported_file_formats: - -Supported File Formats -====================== - -.. tabbed:: Parquet - - Read Parquet files into a tabular ``Dataset``. The Parquet data will be read into - `Arrow Table `__ - blocks. Although this simple example demonstrates reading a single file, note that - Datasets can also read directories of Parquet files. We also support reading partitioned - Parquet datasets with partition column values pulled from the file paths. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_parquet_begin__ - :end-before: __read_parquet_end__ - - Datasets' Parquet reader also supports projection and filter pushdown, allowing column - selection and row filtering to be pushed down to the file scan. For column selection, - unselected columns will never be read from the file. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_parquet_pushdown_begin__ - :end-before: __read_parquet_pushdown_end__ - - See the API docs for :func:`read_parquet() `. - -.. tabbed:: CSV - - Read CSV files into a tabular ``Dataset``. The CSV data will be read into - `Arrow Table `__ - blocks. Although this simple example demonstrates reading a single file, note that - Datasets can also read directories of CSV files, with one tabular block created - per file. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_csv_begin__ - :end-before: __read_csv_end__ - - See the API docs for :func:`read_csv() `. - -.. tabbed:: JSON - - Read JSON files into a tabular ``Dataset``. The JSON data will be read into - `Arrow Table `__ - blocks. Although this simple example demonstrates reading a single file, note that - Datasets can also read directories of JSON files, with one tabular block created - per file. - - Currently, only newline-delimited JSON (NDJSON) is supported. - - .. 
literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_json_begin__ - :end-before: __read_json_end__ - - See the API docs for :func:`read_json() `. - -.. tabbed:: NumPy - - Read NumPy files into a tensor ``Dataset``. The NumPy ndarray data will be read into - single-column - `Arrow Table `__ - blocks using our - :class:`tensor extension type `, - treating the outermost ndarray dimension as the row dimension. See our - :ref:`tensor data guide ` for more information on working - with tensors in Datasets. Although this simple example demonstrates reading a single - file, note that Datasets can also read directories of NumPy files, with one tensor - block created per file. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_numpy_begin__ - :end-before: __read_numpy_end__ - - See the API docs for :func:`read_numpy() `. - -.. tabbed:: Text - - Read text files into a ``Dataset``. Each line in each text file will be treated as a - row in the dataset, resulting in a list-of-strings block being created for each text - file. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_text_begin__ - :end-before: __read_text_end__ - - See the API docs for :func:`read_text() `. - -.. tabbed:: Images - - Call :func:`~ray.data.read_images` to read images into a :class:`~ray.data.Dataset`. - - This function stores image data in single-column - `Arrow Table `__ - blocks using the - :class:`tensor extension type `. - For more information on working with tensors in Datasets, read the - :ref:`tensor data guide `. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_images_begin__ - :end-before: __read_images_end__ - -.. tabbed:: Binary - - Read binary files into a ``Dataset``. Each binary file will be treated as a single row - of opaque bytes. 
These bytes can be decoded into tensor, tabular, text, or any other - kind of data using :meth:`~ray.data.Dataset.map_batches` to apply a per-row decoding - :ref:`user-defined function `. - - Although this simple example demonstrates reading a single file, note that Datasets - can also read directories of binary files, with one bytes block created per file. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_binary_begin__ - :end-before: __read_binary_end__ - - See the API docs for :func:`read_binary_files() `. - -.. tabbed:: TFRecords - - Call :func:`~ray.data.read_tfrecords` to read TFRecord files into a tabular - :class:`~ray.data.Dataset`. - - .. warning:: - Only `tf.train.Example `_ - records are supported. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_tfrecords_begin__ - :end-before: __read_tfrecords_end__ - -.. _dataset_reading_remote_storage: - - -Reading from Remote Storage -=========================== - -All of the file formats mentioned above can be read from remote storage, such as S3, -GCS, Azure Blob Storage, and HDFS. These storage systems are supported via Arrow's -filesystem APIs natively for S3 and HDFS, and as a wrapper around fsspec for GCS and -HDFS. All ``ray.data.read_*()`` APIs expose a ``filesystem`` argument that accepts both -`Arrow FileSystem `__ instances -and `fsspec FileSystem `__ instances, -allowing you to configure this connection to the remote storage system, such as -authn/authz and buffer/block size. - -For S3 and HDFS, the underlying `FileSystem -`__ -implementation will be inferred from the URL scheme (``"s3://"`` and ``"hdfs://"``); if -the default connection configuration suffices for your workload, you won't need to -specify a ``filesystem`` argument. - -We use Parquet files for the below examples, but all of the aforementioned file formats -are supported for each of these storage systems. - -.. 
tabbed:: S3 - - The AWS S3 storage system is inferred from the URI scheme (``s3://``), with required connection - configuration such as S3 credentials being pulled from the machine's environment - (e.g. the ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` environment variables). - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_parquet_s3_begin__ - :end-before: __read_parquet_s3_end__ - - If needing to customize this S3 storage system connection (credentials, region, - endpoint override, etc.), you can pass in an - `S3FileSystem `__ instance - to :func:`read_parquet() `. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __read_parquet_s3_with_fs_begin__ - :end-before: __read_parquet_s3_with_fs_end__ - -.. tabbed:: HDFS - - The HDFS storage system is inferred from the URI scheme (``hdfs://``), with required connection - configuration such as the host and the port being derived from the URI. - - .. note:: - - This example is not runnable as-is; you'll need to point it at your HDFS - cluster/data. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __read_parquet_hdfs_begin__ - :end-before: __read_parquet_hdfs_end__ - - If needing to customize this HDFS storage system connection (host, port, user, kerb - ticket, etc.), you can pass in an `HDFSFileSystem - `__ - instance to :func:`read_parquet() `. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __read_parquet_hdfs_with_fs_begin__ - :end-before: __read_parquet_hdfs_with_fs_end__ - -.. tabbed:: GCS - - Data can be read from Google Cloud Storage by providing a configured - `gcsfs GCSFileSystem `__, where the - appropriate Google Cloud project and credentials can be specified. - - .. note:: - This example is not runnable as-is; you'll need to point it at your GCS bucket and - configure your GCP project and credentials. 
- - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __read_parquet_gcs_begin__ - :end-before: __read_parquet_gcs_end__ - - .. tip:: - To verify that your GCP project and credentials are set up, validate - that the GCS `filesystem` has permissions to read the input `path`. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __validate_parquet_gcs_begin__ - :end-before: __validate_parquet_gcs_end__ - - For more examples, see the `GCSFS Documentation `__. - -.. tabbed:: ADL/ABS (Azure) - - Data can be read from Azure Blob Storage by providing a configured - `adlfs AzureBlobFileSystem `__, where the appropriate - account name and account key can be specified. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __read_parquet_az_begin__ - :end-before: __read_parquet_az_end__ - -Reading from Local Storage -========================== - -In Ray Datasets, users often read from remote storage systems as described above. In -some use cases, users may want to read from local storage. There are three ways to read -from a local filesystem: - -* **Providing a local filesystem path**: For example, in ``ray.data.read_csv("my_file.csv")``, - the given path will be resolved as a local filesystem path. - -.. note:: - - If the file exists only on the local node and you run this read operation in - distributed cluster, this will fail as it cannot access the file from remote node. - -* **Using ``local://`` custom URI scheme**: Similarly, this will be resolved to local - filesystem, e.g. ``ray.data.read_csv("local://my_file.csv")`` will read the - same file as the approach above. The difference is that this scheme will ensure - all read tasks happen on the local node, so it's safe to run in a distributed - cluster. 
-* **Using ``example://`` custom URI scheme**: The paths with this scheme will be resolved - to ``ray/data/examples/data`` directory in the Ray package. This scheme is used - only for testing or demoing examples. - -Reading Compressed Files -======================== - -Ray Datasets supports reading compressed files using the ``arrow_open_stream_args`` arg. -`Codecs supported by Arrow `__ -(bz2, brotli, gzip, lz4 or zstd) are compatible with Ray Datasets. -For example: - -.. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __read_compressed_begin__ - :end-before: __read_compressed_end__ - -.. _dataset_from_in_memory_data: - -------------------- -From In-Memory Data -------------------- - -Datasets can be constructed from existing in-memory data. In addition to being able to -construct a ``Dataset`` from plain Python objects, Datasets also interoperates with popular -single-node libraries (`Pandas `__, -`NumPy `__, `Arrow `__) as well as -distributed frameworks (:ref:`Dask `, :ref:`Spark `, -:ref:`Modin `, :ref:`Mars `). - -.. _dataset_from_in_memory_data_single_node: - -From Single-Node Data Libraries -=============================== - -In this section, we demonstrate creating a ``Dataset`` from single-node in-memory data. - -.. tabbed:: Pandas - - Create a ``Dataset`` from a Pandas DataFrame. This constructs a ``Dataset`` - backed by a single Pandas DataFrame block. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_pandas_begin__ - :end-before: __from_pandas_end__ - - We can also build a ``Dataset`` from more than one Pandas DataFrame, where each said - DataFrame will become a block in the ``Dataset``. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_pandas_mult_begin__ - :end-before: __from_pandas_mult_end__ - -.. tabbed:: NumPy - - Create a ``Dataset`` from a NumPy ndarray. 
This constructs a ``Dataset`` - backed by a single-column Arrow table block; the outer dimension of the ndarray - will be treated as the row dimension, and the column will have name ``"__value__"``. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_numpy_begin__ - :end-before: __from_numpy_end__ - - We can also build a ``Dataset`` from more than one NumPy ndarray, where each said - ndarray will become a single-column Arrow table block in the ``Dataset``. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_numpy_mult_begin__ - :end-before: __from_numpy_mult_end__ - -.. tabbed:: Arrow - - Create a ``Dataset`` from an - `Arrow Table `__. - This constructs a ``Dataset`` backed by a single Arrow ``Table`` block. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_arrow_begin__ - :end-before: __from_arrow_end__ - - We can also build a ``Dataset`` from more than one Arrow Table, where each said - ``Table`` will become a block in the ``Dataset``. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_arrow_mult_begin__ - :end-before: __from_arrow_mult_end__ - -.. tabbed:: Python Objects - - Create a ``Dataset`` from a list of Python objects; since each object in this - particular list is a dictionary, Datasets will treat this list as a list of tabular - records, and will construct an Arrow ``Dataset``. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_items_begin__ - :end-before: __from_items_end__ - -.. 
_dataset_from_in_memory_data_distributed: - -From Distributed Data Processing Frameworks -=========================================== - -In addition to working with single-node in-memory data, Datasets can be constructed from -distributed (multi-node) in-memory data, interoperating with popular distributed -data processing frameworks such as :ref:`Dask `, :ref:`Spark `, -:ref:`Modin `, and :ref:`Mars `. - -These conversions work by running Ray tasks converting each Dask/Spark/Modin/Mars -data partition to a block format supported by Datasets (copying data if needed), and using the -futures representing the return value of those conversion tasks as the ``Dataset`` block -futures. - -.. note:: - - These data processing frameworks must be running on Ray in order for these Datasets - integrations to work. See how these frameworks can be run on Ray in our - :ref:`data processing integrations docs `. - -.. tabbed:: Dask - - Create a ``Dataset`` from a - `Dask DataFrame `__. This constructs a - ``Dataset`` backed by the distributed Pandas DataFrame partitions that underly the - Dask DataFrame. - - This conversion has near-zero overhead, since Datasets simply reinterprets existing - Dask-in-Ray partition objects as Dataset blocks. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_dask_begin__ - :end-before: __from_dask_end__ - -.. tabbed:: Spark - - Create a ``Dataset`` from a `Spark DataFrame - `__. - This constructs a ``Dataset`` backed by the distributed Spark DataFrame partitions - that underly the Spark DataFrame. When this conversion happens, Spark-on-Ray (RayDP) - will save the Spark DataFrame partitions to Ray's object store in the Arrow format, - which Datasets will then interpret as its blocks. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __from_spark_begin__ - :end-before: __from_spark_end__ - -.. 
tabbed:: Modin - - Create a ``Dataset`` from a Modin DataFrame. This constructs a ``Dataset`` - backed by the distributed Pandas DataFrame partitions that underly the Modin DataFrame. - - This conversion has near-zero overhead, since Datasets simply reinterprets existing - Modin partition objects as Dataset blocks. - - .. literalinclude:: ./doc_code/creating_datasets.py - :language: python - :start-after: __from_modin_begin__ - :end-before: __from_modin_end__ - -.. tabbed:: Mars - - Create a ``Dataset`` from a Mars DataFrame. This constructs a ``Dataset`` - backed by the distributed Pandas DataFrame partitions that underly the Mars DataFrame. - - This conversion has near-zero overhead, since Datasets simply reinterprets existing - Mars partition objects as Dataset blocks. - - .. literalinclude:: ./doc_code/creating_datasets_untested.py - :language: python - :start-after: __from_mars_begin__ - :end-before: __from_mars_end__ - -.. _dataset_from_torch_tf: - -------------------------- -From Torch and TensorFlow -------------------------- - -.. tabbed:: PyTorch - - If you already have a Torch dataset available, you can create a Ray Dataset using - :class:`~ray.data.from_torch`. - - .. warning:: - :class:`~ray.data.from_torch` doesn't support parallel - reads. You should only use this datasource for small datasets like MNIST or - CIFAR. - - .. code-block:: python - - import ray - import torchvision - - dataset = torchvision.datasets.MNIST("data", download=True) - dataset = ray.data.from_torch(dataset) - dataset.take(1) - # (, 5) - -.. tabbed:: TensorFlow - - If you already have a TensorFlow dataset available, you can create a Ray Dataset - using :class:`~ray.data.from_tf`. - - .. warning:: - :class:`~ray.data.from_tf` doesn't support parallel reads. You - should only use this function with small datasets like MNIST or CIFAR. - - .. 
code-block:: python - - import ray - import tensorflow_datasets as tfds - - dataset, _ = tfds.load("cifar10", split=["train", "test"]) - dataset = ray.data.from_tf(dataset) - - dataset - # -> Dataset(num_blocks=200, num_rows=50000, schema={id: binary, image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8), label: int64}) - -.. _dataset_from_huggingface: - -------------------------------- -From 🤗 (Hugging Face) Datasets -------------------------------- - -You can convert 🤗 Datasets into Ray Datasets by using -:py:class:`~ray.data.from_huggingface`. This function accesses the underlying Arrow table and -converts it into a Ray Dataset directly. - -.. warning:: - :py:class:`~ray.data.from_huggingface` doesn't support parallel - reads. This will not usually be an issue with in-memory 🤗 Datasets, - but may fail with large memory-mapped 🤗 Datasets. 🤗 ``IterableDataset`` - objects are not supported. - -.. code-block:: python - - import ray.data - from datasets import load_dataset - - hf_datasets = load_dataset("wikitext", "wikitext-2-raw-v1") - ray_datasets = ray.data.from_huggingface(hf_datasets) - ray_datasets["train"].take(2) - # [{'text': ''}, {'text': ' = Valkyria Chronicles III = \n'}] - -.. _dataset_mongo_db: - ------------- -From MongoDB ------------- - -A Dataset can also be created from `MongoDB `__ with -:py:class:`~ray.data.read_mongo`. -This interacts with MongoDB similar to external filesystems, except here you will -need to specify the MongoDB source by its `uri `__, -`database and collection `__, -and specify a `pipeline `__ to run against -the collection. The execution results are then used to create a Dataset. - -.. note:: - - This example is not runnable as-is; you'll need to point it at your MongoDB - instance. - -.. code-block:: python - - import ray - - # Read a local MongoDB. 
- ds = ray.data.read_mongo( - uri="mongodb://localhost:27017", - database="my_db", - collection="my_collection", - pipeline=[{"$match": {"col": {"$gte": 0, "$lt": 10}}}, {"$sort": "sort_col"}], - ) - - # Reading a remote MongoDB is the same. - ds = ray.data.read_mongo( - uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", - database="my_db", - collection="my_collection", - pipeline=[{"$match": {"col": {"$gte": 0, "$lt": 10}}}, {"$sort": "sort_col"}], - ) - - # Write back to MongoDB. - ds.write_mongo( - MongoDatasource(), - uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", - database="my_db", - collection="my_collection", - ) - -.. _datasets_sql_databases: - --------------------------- -Reading From SQL Databases --------------------------- - -Call :func:`~ray.data.read_sql` to read data from a database that provides a -`Python DB API2-compliant `_ connector. - -.. tabbed:: MySQL - - To read from MySQL, install - `MySQL Connector/Python `_. It's the - first-party MySQL database connector. - - .. code-block:: console - - pip install mysql-connector-python - - Then, define your connection login and query the database. - - .. code-block:: python - - import mysql.connector - - import ray - - def create_connection(): - return mysql.connector.connect( - user="admin", - password=..., - host="example-mysql-database.c2c2k1yfll7o.us-west-2.rds.amazonaws.com", - connection_timeout=30, - database="example", - ) - - # Get all movies - dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) - # Get movies after the year 1980 - dataset = ray.data.read_sql( - "SELECT title, score FROM movie WHERE year >= 1980", create_connection - ) - # Get the number of movies per year - dataset = ray.data.read_sql( - "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection - ) - - -.. tabbed:: PostgreSQL - - To read from PostgreSQL, install `Psycopg 2 `_. It's - the most popular PostgreSQL database connector. 
- - .. code-block:: console - - pip install psycopg2-binary - - Then, define your connection login and query the database. - - .. code-block:: python - - import psycopg2 - - import ray - - def create_connection(): - return psycopg2.connect( - user="postgres", - password=..., - host="example-postgres-database.c2c2k1yfll7o.us-west-2.rds.amazonaws.com", - dbname="example", - ) - - # Get all movies - dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) - # Get movies after the year 1980 - dataset = ray.data.read_sql( - "SELECT title, score FROM movie WHERE year >= 1980", create_connection - ) - # Get the number of movies per year - dataset = ray.data.read_sql( - "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection - ) - -.. tabbed:: Snowflake - - To read from Snowflake, install the - `Snowflake Connector for Python `_. - - .. code-block:: console - - pip install snowflake-connector-python - - Then, define your connection login and query the database. - - .. code-block:: python - - import snowflake.connector - - import ray - - def create_connection(): - return snowflake.connector.connect( - user=..., - password=... - account="ZZKXUVH-IPB52023", - database="example", - ) - - # Get all movies - dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) - # Get movies after the year 1980 - dataset = ray.data.read_sql( - "SELECT title, score FROM movie WHERE year >= 1980", create_connection - ) - # Get the number of movies per year - dataset = ray.data.read_sql( - "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection - ) - - -.. tabbed:: Databricks - - To read from Databricks, install the - `Databricks SQL Connector for Python `_. - - .. code-block:: console - - pip install databricks-sql-connector - - - Then, define your connection logic and read from the Databricks SQL warehouse. - - .. 
code-block:: python - - from databricks import sql - - import ray - - def create_connection(): - return sql.connect( - server_hostname="dbc-1016e3a4-d292.cloud.databricks.com", - http_path="/sql/1.0/warehouses/a918da1fc0b7fed0", - access_token=..., - - - # Get all movies - dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) - # Get movies after the year 1980 - dataset = ray.data.read_sql( - "SELECT title, score FROM movie WHERE year >= 1980", create_connection - ) - # Get the number of movies per year - dataset = ray.data.read_sql( - "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection - ) - -.. tabbed:: BigQuery - - To read from BigQuery, install the - `Python Client for Google BigQuery `_. - This package includes a DB API2-compliant database connector. - - .. code-block:: console - - pip install google-cloud-bigquery - - Then, define your connection login and query the dataset. - - .. code-block:: python - - from google.cloud import bigquery - from google.cloud.bigquery import dbapi - - import ray - - def create_connection(): - client = bigquery.Client(...) - return dbapi.Connection(client) - - # Get all movies - dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) - # Get movies after the year 1980 - dataset = ray.data.read_sql( - "SELECT title, score FROM movie WHERE year >= 1980", create_connection - ) - # Get the number of movies per year - dataset = ray.data.read_sql( - "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection - ) - - -.. _datasets_custom_datasource: - ------------------- -Custom Datasources ------------------- - -Datasets can read and write in parallel to :ref:`custom datasources ` defined in Python. -Once you have implemented `YourCustomDataSource`, you can use it like any other source in Ray Data: - -.. code-block:: python - - # Read from a custom datasource. - ds = ray.data.read_datasource(YourCustomDatasource(), **read_args) - - # Write to a custom datasource. 
- ds.write_datasource(YourCustomDatasource(), **write_args) - -For more details, check out :ref:`guide for implementing a custom Datasets datasource `. - --------------------------- -Performance Considerations --------------------------- - -Read Parallelism -================ - -Datasets automatically selects the read ``parallelism`` according to the following procedure: - -1. The number of available CPUs is estimated. If in a placement group, the number of CPUs in the cluster is scaled by the size of the placement group compared to the cluster size. If not in a placement group, this is the number of CPUs in the cluster. -2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it is set to 8. -3. The in-memory data size is estimated. If the parallelism would create in-memory blocks that are larger on average than the target block size (512MiB), the parallelism is increased until the blocks are < 512MiB in size. -4. The parallelism is truncated to ``min(num_files, parallelism)``. - -To perform the read, ``parallelism`` parallel read tasks will be -launched, each reading one or more files and each creating a single block of data. -When reading from remote datasources, these parallel read tasks will be spread across -the nodes in your Ray cluster, creating the distributed collection of blocks that makes -up a distributed Ray Dataset. - -.. image:: images/dataset-read.svg - :width: 650px - :align: center - -This default parallelism can be overridden via the ``parallelism`` argument; see the -:ref:`performance guide ` for tips on how to tune this read -parallelism. - -.. _dataset_deferred_reading: - -Deferred Read Task Execution -============================ - -Datasets created via the ``ray.data.read_*()`` APIs are lazy: no read tasks are -executed until a downstream consumption operation triggers execution. 
Metadata -inspection functions like :meth:`ds.schema() ` and -:meth:`ds.show() ` will trigger execution of only one or some -tasks, instead of all tasks. This allows metadata to be inspected right away. Execution -of all read tasks can be triggered manually using the -:meth:`ds.materialize() ` API. diff --git a/doc/source/data/custom-datasource.rst b/doc/source/data/custom-datasource.rst index d08d1e105bf1..584b927f39ff 100644 --- a/doc/source/data/custom-datasource.rst +++ b/doc/source/data/custom-datasource.rst @@ -7,9 +7,9 @@ Custom Datasources .. note:: This MongoDatasource guide below is for education only. For production use of MongoDB - in Ray Datasets, see :ref:`Creating Dataset from MongoDB `. + in Ray Data, see :ref:`Creating Dataset from MongoDB `. -Ray Datasets supports multiple ways to :ref:`create a dataset `, +Ray Data supports multiple ways to :ref:`create a dataset `, allowing you to easily ingest data of common formats from popular sources. However, if the datasource you want to read from is not in the built-in list, don't worry, you can implement a custom one for your use case. In this guide, we will walk you through how to build @@ -94,7 +94,7 @@ MongoDB. This ``Reader`` creates a list of :class:`~ray.data.ReadTask` for the g list of MongoDB pipelines. Each :class:`~ray.data.ReadTask` returns a list of blocks when called, and each :class:`~ray.data.ReadTask` is executed in remote workers to parallelize the execution. -You can find documentation about Ray Datasets :ref:`block concept here ` and :ref:`block APIs here `. +You can find documentation about Ray Data :ref:`block concept here ` and :ref:`block APIs here `. First, let's handle a single MongoDB pipeline, which is the unit of execution in :class:`~ray.data.ReadTask`. We need to connect to MongoDB, execute the pipeline against it, @@ -169,7 +169,7 @@ a ``MongoDatasource``. 
:start-after: __mongo_datasource_start__ :end-before: __mongo_datasource_end__ -Now you can create a Ray Dataset from and write back to MongoDB, just like +Now you can create a Dataset from and write back to MongoDB, just like any other datasource! .. code-block:: python diff --git a/doc/source/data/dataset-internals.rst b/doc/source/data/data-internals.rst similarity index 80% rename from doc/source/data/dataset-internals.rst rename to doc/source/data/data-internals.rst index e3b196f99dec..e040a7fa08a1 100644 --- a/doc/source/data/dataset-internals.rst +++ b/doc/source/data/data-internals.rst @@ -7,70 +7,49 @@ Scheduling, Execution, and Memory Management Scheduling ========== -Datasets uses Ray core for execution, and hence is subject to the same scheduling considerations as normal Ray tasks and actors. Datasets uses the following custom scheduling settings by default for improved performance: +Ray Data uses Ray core for execution, and hence is subject to the same scheduling considerations as normal Ray tasks and actors. Ray Data uses the following custom scheduling settings by default for improved performance: * The ``SPREAD`` scheduling strategy is used to ensure data blocks are evenly balanced across the cluster. * Retries of application-level exceptions are enabled to handle transient errors from remote datasources. -* Dataset tasks ignore placement groups by default, see :ref:`Datasets and Placement Groups `. +* Dataset tasks ignore placement groups by default, see :ref:`Ray Data and Placement Groups `. .. _datasets_tune: -Datasets and Tune +Ray Data and Tune ~~~~~~~~~~~~~~~~~ -When using Datasets in conjunction with :ref:`Ray Tune `, it is important to ensure there are enough free CPUs for Datasets to run on. By default, Tune will try to fully utilize cluster CPUs. This can prevent Datasets from scheduling tasks, reducing performance or causing workloads to hang. 
+When using Ray Data in conjunction with :ref:`Ray Tune `, it is important to ensure there are enough free CPUs for Ray Data to run on. By default, Tune will try to fully utilize cluster CPUs. This can prevent Ray Data from scheduling tasks, reducing performance or causing workloads to hang. -As an example, the following shows two ways to use Datasets together with Tune: +To ensure CPU resources are always available for Ray Data execution, limit the number of concurrent Tune trials. This can be done using the ``max_concurrent_trials`` Tune option. -.. tabbed:: Limiting Tune Concurrency - - By limiting the number of concurrent Tune trials, we ensure CPU resources are always available for Datasets execution. - This can be done using the ``max_concurrent_trials`` Tune option. - - .. literalinclude:: ./doc_code/key_concepts.py - :language: python - :start-after: __resource_allocation_1_begin__ - :end-before: __resource_allocation_1_end__ - -.. tabbed:: Reserving CPUs (Experimental) - - Alternatively, we can tell Tune to set aside CPU resources for other libraries. - This can be done by setting ``_max_cpu_fraction_per_node=0.8``, which reserves - 20% of node CPUs for Dataset execution. - - .. literalinclude:: ./doc_code/key_concepts.py - :language: python - :start-after: __resource_allocation_2_begin__ - :end-before: __resource_allocation_2_end__ - - .. warning:: - - This option is experimental and not currently recommended for use with - autoscaling clusters (scale-up will not trigger properly). +.. literalinclude:: ./doc_code/key_concepts.py + :language: python + :start-after: __resource_allocation_1_begin__ + :end-before: __resource_allocation_1_end__ .. _datasets_pg: -Datasets and Placement Groups +Ray Data and Placement Groups ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -By default, Datasets configures its tasks and actors to use the cluster-default scheduling strategy ("DEFAULT"). 
You can inspect this configuration variable here: +By default, Ray Data configures its tasks and actors to use the cluster-default scheduling strategy ("DEFAULT"). You can inspect this configuration variable here: :class:`ray.data.DataContext.get_current().scheduling_strategy `. This scheduling strategy will schedule these tasks and actors outside any present -placement group. If you want to force Datasets to schedule tasks within the current placement group (i.e., to use current placement group resources specifically for Datasets), you can set ``ray.data.DataContext.get_current().scheduling_strategy = None``. +placement group. If you want to force Ray Data to schedule tasks within the current placement group (i.e., to use current placement group resources specifically for Ray Data), you can set ``ray.data.DataContext.get_current().scheduling_strategy = None``. -This should be considered for advanced use cases to improve performance predictability only. We generally recommend letting Datasets run outside placement groups as documented in the :ref:`Datasets and Other Libraries ` section. +This should be considered for advanced use cases to improve performance predictability only. We generally recommend letting Ray Data run outside placement groups as documented in the :ref:`Ray Data and Other Libraries ` section. -.. _datasets_execution: +.. _dataset_execution: Execution ========= -The Datasets execution by default is: +By default, Ray Data execution is: - **Lazy**: This means that transformations on Dataset are not executed until a consumption operation (e.g. :meth:`ds.iter_batches() `) or :meth:`Dataset.materialize() ` is called. This creates opportunities for optimizing the execution plan (e.g. :ref:`stage fusion `). 
-- **Pipelined**: This means that Dataset transformations will be executed in a +- **Streaming**: This means that Dataset transformations will be executed in a streaming way, incrementally on the base data, instead of on all of the data at once, and overlapping the execution of operations. This can be used for streaming data loading into ML training to overlap the data preprocessing and model training, @@ -94,11 +73,11 @@ exceptions to this rule, where transformations such as :meth:`ds.union() :meth:`ds.limit() ` trigger execution; we plan to make these operations lazy in the future. -Check the API docs for Datasets methods to see if they +Check the API docs for Ray Data methods to see if they trigger execution. Those that do trigger execution will have a ``Note`` indicating as much. -.. _datasets_streaming_execution: +.. _streaming_execution: Streaming Execution ~~~~~~~~~~~~~~~~~~~ @@ -206,7 +185,7 @@ Locality with Output (ML ingest use case) ctx.execution_options.locality_with_output = True -Setting this to True tells Datasets to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This can be useful if you know you'll be consuming the output data directly on the consumer node (i.e., for ML training ingest). However, this may incur a performance penalty for other use cases. +Setting this to True tells Ray Data to prefer placing operator tasks onto the consumer node in the cluster, rather than spreading them evenly across the cluster. This can be useful if you know you'll be consuming the output data directly on the consumer node (i.e., for ML training ingest). However, this may incur a performance penalty for other use cases. 
Scalability ----------- @@ -217,7 +196,7 @@ We expect the data streaming backend to scale to tens of thousands of files / bl Stage Fusion Optimization ~~~~~~~~~~~~~~~~~~~~~~~~~ -In order to reduce memory usage and task overheads, Datasets will automatically fuse together +In order to reduce memory usage and task overheads, Ray Data will automatically fuse together lazy operations that are compatible: * Same compute pattern: embarrassingly parallel map vs. all-to-all shuffle @@ -241,34 +220,34 @@ You can tell if stage fusion is enabled by checking the :ref:`Dataset stats ` with batch size small enough such that the output batch can comfortably fit into memory. +Large block size can lead to potential out-of-memory situations. To avoid these issues, make sure no single item in your Ray Data is too large, and always call :meth:`ds.map_batches() ` with batch size small enough such that the output batch can comfortably fit into memory. Object Store Memory ~~~~~~~~~~~~~~~~~~~ -Datasets uses the Ray object store to store data blocks, which means it inherits the memory management features of the Ray object store. This section discusses the relevant features: +Ray Data uses the Ray object store to store data blocks, which means it inherits the memory management features of the Ray object store. This section discusses the relevant features: -* Object Spilling: Since Datasets uses the Ray object store to store data blocks, any blocks that can't fit into object store memory are automatically spilled to disk. The objects are automatically reloaded when needed by downstream compute tasks: +* Object Spilling: Since Ray Data uses the Ray object store to store data blocks, any blocks that can't fit into object store memory are automatically spilled to disk. 
The objects are automatically reloaded when needed by downstream compute tasks: * Locality Scheduling: Ray will preferentially schedule compute tasks on nodes that already have a local copy of the object, reducing the need to transfer objects between nodes in the cluster. * Reference Counting: Dataset blocks are kept alive by object store reference counting as long as there is any Dataset that references them. To free memory, delete any Python references to the Dataset object. Block Data Formats ~~~~~~~~~~~~~~~~~~ -In order to optimize conversion costs, Datasets can hold tabular data in-memory +In order to optimize conversion costs, Ray Data can hold tabular data in-memory as either `Arrow Tables `__ or `Pandas DataFrames `__. -Different ways of creating Datasets leads to a different starting internal format: +Different ways of creating Ray Data leads to a different starting internal format: * Reading tabular files (Parquet, CSV, JSON) creates Arrow blocks initially. * Converting from Pandas, Dask, Modin, and Mars creates Pandas blocks initially. @@ -276,5 +255,5 @@ Different ways of creating Datasets leads to a different starting internal forma * Reading TFRecord file creates Arrow blocks. * Reading MongoDB creates Arrow blocks. -However, this internal format is not exposed to the user. Datasets converts between formats +However, this internal format is not exposed to the user. Ray Data converts between formats as needed internally depending on the specified ``batch_format`` of transformations. diff --git a/doc/source/data/data.rst b/doc/source/data/data.rst new file mode 100644 index 000000000000..95624773d7d2 --- /dev/null +++ b/doc/source/data/data.rst @@ -0,0 +1,225 @@ +.. include:: /_includes/data/announcement.rst + +.. _data: + +================================== +Ray Data: Scalable Datasets for ML +================================== + +.. 
_data-intro: + +Ray Data scales common ML data processing patterns in batch inference +and distributed training applications. Ray Data does this by providing +streaming distributed transformations +such as maps (:meth:`map_batches `), +global and grouped aggregations (:class:`GroupedData `), and +shuffling operations (:meth:`random_shuffle `, +:meth:`sort `, +:meth:`repartition `). + +Read on for an overview of the main use cases and operations supported by Ray Data. + +.. image:: images/dataset.svg + +.. + https://docs.google.com/drawings/d/16AwJeBNR46_TsrkOmMbGaBK7u-OPsf_V8fHjU-d2PPQ/edit + +------------------------- +Streaming Batch Inference +------------------------- + +Ray Data simplifies general purpose parallel GPU and CPU compute in Ray through its +powerful streaming :ref:`Dataset ` primitive. Datasets enable workloads such as +:doc:`GPU batch inference ` to run efficiently on large datasets, +maximizing resource utilization by streaming the working data through Ray object store memory. + +.. image:: images/stream-example.png + :width: 650px + :align: center + +.. + https://docs.google.com/presentation/d/1l03C1-4jsujvEFZUM4JVNy8Ju8jnY5Lc_3q7MBWi2PQ/edit#slide=id.g230eb261ad2_0_0 + +As part of the Ray ecosystem, Ray Data can leverage the full functionality of Ray's distributed scheduler, +e.g., using actors for optimizing setup time and GPU scheduling, and supports data throughputs of +100GiB/s or more for common inference workloads. + +To learn more about the features Ray Data supports, read the +:ref:`Data User Guide `. + +--------------------------------------- +Streaming Preprocessing for ML Training +--------------------------------------- + +Use Ray Data to load and preprocess data for distributed :ref:`ML training pipelines ` in a streaming fashion. +Ray Data serves as a last-mile bridge from storage or ETL pipeline outputs to distributed +applications and libraries in Ray. Don't use it as a replacement for more general data +processing systems. 
+
+.. image:: images/dataset-loading-1.png
+   :width: 650px
+   :align: center
+
+..
+   https://docs.google.com/presentation/d/1l03C1-4jsujvEFZUM4JVNy8Ju8jnY5Lc_3q7MBWi2PQ/edit
+
+----------------------
+Where to Go from Here?
+----------------------
+
+As a new user of Ray Data, you may want to start with our :ref:`Getting Started Guide `.
+If you've run your first examples already, you might want to dive into Ray Data's
+:ref:`key concepts ` or our :ref:`User Guide ` instead.
+Advanced users can refer directly to the Ray Data :ref:`API reference ` for their projects.
+
+.. grid:: 1 2 2 2
+    :gutter: 1
+    :class-container: container pb-6
+
+    .. grid-item-card::
+
+        **Getting Started**
+        ^^^
+
+        Start with our quick start tutorials for working with Data.
+        These concrete examples will give you an idea of how to use Ray Data.
+
+        +++
+        .. button-ref:: data_getting_started
+            :color: primary
+            :outline:
+            :expand:
+
+            Get Started with Ray Data
+
+    .. grid-item-card::
+
+        **Key Concepts**
+        ^^^
+
+        Understand the key concepts behind Ray Data.
+        Learn what :ref:`Datasets ` are and how they are executed in Ray
+        Data.
+
+        +++
+        .. button-ref:: data_key_concepts
+            :color: primary
+            :outline:
+            :expand:
+
+            Learn Key Concepts
+
+    .. grid-item-card::
+
+        **User Guides**
+        ^^^
+
+        Learn how to :ref:`load data `, :ref:`save
+        data `, :ref:`transform data `,
+        :ref:`access and exchange data `, or
+        :ref:`work with tensor data `.
+
+        +++
+        .. button-ref:: data_user_guide
+            :color: primary
+            :outline:
+            :expand:
+
+            Start Using Ray Data
+
+    .. grid-item-card::
+
+        **Examples**
+        ^^^
+
+        Find both simple and scaling-out examples of using Ray Data for data
+        processing and ML ingest.
+
+        +++
+        .. button-ref:: data-recipes
+            :color: primary
+            :outline:
+            :expand:
+
+            Ray Data Examples
+
+    .. grid-item-card::
+
+        **Ray Data FAQ**
+        ^^^
+
+        Find answers to commonly asked questions in our detailed FAQ.
+
+        +++
+        .. 
button-ref:: data_faq + :color: primary + :outline: + :expand: + + Ray Data FAQ + + .. grid-item-card:: + + **API** + ^^^ + + Get more in-depth information about the Ray Data API. + + +++ + .. button-ref:: data-api + :color: primary + :outline: + :expand: + + Read the API Reference + + .. grid-item-card:: + + **Other Data Processing Solutions** + ^^^ + + For running ETL pipelines, check out :ref:`Spark-on-Ray `. For scaling + up your data science workloads, check out :ref:`Dask-on-Ray `, + :ref:`Modin `, and :ref:`Mars-on-Ray `. + + +++ + .. button-ref:: integrations + :color: primary + :outline: + :expand: + + Check Out Other Data Processing Options + + +------------------------ +Datasource Compatibility +------------------------ + +Ray Data supports reading and writing many file formats. +To view supported formats, read the :ref:`Input/Output reference `. + +If your use case isn't supported, reach out on `Discourse `__ or open a feature +request on the `Ray GitHub repo `__, and check out +our :ref:`guide for implementing a custom datasource ` +if you're interested in rolling your own integration! + +---------- +Learn More +---------- + +- `[Blog] Streaming distributed execution across CPUs and GPUs `__ +- `[Blog] Offline Batch Inference: Comparing Ray, Apache Spark, and SageMaker `__ +- `[Blog] Using Ray Data to parallelize LangChain inference `__ + +---------- +Contribute +---------- + +Contributions to Ray Data are :ref:`welcome `! +There are many potential improvements, including: + +- Supporting more data sources and transforms. +- Integration with more ecosystem libraries. +- Performance optimizations. + +.. include:: /_includes/data/announcement_bottom.rst diff --git a/doc/source/data/dataset-tensor-support.rst b/doc/source/data/dataset-tensor-support.rst deleted file mode 100644 index bed848b8fd02..000000000000 --- a/doc/source/data/dataset-tensor-support.rst +++ /dev/null @@ -1,250 +0,0 @@ -.. 
_datasets_tensor_support: - -ML Tensor Support -================= - -Tensor (multi-dimensional array) data is ubiquitous in ML workloads. However, popular data formats such as Pandas, Parquet, and Arrow don't natively support tensor data types. To bridge this gap, Datasets provides a unified tensor data type that can be used to represent, transform, and store tensor data: - -* For Pandas, Datasets will transparently convert ``List[np.ndarray]`` columns to and from the :class:`TensorDtype ` extension type. -* For Parquet, Datasets has an Arrow extension :class:`ArrowTensorType ` that allows tensors to be loaded from and stored in the Parquet format. -* In addition, single-column tensor datasets can be created from NumPy (.npy) files. - -Datasets automatically converts between the extension types/arrays above. This means you can think of a ``Tensor`` as a first-class data type in Datasets. - -Creating Tensor Datasets ------------------------- - -This section shows how to create single and multi-column tensor datasets. - -.. tabbed:: Synthetic Data - - Create a synthetic tensor dataset from a range of integers. - - **Single-column only**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_range_begin__ - :end-before: __create_range_end__ - -.. tabbed:: Pandas UDF - - Create tensor datasets by returning ``List[np.ndarray]`` columns from a Pandas - :ref:`user-defined function `. - - **Single-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_pandas_begin__ - :end-before: __create_pandas_end__ - - **Multi-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_pandas_2_begin__ - :end-before: __create_pandas_2_end__ - -.. tabbed:: NumPy - - Create from in-memory NumPy data or from previously saved NumPy (.npy) files. - - **Single-column only**: - - .. 
literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_numpy_begin__ - :end-before: __create_numpy_end__ - -.. tabbed:: Parquet - - There are two ways to construct a Parquet tensor dataset: (1) loading a - previously-saved tensor dataset, or (2) casting non-tensor Parquet columns to tensor - type. When casting data, a tensor schema or deserialization - :ref:`user-defined function ` must be provided. The - following are examples for each method. - - **Previously-saved tensor datasets**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_parquet_1_begin__ - :end-before: __create_parquet_1_end__ - - **Cast from data stored in C-contiguous format**: - - For tensors stored as raw NumPy ndarray bytes in C-contiguous order (e.g., via - `ndarray.tobytes() `__), all you need to specify is the tensor column schema. The following is an end-to-end example: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_parquet_2_begin__ - :end-before: __create_parquet_2_end__ - - **Cast from data stored in custom formats**: - - For tensors stored in other formats (e.g., pickled), you can specify a deserializer - :ref:`user-defined function ` that returns - :class:`~ray.data.extensions.tensor_extension.TensorArray` columns: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_parquet_3_begin__ - :end-before: __create_parquet_3_end__ - -.. tabbed:: Images - - Load image data stored as individual files using :func:`~ray.data.read_images`: - - **Image and label columns**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_images_begin__ - :end-before: __create_images_end__ - -.. note:: - - By convention, single-column tensor datasets are represented with a single ``__value__`` column. - This kind of dataset will be converted automatically to/from NumPy ndarray format in all transformation and consumption APIs. 
- -Transforming / Consuming Tensor Data ------------------------------------- - -Like any other Dataset, Datasets with tensor columns can be consumed / transformed in batches via the :meth:`ds.iter_batches(batch_format=\) ` and :meth:`ds.map_batches(fn, batch_format=\) ` APIs. This section shows the available batch formats and their behavior: - -.. tabbed:: "default" - - **Single-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_native_begin__ - :end-before: __consume_native_end__ - - **Multi-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_native_2_begin__ - :end-before: __consume_native_2_end__ - -.. tabbed:: "pandas" - - **Single-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_pandas_begin__ - :end-before: __consume_pandas_end__ - - **Multi-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_pandas_2_begin__ - :end-before: __consume_pandas_2_end__ - -.. tabbed:: "pyarrow" - - **Single-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_pyarrow_begin__ - :end-before: __consume_pyarrow_end__ - - **Multi-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_pyarrow_2_begin__ - :end-before: __consume_pyarrow_2_end__ - -.. tabbed:: "numpy" - - **Single-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_numpy_begin__ - :end-before: __consume_numpy_end__ - - **Multi-column**: - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __consume_numpy_2_begin__ - :end-before: __consume_numpy_2_end__ - -Saving Tensor Datasets ----------------------- - -Because tensor datasets rely on Datasets-specific extension types, they can only be -saved in formats that preserve Arrow metadata (currently only Parquet). 
In addition, -single-column tensor datasets can be saved in NumPy format. - -.. tabbed:: Parquet - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __write_1_begin_ - :end-before: __write_1_end__ - -.. tabbed:: NumPy - - .. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __write_2_begin_ - :end-before: __write_2_end__ - -.. _ragged_tensor_support: - -Ragged Tensor Support ---------------------- - -`Ragged tensors `__, i.e. tensors with non-uniform dimensions, pop up in NLP -(`textual sentences/documents of different lengths `__, -`N-grams `__), -computer vision (images of differing resolution, -`ssd300_vgg16 detection outputs `__), -and audio ML (differing durations). Datasets has basic support for ragged tensors, -namely tensors that are a collection (batch) of variably-shaped subtensors, e.g. a batch -of images of differing sizes or a batch of sentences of differing lengths. - -.. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __create_variable_shaped_tensors_begin__ - :end-before: __create_variable_shaped_tensors_end__ - -These variable-shaped tensors can be exchanged with popular training frameworks that support ragged tensors, such as `TensorFlow `__. - -.. literalinclude:: ./doc_code/tensor.py - :language: python - :start-after: __tf_variable_shaped_tensors_begin__ - :end-before: __tf_variable_shaped_tensors_end__ - -.. _disable_tensor_extension_casting: - -Disabling Tensor Extension Casting ----------------------------------- - -To disable automatic casting of Pandas and Arrow arrays to -:class:`~ray.data.extensions.tensor_extension.TensorArray`, run the code -below. - -.. code-block:: - - from ray.data import DataContext - - ctx = DataContext.get_current() - ctx.enable_tensor_extension_casting = False - - -Limitations ------------ - -The following are current limitations of tensor datasets. - -* Arbitrarily `nested/ragged tensors `__ are not supported. 
Only tensors with all uniform dimensions (i.e. a fully well-defined shape) and tensors representing a collection of variable-shaped tensor elements (e.g. a collection of images with different shapes) are supported; arbitrary raggedness and nested ragged tensors is not supported. diff --git a/doc/source/data/dataset.rst b/doc/source/data/dataset.rst deleted file mode 100644 index d55e77b4a8ae..000000000000 --- a/doc/source/data/dataset.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. include:: /_includes/data/announcement.rst - -.. _datasets: - -============================================ -Ray Datasets: Distributed Data Preprocessing -============================================ - -.. _datasets-intro: - -Ray Datasets are the standard way to load and exchange data in Ray libraries and applications. -They provide basic distributed data transformations such as maps -(:meth:`map_batches `), -global and grouped aggregations (:class:`GroupedData `), and -shuffling operations (:meth:`random_shuffle `, -:meth:`sort `, -:meth:`repartition `), -and are compatible with a variety of file formats, data sources, and distributed frameworks. - -Here's an overview of the integrations with other processing frameworks, file formats, and supported operations, -as well as a glimpse at the Ray Datasets API. - -Check the :ref:`Input/Output reference ` to see if your favorite format -is already supported. - -.. image:: images/dataset.svg - -.. - https://docs.google.com/drawings/d/16AwJeBNR46_TsrkOmMbGaBK7u-OPsf_V8fHjU-d2PPQ/edit - - ----------------------------------------------- -Data Loading and Preprocessing for ML Training ----------------------------------------------- - -Use Ray Datasets to load and preprocess data for distributed :ref:`ML training pipelines `. -Compared to other loading solutions, Datasets are more flexible (e.g., can express higher-quality per-epoch global shuffles) and provides `higher overall performance `__. 
- -Use Datasets as a last-mile bridge from storage or ETL pipeline outputs to distributed -applications and libraries in Ray. Don't use it as a replacement for more general data -processing systems. - -.. image:: images/dataset-loading-1.png - :width: 650px - :align: center - -.. - https://docs.google.com/presentation/d/1l03C1-4jsujvEFZUM4JVNy8Ju8jnY5Lc_3q7MBWi2PQ/edit - -To learn more about the features Datasets supports, read the -:ref:`Datasets User Guide `. - ------------------------------ -Datasets for Parallel Compute ------------------------------ - -Datasets also simplify general purpose parallel GPU and CPU compute in Ray; for -instance, for :ref:`GPU batch inference `. -They provide a higher-level API for Ray tasks and actors for such embarrassingly parallel compute, -internally handling operations like batching, pipelining, and memory management. - -.. image:: images/dataset-compute-1.png - :width: 500px - :align: center - -As part of the Ray ecosystem, Ray Datasets can leverage the full functionality of Ray's distributed scheduler, -e.g., using actors for optimizing setup time and GPU scheduling. - ----------------------- -Where to Go from Here? ----------------------- - -As new user of Ray Datasets, you may want to start with our :ref:`Getting Started guide`. -If you've run your first examples already, you might want to dive into Ray Datasets' -:ref:`key concepts ` or our :ref:`User Guide ` instead. -Advanced users can refer directly to the Ray Datasets :ref:`API reference ` for their projects. - -.. panels:: - :container: text-center - :column: col-lg-6 px-2 py-2 - :card: - - **Getting Started** - ^^^ - - Start with our quick start tutorials for working with Datasets. - These concrete examples will give you an idea of how to use Ray Datasets. - - +++ - .. 
link-button:: datasets_getting_started - :type: ref - :text: Get Started with Ray Datasets - :classes: btn-outline-info btn-block - --- - - **Key Concepts** - ^^^ - - Understand the key concepts behind Ray Datasets. - Learn what :ref:`Datasets ` are and how they are executed in Ray - Datasets. - - +++ - .. link-button:: data_key_concepts - :type: ref - :text: Learn Key Concepts - :classes: btn-outline-info btn-block - --- - - **User Guides** - ^^^ - - Learn how to :ref:`create datasets `, :ref:`save - datasets `, :ref:`transform datasets `, - :ref:`access and exchange datasets `, :ref:`pipeline - transformations `, or - :ref:`work with tensor data `. - - +++ - .. link-button:: data_user_guide - :type: ref - :text: Start Using Ray Datasets - :classes: btn-outline-info btn-block - --- - - **Examples** - ^^^ - - Find both simple and scaling-out examples of using Ray Datasets for data - processing and ML ingest. - - +++ - .. link-button:: datasets-recipes - :type: ref - :text: Ray Datasets Examples - :classes: btn-outline-info btn-block - --- - - **Ray Datasets FAQ** - ^^^ - - Find answers to commonly asked questions in our detailed FAQ. - - +++ - .. link-button:: datasets_faq - :type: ref - :text: Ray Datasets FAQ - :classes: btn-outline-info btn-block - --- - - **API** - ^^^ - - Get more in-depth information about the Ray Datasets API. - - +++ - .. link-button:: data-api - :type: ref - :text: Read the API Reference - :classes: btn-outline-info btn-block - --- - - **Other Data Processing Solutions** - ^^^ - - For running ETL pipelines, check out :ref:`Spark-on-Ray `. For scaling - up your data science workloads, check out :ref:`Dask-on-Ray `, - :ref:`Modin `, and :ref:`Mars-on-Ray `. - - +++ - .. link-button:: integrations - :type: ref - :text: Check Out Other Data Processing Options - :classes: btn-outline-info btn-block - ------------------------- -Datasource Compatibility ------------------------- - -Ray Datasets supports reading and writing many file formats. 
-To view supported formats, read the :ref:`Input/Output reference `. - -If your use case isn't supported, reach out on `Discourse `__ or open a feature -request on the `Ray GitHub repo `__, and check out -our :ref:`guide for implementing a custom Datasets datasource ` -if you're interested in rolling your own integration! - -.. _data-talks: - ----------- -Learn More ----------- - -- [slides] `Talk given at PyData 2021 `_ -- [blog] `Data Ingest in a Third Generation ML Architecture `_ -- [blog] `Building an end-to-end ML pipeline using Mars and XGBoost on Ray `_ -- [blog] `Ray Datasets for large-scale machine learning ingest and scoring `_ - ----------- -Contribute ----------- - -Contributions to Ray Datasets are :ref:`welcome `! -There are many potential improvements, including: - -- Supporting more data sources and transforms. -- Integration with more ecosystem libraries. -- Performance optimizations. - -.. include:: /_includes/data/announcement_bottom.rst diff --git a/doc/source/data/doc_code/batch_formats.py b/doc/source/data/doc_code/batch_formats.py new file mode 100644 index 000000000000..2099f70e9bb8 --- /dev/null +++ b/doc/source/data/doc_code/batch_formats.py @@ -0,0 +1,80 @@ +# flake8: noqa +# isort: skip_file +# fmt: off + +# __simple_map_function_start__ +import ray + +ds = ray.data.read_csv("example://iris.csv") + +def map_function(data): + return data[data["sepal.length"] < 5] + +batch = ds.take_batch(10, batch_format="pandas") +mapped_batch = map_function(batch) + +transformed = ds.map_batches(map_function, batch_format="pandas", batch_size=10) +# __simple_map_function_end__ + +# __simple_pandas_start__ +import ray +import pandas as pd + +ds = ray.data.read_csv("example://iris.csv") +ds.show(1) +# -> {'sepal.length': 5.1, ..., 'petal.width': 0.2, 'variety': 'Setosa'} + +def transform_pandas(df_batch: pd.DataFrame) -> pd.DataFrame: + df_batch = df_batch[df_batch["variety"] == "Versicolor"] + df_batch.loc[:, "normalized.sepal.length"] = 
df_batch["sepal.length"] / df_batch["sepal.length"].max() + df_batch = df_batch.drop(columns=["sepal.length"]) + return df_batch + +ds.map_batches(transform_pandas, batch_format="pandas").show(1) +# -> {..., 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} +# __simple_pandas_end__ + +# __simple_numpy_start__ +from typing import Dict + +import ray +import numpy as np +from typing import Dict + + +ds = ray.data.range_tensor(1000, shape=(2, 2)) + +def transform_numpy(arr: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + arr["data"] = arr["data"] * 2 + return arr + + +# test map function on a batch +batch = ds.take_batch(1) +mapped_batch = transform_numpy(batch) + +ds.map_batches(transform_numpy) +# __simple_numpy_end__ + + +# __simple_pyarrow_start__ +import ray +import pyarrow as pa +import pyarrow.compute as pac + +ds = ray.data.read_csv("example://iris.csv") + + +def transform_pyarrow(batch: pa.Table) -> pa.Table: + batch = batch.filter(pac.equal(batch["variety"], "Versicolor")) + return batch.drop(["sepal.length"]) + + +# test map function on a batch +batch = ds.take_batch(1, batch_format="pyarrow") +mapped_batch = transform_pyarrow(batch) + +ds.map_batches(transform_pyarrow, batch_format="pyarrow").show(1) +# -> {'sepal.width': 3.2, ..., 'variety': 'Versicolor'} +# __simple_pyarrow_end__ +# fmt: on diff --git a/doc/source/data/doc_code/consuming_datasets.py b/doc/source/data/doc_code/consuming_data.py similarity index 70% rename from doc/source/data/doc_code/consuming_datasets.py rename to doc/source/data/doc_code/consuming_data.py index 1d946d5b5458..3f7a41806dbb 100644 --- a/doc/source/data/doc_code/consuming_datasets.py +++ b/doc/source/data/doc_code/consuming_data.py @@ -8,26 +8,21 @@ # Take up to five records as a batch. print(ds.take(5)) -# -> [0, 1, 2, 3, 4] +# -> [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}] -# Similar to above but returning in a batch format (like iter_batches / map_batches). 
-print(ds.take_batch(5, batch_format="pandas")) -# -> value -# 0 0 -# 1 1 -# 2 2 -# 3 3 -# 4 4 +# Similar to above but returning in a batch format. +print(ds.take_batch(5)) +# -> {'id': array([0, 1, 2, 3, 4])} # Warning: This will print all of the rows! print(ds.take_all()) ds.show(5) -# -> 0 -# 1 -# 2 -# 3 -# 4 +# -> {'id': 0} +# {'id': 1} +# {'id': 2} +# {'id': 3} +# {'id': 4} # __take_end__ # fmt: on @@ -40,7 +35,7 @@ # Consume all rows in the Dataset. for row in ds.iter_rows(): - assert isinstance(row, int) + assert isinstance(row, dict) num_rows += 1 print(num_rows) @@ -58,7 +53,7 @@ # Consume all batches in the Dataset. for batch in ds.iter_batches(batch_size=2): - assert isinstance(batch, list) + assert isinstance(batch, dict) num_batches += 1 print(num_batches) @@ -69,7 +64,7 @@ for batch in ds.iter_batches(batch_size=2, batch_format="pandas"): assert isinstance(batch, pd.DataFrame) # Simple integer Dataset is converted to a single-column Pandas DataFrame. - cum_sum += batch["value"] + cum_sum += batch["id"] print(cum_sum) # -> 49995000 @@ -81,11 +76,11 @@ import ray @ray.remote -def consume(data: ray.data.Dataset[int]) -> int: +def consume(data: ray.data.Dataset) -> int: num_batches = 0 # Consume data in 2-record batches. for batch in data.iter_batches(batch_size=2): - assert len(batch) == 2 + assert len(batch["id"]) == 2 num_batches += 1 return num_batches @@ -103,10 +98,11 @@ class Worker: def __init__(self, rank: int): pass - def train(self, shard: ray.data.Dataset[int]) -> int: + def train(self, shard: ray.data.DataIterator) -> int: + total = 0 for batch in shard.iter_torch_batches(batch_size=256): - pass - return shard.count() + total += len(batch["id"]) + return total workers = [Worker.remote(i) for i in range(4)] # -> [Actor(Worker, ...), Actor(Worker, ...), ...] 
@@ -114,9 +110,11 @@ def train(self, shard: ray.data.Dataset[int]) -> int: ds = ray.data.range(10000) # -> Dataset(num_blocks=200, num_rows=10000, schema=) -shards = ds.split(n=4, locality_hints=workers) -# -> [Dataset(num_blocks=13, num_rows=2500, schema=), -# Dataset(num_blocks=13, num_rows=2500, schema=), ...] +shards = ds.streaming_split(n=4, equal=True) +# -> [, +# , +# , +# ] ray.get([w.train.remote(s) for w, s in zip(workers, shards)]) # -> [2500, 2500, 2500, 2500] diff --git a/doc/source/data/doc_code/hf_quick_start.py b/doc/source/data/doc_code/hf_quick_start.py new file mode 100644 index 000000000000..e54a7a0131c4 --- /dev/null +++ b/doc/source/data/doc_code/hf_quick_start.py @@ -0,0 +1,96 @@ +# flake8: noqa +# isort: skip_file +# fmt: off + +# __hf_super_quick_start__ +import ray +import numpy as np +from typing import Dict + +ds = ray.data.from_numpy(np.asarray(["Complete this", "for me"])) + +class HuggingFacePredictor: + def __init__(self): + from transformers import pipeline + self.model = pipeline("text-generation", model="gpt2") + + def __call__(self, batch: Dict[str, np.ndarray]): + model_out = self.model(list(batch["data"]), max_length=20, num_return_sequences=1) + batch["output"] = [sequence[0]["generated_text"] for sequence in model_out] + return batch + +scale = ray.data.ActorPoolStrategy(size=2) +predictions = ds.map_batches(HuggingFacePredictor, compute=scale) +predictions.show(limit=1) +# __hf_super_quick_end__ + +# __hf_no_ray_start__ +import numpy as np +from typing import Dict +from transformers import pipeline + +batches = {"data": np.asarray(["Complete this", "for me"])} + +model = pipeline("text-generation", model="gpt2") + +def transform(batch: Dict[str, np.ndarray]): + return model(list(batch["data"]), max_length=20) + +results = transform(batches) +# __hf_no_ray_end__ + +# __hf_quickstart_load_start__ +import ray +import numpy as np +from typing import Dict + +ds = ray.data.from_numpy(np.asarray(["Complete this", "for me"])) +# 
__hf_quickstart_load_end__ + + +# __hf_quickstart_model_start__ +class HuggingFacePredictor: + def __init__(self): # <1> + from transformers import pipeline + self.model = pipeline("text-generation", model="gpt2") + + def __call__(self, batch: Dict[str, np.ndarray]): # <2> + model_out = self.model(list(batch["data"]), max_length=20, num_return_sequences=1) + batch["output"] = [sequence[0]["generated_text"] for sequence in model_out] + return batch +# __hf_quickstart_model_end__ + + +# __hf_quickstart_prediction_test_start__ +hfp = HuggingFacePredictor() +batch = ds.take_batch(10) +test = hfp(batch) +# __hf_quickstart_prediction_test_end__ + + +# __hf_quickstart_prediction_start__ +scale = ray.data.ActorPoolStrategy(size=2) +predictions = ds.map_batches(HuggingFacePredictor, compute=scale) + +predictions.show(limit=1) +# [{'generated_text': 'Complete these sentences until you understand them.'}] +# __hf_quickstart_prediction_end__ + +# __hf_quickstart_air_start__ +import pandas as pd +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.pipelines import pipeline +from ray.train.huggingface import HuggingFacePredictor + + +tokenizer = AutoTokenizer.from_pretrained("sgugger/gpt2-like-tokenizer") +model_config = AutoConfig.from_pretrained("gpt2") +model = AutoModelForCausalLM.from_config(model_config) +pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer) + +predictor = HuggingFacePredictor(pipeline=pipeline) + +prompts = pd.DataFrame(["Complete these sentences", "for me"], columns=["sentences"]) +predictions = predictor.predict(prompts) +# __hf_quickstart_air_end__ +# fmt: on diff --git a/doc/source/data/doc_code/key_concepts.py b/doc/source/data/doc_code/key_concepts.py index b0935a743ed3..5b6ec252643b 100644 --- a/doc/source/data/doc_code/key_concepts.py +++ b/doc/source/data/doc_code/key_concepts.py @@ -5,7 +5,7 @@ import ray from ray import tune -# This Dataset workload will use spare cluster resources for 
execution. +# This workload will use spare cluster resources for execution. def objective(*args): ray.data.range(10).show() @@ -13,7 +13,7 @@ def objective(*args): ray.init(num_cpus=4) # By setting `max_concurrent_trials=3`, this ensures the cluster will always -# have a sparse CPU for Datasets. Try setting `max_concurrent_trials=4` here, +# have a sparse CPU for Dataset. Try setting `max_concurrent_trials=4` here, # and notice that the experiment will appear to hang. tuner = tune.Tuner( tune.with_resources(objective, {"cpu": 1}), @@ -33,7 +33,7 @@ def objective(*args): import ray from ray import tune -# This Dataset workload will use reserved cluster resources for execution. +# This workload will use reserved cluster resources for execution. def objective(*args): ray.data.range(10).show() @@ -41,7 +41,7 @@ def objective(*args): ray.init(num_cpus=4) # This runs smoothly since _max_cpu_fraction_per_node is set to 0.8, effectively -# reserving 1 CPU for Datasets task execution. +# reserving 1 CPU for Dataset task execution. tuner = tune.Tuner( tune.with_resources(objective, tune.PlacementGroupFactory( [{"CPU": 1}], diff --git a/doc/source/data/doc_code/creating_datasets.py b/doc/source/data/doc_code/loading_data.py similarity index 65% rename from doc/source/data/doc_code/creating_datasets.py rename to doc/source/data/doc_code/loading_data.py index 05e79cdc0194..5ddd111f0f6a 100644 --- a/doc/source/data/doc_code/creating_datasets.py +++ b/doc/source/data/doc_code/loading_data.py @@ -9,61 +9,22 @@ # For tfrecords ray.init(runtime_env={"pip": ["tensorflow_metadata"]}) -# fmt: off -# __gen_synth_int_range_begin__ -# Create a Dataset of Python objects. -ds = ray.data.range(10000) -# -> Dataset(num_blocks=200, num_rows=10000, schema=) - -ds.take(5) -# -> [0, 1, 2, 3, 4] -# __gen_synth_int_range_end__ -# fmt: on - # fmt: off # __gen_synth_tabular_range_begin__ -# Create a Dataset of Arrow records. 
-ds = ray.data.range_table(10000) -# -> Dataset(num_blocks=200, num_rows=10000, schema={value: int64}) +# Create a Dataset of integers. +ds = ray.data.range(10000) +# -> Dataset(num_blocks=200, num_rows=10000, schema={id: int64}) -ds.take(5) -# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': 4}] +ds.take_batch(5) +# -> {'id': array([0, 1, 2, 3, 4])} # __gen_synth_tabular_range_end__ # fmt: on -# fmt: off -# __gen_synth_tensor_range_begin__ -# Create a Dataset of tensors. -ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64)) -# -> Dataset( -# num_blocks=200, -# num_rows=409600, -# schema={__value__: numpy.ndarray(shape=(64, 64), dtype=int64)} -# ) - -ds.take(2) -# -> [array([[0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# ..., -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0]]), -# array([[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]])] -# __gen_synth_tensor_range_end__ -# fmt: on - # fmt: off # __from_items_begin__ -# Create a Dataset of tabular (Arrow) records. +# Create a Dataset from python dicts. ds = ray.data.from_items([{"col1": i, "col2": str(i)} for i in range(10000)]) -# -> Dataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string}) +# -> MaterializedDataset(num_blocks=200, num_rows=10000, schema={col1: int64, col2: string}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -76,10 +37,10 @@ # __from_pandas_begin__ import pandas as pd -# Create a tabular Dataset from a Pandas DataFrame. +# Create a Dataset from a Pandas DataFrame. 
df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))}) ds = ray.data.from_pandas(df) -# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object}) +# -> MaterializedDataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: object}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -100,9 +61,9 @@ pd.DataFrame({"col1": list(chunk), "col2": list(map(str, chunk))}) for chunk in chunks ] -# Create a tabular Dataset from multiple Pandas DataFrames. +# Create a Dataset from multiple Pandas DataFrames. ds = ray.data.from_pandas(dfs) -# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}) +# -> MaterializedDataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -115,22 +76,22 @@ # __from_numpy_begin__ import numpy as np -# Create a tensor Dataset from a 3D NumPy ndarray. +# Create a Dataset from a 3D NumPy ndarray. arr = np.ones((3, 4, 4)) # The outer dimension is treated as the row dimension. 
ds = ray.data.from_numpy(arr) -# -> Dataset( +# -> MaterializedDataset( # num_blocks=1, # num_rows=3, -# schema={__value__: numpy.ndarray(shape=(4, 4), dtype=double)} +# schema={data: numpy.ndarray(shape=(4, 4), dtype=double)} # ) ds.show(2) -# -> {'value': array([[1., 1., 1., 1.], +# -> {'data': array([[1., 1., 1., 1.], # [1., 1., 1., 1.], # [1., 1., 1., 1.], # [1., 1., 1., 1.]])} -# -> {'value': array([[1., 1., 1., 1.], +# -> {'data': array([[1., 1., 1., 1.], # [1., 1., 1., 1.], # [1., 1., 1., 1.], # [1., 1., 1., 1.]])} @@ -140,17 +101,11 @@ # fmt: off # __read_images_begin__ ds = ray.data.read_images("example://image-datasets/simple") -# -> Dataset(num_blocks=3, num_rows=3, +# -> Dataset(num_blocks=3, num_rows=3, # schema={image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}) ds.take(1) -# -> [array([[[ 88, 70, 68], -# [103, 88, 85], -# [112, 96, 97], -# ..., -# [168, 151, 81], -# [167, 149, 83], -# [166, 148, 82]]], dtype=uint8)] +# -> [{'image': array([[[ 88, 70, 68], ...]]), dtype=uint8)}] # __read_images_end__ # fmt: on @@ -158,22 +113,22 @@ # __from_numpy_mult_begin__ import numpy as np -# Create a tensor Dataset from multiple 3D NumPy ndarray. +# Create a Dataset from multiple 3D NumPy ndarray. arrs = [np.random.rand(2, 4, 4) for _ in range(4)] # The outer dimension is treated as the row dimension. 
ds = ray.data.from_numpy(arrs) -# -> Dataset( +# -> MaterializedDataset( # num_blocks=4, # num_rows=8, -# schema={__value__: numpy.ndarray(shape=(4, 4), dtype=double)} +# schema={data: numpy.ndarray(shape=(4, 4), dtype=double)} # ) ds.show(2) -# -> {'value': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549], +# -> {'data': array([[0.06587483, 0.67808656, 0.76461924, 0.83428549], # [0.04932103, 0.25112165, 0.26476714, 0.24599738], # [0.67624391, 0.58689537, 0.12594709, 0.94663371], # [0.32435665, 0.97719096, 0.03234169, 0.71563231]])} -# -> {'value': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336], +# -> {'data': array([[0.98570318, 0.65956399, 0.82168898, 0.09798336], # [0.22426704, 0.34209978, 0.02605247, 0.48200137], # [0.17312096, 0.38789983, 0.42663678, 0.92652456], # [0.80787394, 0.92437162, 0.11185822, 0.3319638 ]])} @@ -184,10 +139,10 @@ # __from_arrow_begin__ import pyarrow as pa -# Create a tabular Dataset from an Arrow Table. +# Create a Dataset from an Arrow Table. t = pa.table({"col1": list(range(10000)), "col2": list(map(str, range(10000)))}) ds = ray.data.from_arrow(t) -# -> Dataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string}) +# -> MaterializedDataset(num_blocks=1, num_rows=10000, schema={col1: int64, col2: string}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -208,9 +163,9 @@ pa.table({"col1": list(chunk), "col2": list(map(str, chunk))}) for chunk in chunks ] -# Create a tabular Dataset from multiple Arrow Tables. +# Create a Dataset from multiple Arrow Tables. ds = ray.data.from_arrow(ts) -# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}) +# -> MaterializedDataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -226,9 +181,9 @@ df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))}) ddf = dd.from_pandas(df, npartitions=4) -# Create a tabular Dataset from a Dask DataFrame. 
+# Create a Dataset from a Dask DataFrame. ds = ray.data.from_dask(ddf) -# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}) +# -> MaterializedDataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: object}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -243,9 +198,9 @@ df = pd.DataFrame({"col1": list(range(10000)), "col2": list(map(str, range(10000)))}) mdf = md.DataFrame(df) -# Create a tabular Dataset from a Modin DataFrame. +# Create a Dataset from a Modin DataFrame. ds = ray.data.from_modin(mdf) -# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}) +# -> MaterializedDataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -256,7 +211,7 @@ # fmt: off # __read_parquet_begin__ -# Create a tabular Dataset by reading a Parquet file. +# Create a Dataset by reading a Parquet file. ds = ray.data.read_parquet("example://iris.parquet") # -> Dataset( # num_blocks=1, @@ -292,7 +247,7 @@ # __read_parquet_pushdown_begin__ import pyarrow as pa -# Create a tabular Dataset by reading a Parquet file, pushing column selection and row +# Create a Dataset by reading a Parquet file, pushing column selection and row # filtering down to the file scan. ds = ray.data.read_parquet( "example://iris.parquet", @@ -309,7 +264,7 @@ # fmt: off # __read_csv_begin__ -# Create a tabular Dataset by reading a CSV file. +# Create a Dataset by reading a CSV file. ds = ray.data.read_csv("example://iris.csv") # -> Dataset( # num_blocks=1, @@ -343,7 +298,7 @@ # fmt: off # __read_json_begin__ -# Create a tabular Dataset by reading a JSON file. +# Create a Dataset by reading a JSON file. ds = ray.data.read_json("example://iris.json") # -> Dataset( # num_blocks=1, @@ -377,32 +332,29 @@ # fmt: off # __read_numpy_begin__ -# Create a tensor Dataset by reading a NumPy file. +# Create a Dataset by reading a NumPy file. 
ds = ray.data.read_numpy("example://mnist_subset.npy") # -> Dataset( # num_blocks=1, # num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)} +# schema={data: numpy.ndarray(shape=(28, 28), dtype=uint8)} # ) ds.show(2) -# [array([[0, ...]]), array([[0, ...]])] +# -> {'data': array([[0, ...]], dtype=uint8)} +# {'data': array([[0, ...]], dtype=uint8)} # __read_numpy_end__ # fmt: on # fmt: off # __read_text_begin__ -# Create a tabular Dataset by reading a text file. +# Create a Dataset by reading a text file. ds = ray.data.read_text("example://sms_spam_collection_subset.txt") -# -> Dataset(num_blocks=1, num_rows=10, schema=) +# -> Dataset(num_blocks=1, num_rows=10, schema={text: string}) -ds.show(3) -# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e -# buffet... Cine there got amore wat... -# ham Ok lar... Joking wif u oni... -# spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA -# to 87121 to receive entry question(std txt rate)T&C's apply -# 08452810075over18's +ds.show(2) +# -> {'text': 'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'} +# {'text': 'ham\tOk lar... Joking wif u oni...'} # __read_text_end__ # fmt: on @@ -411,30 +363,25 @@ from io import BytesIO import PIL.Image -# Create a tabular Dataset by reading a binary file. +# Create a Dataset by reading a binary file. 
ds = ray.data.read_binary_files("example://mnist_subset_partitioned/0/1.png") -# -> Dataset(num_blocks=1, num_rows=1, schema=) +# -> Dataset(num_blocks=1, num_rows=1, schema={bytes: string}) -ds = ds.map(lambda bytes_: np.asarray(PIL.Image.open(BytesIO(bytes_)).convert("L"))) +ds = ds.map(lambda row: {"image": np.asarray(PIL.Image.open(BytesIO(row["bytes"])).convert("L"))}) # -> Dataset( # num_blocks=1, # num_rows=1, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)} +# schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)} # ) -ds.show(3) -# -> ham Go until jurong point, crazy.. Available only in bugis n great world la e -# buffet... Cine there got amore wat... -# ham Ok lar... Joking wif u oni... -# spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA -# to 87121 to receive entry question(std txt rate)T&C's apply -# 08452810075over18's +ds.take(1) +# -> [{'image': array([[[ 88, 70, 68], ...]]), dtype=uint8)}] # __read_binary_end__ # fmt: on # fmt: off # __read_parquet_s3_begin__ -# Create a tabular Dataset by reading a Parquet file from S3. +# Create a Dataset by reading a Parquet file from S3. ds = ray.data.read_parquet("s3://anonymous@air-example-data/ursa-labs-taxi-data/by_year/2019/01/data.parquet") # -> Dataset( # num_blocks=1, @@ -459,7 +406,7 @@ # 'passenger_count': 1, # 'trip_distance': 1.5, # 'rate_code_id': '1', -# 'store_and_fwd_flag': 'N', +# 'store_and_fwd_flag': 'N', # ..., # } # { @@ -469,7 +416,7 @@ # 'passenger_count': 1, # 'trip_distance': 2.5999999046325684, # 'rate_code_id': '1', -# 'store_and_fwd_flag': 'N', +# 'store_and_fwd_flag': 'N', # ..., # } # __read_parquet_s3_end__ @@ -487,7 +434,7 @@ # fmt: off # __read_tfrecords_begin__ -# Create a tabular Dataset by reading a TFRecord file. +# Create a Dataset by reading a TFRecord file. 
ds = ray.data.read_tfrecords("example://iris.tfrecords") # Dataset( # num_blocks=1, @@ -502,11 +449,11 @@ # ) ds.show(1) # { -# "sepal.length": 5.099999904632568, -# "sepal.width": 3.5, -# "petal.length": 1.399999976158142, -# "petal.width": 0.20000000298023224, -# "label": b"Setosa", +# 'sepal.length': 5.099999904632568, +# 'sepal.width': 3.5, +# 'petal.length': 1.399999976158142, +# 'petal.width': 0.20000000298023224, +# 'label': b'Setosa', # } # __read_tfrecords_end__ # fmt: on diff --git a/doc/source/data/doc_code/creating_datasets_untested.py b/doc/source/data/doc_code/loading_data_untested.py similarity index 94% rename from doc/source/data/doc_code/creating_datasets_untested.py rename to doc/source/data/doc_code/loading_data_untested.py index fbc7923b3b98..83d9e9f6685f 100644 --- a/doc/source/data/doc_code/creating_datasets_untested.py +++ b/doc/source/data/doc_code/loading_data_untested.py @@ -13,8 +13,8 @@ executor_memory="500MB") df = spark.createDataFrame([(i, str(i)) for i in range(10000)], ["col1", "col2"]) # Create a tabular Dataset from a Spark DataFrame. -ds = ray.data.from_dask(df) -# -> Dataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}) +ds = ray.data.from_spark(df) +# -> MaterializedDataset(num_blocks=10, num_rows=10000, schema={col1: int64, col2: string}) ds.show(3) # -> {'col1': 0, 'col2': '0'} @@ -125,7 +125,7 @@ mdf = md.DataFrame(df, num_partitions=8) # Create a tabular Dataset from a Mars DataFrame. 
ds = ray.data.from_mars(mdf) -# -> Dataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}) +# -> MaterializedDataset(num_blocks=8, num_rows=10000, schema={col1: int64, col2: object}) ds.show(3) # -> {'col1': 0, 'col2': '0'} diff --git a/doc/source/data/doc_code/pytorch_quick_start.py b/doc/source/data/doc_code/pytorch_quick_start.py new file mode 100644 index 000000000000..033e06617b8a --- /dev/null +++ b/doc/source/data/doc_code/pytorch_quick_start.py @@ -0,0 +1,101 @@ +# flake8: noqa +# isort: skip_file +# fmt: off + +# __pt_super_quick_start__ +import ray +import numpy as np +from typing import Dict +import torch +import torch.nn as nn + +ds = ray.data.from_numpy(np.ones((1, 100))) + +class TorchPredictor: + + def __init__(self): + self.model = nn.Sequential( + nn.Linear(in_features=100, out_features=1), + nn.Sigmoid(), + ) + self.model.eval() + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict: + tensor = torch.as_tensor(batch["data"], dtype=torch.float32) + with torch.inference_mode(): + return {"output": self.model(tensor).detach().numpy()} + +scale = ray.data.ActorPoolStrategy(size=2) +predictions = ds.map_batches(TorchPredictor, compute=scale) +predictions.show(limit=1) +# {'output': array([0.45092654])} +# __pt_super_quick_end__ + + +# __pt_no_ray_start__ +import torch +import torch.nn as nn +import numpy as np +from typing import Dict + +batches = {"data": np.ones((1, 100))} + +model = nn.Sequential( + nn.Linear(in_features=100, out_features=1), + nn.Sigmoid(), +) +model.eval() + +def transform(batch: Dict[str, np.ndarray]): + tensor = torch.as_tensor(batch["data"], dtype=torch.float32) + with torch.inference_mode(): + return {"output": model(tensor).detach().numpy()} + +results = transform(batches) +# __pt_no_ray_end__ + + +# __pt_quickstart_load_start__ +import ray +import numpy as np +from typing import Dict + + +ds = ray.data.from_numpy(np.ones((1, 100))) +# __pt_quickstart_load_end__ + + +# __pt_quickstart_model_start__ 
+import torch +import torch.nn as nn + +class TorchPredictor: + + def __init__(self): # <1> + self.model = nn.Sequential( + nn.Linear(in_features=100, out_features=1), + nn.Sigmoid(), + ) + self.model.eval() + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict: # <2> + tensor = torch.as_tensor(batch["data"], dtype=torch.float32) + with torch.inference_mode(): + return {"output": self.model(tensor).detach().numpy()} +# __pt_quickstart_model_end__ + + +# __pt_quickstart_prediction_test_start__ +tp = TorchPredictor() +batch = ds.take_batch(10) +test = tp(batch) +# __pt_quickstart_prediction_test_end__ + + +# __pt_quickstart_prediction_start__ +scale = ray.data.ActorPoolStrategy(size=2) +predictions = ds.map_batches(TorchPredictor, compute=scale) +predictions.show(limit=1) +# {'output': array([0.45092654])} +# __pt_quickstart_prediction_end__ +# fmt: on diff --git a/doc/source/data/doc_code/quick_start.py b/doc/source/data/doc_code/quick_start.py index fd660be1e605..5d63ddbaab55 100644 --- a/doc/source/data/doc_code/quick_start.py +++ b/doc/source/data/doc_code/quick_start.py @@ -48,13 +48,13 @@ # __create_from_files_begin__ # Create from CSV. ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") -# Dataset(num_blocks=1, num_rows=150, +# Dataset(num_blocks=1, num_rows=150, # schema={sepal length (cm): double, sepal width (cm): double, # petal length (cm): double, petal width (cm): double, target: int64}) # Create from Parquet. 
ds = ray.data.read_parquet("s3://anonymous@air-example-data/iris.parquet") -# Dataset(num_blocks=1, num_rows=150, +# Dataset(num_blocks=1, num_rows=150, # schema={sepal.length: double, sepal.width: double, # petal.length: double, petal.width: double, variety: string}) @@ -75,7 +75,7 @@ def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame: return df[(df["sepal.length"] < 5.5) & (df["petal.length"] > 3.5)] -transformed_ds = ds.map_batches(transform_batch) +transformed_ds = ds.map_batches(transform_batch, batch_format="pandas") # Dataset(num_blocks=10, num_rows=3, # schema={sepal.length: float64, sepal.width: float64, # petal.length: float64, petal.width: float64, variety: object}) diff --git a/doc/source/data/doc_code/saving_datasets.py b/doc/source/data/doc_code/saving_data.py similarity index 86% rename from doc/source/data/doc_code/saving_datasets.py rename to doc/source/data/doc_code/saving_data.py index ece4e6240d75..59252f13b4e6 100644 --- a/doc/source/data/doc_code/saving_datasets.py +++ b/doc/source/data/doc_code/saving_data.py @@ -10,9 +10,7 @@ import ray ds = ray.data.range(1000) -# -> Dataset(num_blocks=200, num_rows=1000, schema=) -ds.take(5) -# -> [0, 1, 2, 3, 4] +# -> Dataset(num_blocks=200, num_rows=1000, schema={id: int64}) # Write out just one file. ds.repartition(1).write_parquet("/tmp/one_parquet") @@ -31,9 +29,7 @@ import ray ds = ray.data.range(1000) -# -> Dataset(num_blocks=200, num_rows=1000, schema=) -ds.take(5) -# -> [0, 1, 2, 3, 4] +# -> Dataset(num_blocks=200, num_rows=1000, schema={id: int64}) # Write out just one file. ds.repartition(1).write_csv("/tmp/one_csv") @@ -52,9 +48,7 @@ import ray ds = ray.data.range(1000) -# -> Dataset(num_blocks=200, num_rows=1000, schema=) -ds.take(5) -# -> [0, 1, 2, 3, 4] +# -> Dataset(num_blocks=200, num_rows=1000, schema={id: int64}) # Write out just one file. 
ds.repartition(1).write_json("/tmp/one_json") @@ -77,18 +71,18 @@ # -> Dataset( # num_blocks=1, # num_rows=1000, -# schema={value: }, +# schema={data: }, # ) ds.show(2) -# -> {'value': array(0)} -# -> {'value': array(1)} +# -> {'data': array(0)} +# -> {'data': array(1)} # Write out just one file. -ds.repartition(1).write_numpy("/tmp/one_numpy") +ds.repartition(1).write_numpy("/tmp/one_numpy", column="data") # -> /tmp/one_numpy/78c91652e2364a7481cf171bed6d96e4_000000.npy # Write out multiple files. -ds.repartition(3).write_numpy("/tmp/multi_numpy") +ds.repartition(3).write_numpy("/tmp/multi_numpy", column="data") # -> /tmp/multi_numpy/b837e5b5a18448bfa3f8388f5d99d033_000000.npy # -> /tmp/multi_numpy/b837e5b5a18448bfa3f8388f5d99d033_000001.npy # -> /tmp/multi_numpy/b837e5b5a18448bfa3f8388f5d99d033_000002.npy diff --git a/doc/source/data/doc_code/tensor.py b/doc/source/data/doc_code/tensor.py index e2a515e91c30..f7689b973efd 100644 --- a/doc/source/data/doc_code/tensor.py +++ b/doc/source/data/doc_code/tensor.py @@ -1,65 +1,31 @@ # flake8: noqa +import ray from typing import Dict, Any # fmt: off # __create_range_begin__ -import ray - # Create a Dataset of tensors. 
ds = ray.data.range_tensor(10000, shape=(64, 64)) # -> Dataset(num_blocks=200, num_rows=10000, -# schema={__value__: numpy.ndarray(shape=(64, 64), dtype=int64)}) +# schema={data: numpy.ndarray(shape=(64, 64), dtype=int64)}) -ds.take(2) -# -> [array([[0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# ..., -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0], -# [0, 0, 0, ..., 0, 0, 0]]), -# array([[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]])] +ds.take(1) +# -> {'data': array([[0, 0, 0, ..., 0, 0, 0], +# [0, 0, 0, ..., 0, 0, 0], +# [0, 0, 0, ..., 0, 0, 0], +# ..., +# [0, 0, 0, ..., 0, 0, 0], +# [0, 0, 0, ..., 0, 0, 0], +# [0, 0, 0, ..., 0, 0, 0]])} # __create_range_end__ -# __create_pandas_begin__ -import ray - +# __create_pandas_2_begin__ import pandas as pd import numpy as np -# Start with a tabular base dataset. -ds = ray.data.range_table(1000) - -# Create a single TensorArray column. -def single_col_udf(batch: pd.DataFrame) -> pd.DataFrame: - bs = len(batch) - - # Lists of ndarrays are automatically cast to TensorArray. - arr = [np.zeros((128, 128, 3)) for _ in range(bs)] - return pd.DataFrame({"__value__": arr}) - - ## Alternatively, manually construct a TensorArray from a single ndarray. - # from ray.data.extensions.tensor_extension import TensorArray - # arr = TensorArray(np.zeros((bs, 128, 128, 3), dtype=np.int64)) - # return pd.DataFrame({"__value__": arr}) - - -ds.map_batches(single_col_udf) -ds.materialize() -# -> Dataset(num_blocks=17, num_rows=1000, -# schema={__value__: numpy.ndarray(shape=(128, 128, 3), dtype=int64)}) -# __create_pandas_end__ - -# __create_pandas_2_begin__ # Create multiple TensorArray columns. 
-def multi_col_udf(batch: pd.DataFrame) -> pd.DataFrame: +def gen_image_and_embed(batch: pd.DataFrame) -> pd.DataFrame: bs = len(batch) # Lists of ndarrays are automatically cast to TensorArray. @@ -72,36 +38,31 @@ def multi_col_udf(batch: pd.DataFrame) -> pd.DataFrame: # embed = TensorArray(np.zeros((bs, 256,), dtype=np.uint8)) # return pd.DataFrame({"image": image, "embed": embed}) - -ds.map_batches(multi_col_udf) +ds.map_batches(gen_image_and_embed, batch_format="pandas") ds.materialize() # -> Dataset(num_blocks=17, num_rows=1000, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=int64), -# embed: numpy.ndarray(shape=(256,), dtype=uint8)}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=int64), +# embed: numpy.ndarray(shape=(256,), dtype=uint8)}) # __create_pandas_2_end__ # __create_numpy_begin__ -import ray - # From in-memory numpy data. ray.data.from_numpy(np.zeros((1000, 128, 128, 3), dtype=np.int64)) # -> Dataset(num_blocks=1, num_rows=1000, -# schema={__value__: numpy.ndarray(shape=(128, 128, 3), dtype=int64)}) +# schema={data: numpy.ndarray(shape=(128, 128, 3), dtype=int64)}) # From saved numpy files. ray.data.read_numpy("example://mnist_subset.npy") # -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) +# schema={data: numpy.ndarray(shape=(28, 28), dtype=uint8)}) # __create_numpy_end__ # __create_parquet_1_begin__ -import ray - # Reading previously saved Tensor data works out of the box. 
ds = ray.data.read_parquet("example://parquet_images_mini") # -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), -# label: string}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), +# label: string}) ds.take(1) # -> [{'image': @@ -126,7 +87,6 @@ def multi_col_udf(batch: pd.DataFrame) -> pd.DataFrame: shutil.rmtree("/tmp/some_path", ignore_errors=True) # __create_parquet_2_begin__ -import ray import numpy as np import pandas as pd @@ -198,7 +158,7 @@ def cast_udf(block: pa.Table) -> pa.Table: # __create_images_begin__ ds = ray.data.read_images("example://image-datasets/simple") # -> Dataset(num_blocks=3, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}) +# schema={data: numpy.ndarray(shape=(32, 32, 3), dtype=uint8)}) ds.take(1) # -> [array([[[ 88, 70, 68], @@ -210,93 +170,11 @@ def cast_udf(block: pa.Table) -> pa.Table: # [166, 148, 82]]], dtype=uint8)] # __create_images_end__ - -# __consume_native_begin__ -import ray - -# Read a single-column example dataset. -ds = ray.data.read_numpy("example://mnist_subset.npy") -# -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) - -def add_one(batch: np.ndarray) -> np.ndarray: - return batch + 1 - -# This processes batches in numpy.ndarray format. -ds = ds.map_batches(add_one) - -# This returns batches in numpy.ndarray format. -next(ds.iter_batches()) -# -> array([[[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]], -# -# ..., -# -# [[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]]], dtype=uint8) -# __consume_native_end__ - -# __consume_native_2_begin__ -import ray - -# Read a multi-column example dataset. 
-ds = ray.data.read_parquet("example://parquet_images_mini") -# -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), -# label: string}) - -def add_one(batch: pd.DataFrame) -> pd.DataFrame: - batch["image"] += 1 - return batch - -# This processes batches in pd.DataFrame format. -ds = ds.map_batches(add_one) - -# This returns pandas batches with List[np.ndarray] columns. -next(ds.iter_batches()) -# -> image label -# 0 [[[ 96, 76, 61], [ 92, 72, 57], [ 92, 72,... cat -# 1 [[[ 38, 38, 39], [ 39, 39, 40], [ 39, 39,... cat -# 2 [[[ 47, 39, 33], [ 43, 36, 29], [ 43, 36,... dog -# __consume_native_2_end__ - -# __consume_pandas_begin__ -import ray - -# Read a single-column example dataset. -ds = ray.data.read_numpy("example://mnist_subset.npy") -# -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) - -def add_one(batch: pd.DataFrame) -> pd.DataFrame: - batch["__value__"] += 1 - return batch - -# This processes batches in pd.DataFrame format. -ds = ds.map_batches(add_one, batch_format="pandas") - -# This returns pandas batches with List[np.ndarray] columns. -next(ds.iter_batches(batch_format="pandas")) -# -> __value__ -# 0 [[ 1, 1, 1, 1, 1, 1, 1, 1, 1,... -# 1 [[ 1, 1, 1, 1, 1, 1, 1, 1, 1,... -# 2 [[ 1, 1, 1, 1, 1, 1, 1, 1, 1,... -# __consume_pandas_end__ - # __consume_pandas_2_begin__ -import ray - -# Read a multi-column example dataset. ds = ray.data.read_parquet("example://parquet_images_mini") # -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), -# label: string}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), +# label: string}) def add_one(batch: pd.DataFrame) -> pd.DataFrame: batch["image"] += 1 @@ -313,54 +191,24 @@ def add_one(batch: pd.DataFrame) -> pd.DataFrame: # 2 [[[ 47, 39, 33], [ 43, 36, 29], [ 43, 36,... 
dog # __consume_pandas_2_end__ -# __consume_pyarrow_begin__ -import ray +# __consume_pyarrow_2_begin__ from ray.data.extensions.tensor_extension import ArrowTensorArray -import pyarrow - -# Read a single-column example dataset. -ds = ray.data.read_numpy("example://mnist_subset.npy") -# -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) - -def add_one(batch: pyarrow.Table) -> pyarrow.Table: - np_col = np.array( - [ - np.ndarray((28, 28), buffer=buf, dtype=np.uint8) - for buf in batch.column("__value__") - ] - ) - np_col += 1 - - return batch.set_column( - batch._ensure_integer_index("__value__"), - "__value__", - ArrowTensorArray.from_numpy(np_col), - ) - -# This processes batches in pyarrow.Table format. -ds = ds.map_batches(add_one, batch_format="pyarrow") - -# This returns batches in pyarrow.Table format. -next(ds.iter_batches(batch_format="pyarrow")) -# pyarrow.Table -# __value__: extension> -# ---- -# __value__: [[[1,1,1,1,1,1,1,1,1,1,...],...,[1,1,1,1,1,1,1,1,1,1,...]]] -# __consume_pyarrow_end__ - -# __consume_pyarrow_2_begin__ -# Read a multi-column example dataset. ds = ray.data.read_parquet("example://parquet_images_mini") # -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), label: object}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), +# label: object}) + +def add_one(batch: pa.Table) -> pa.Table: + + def to_numpy(buf): + if not isinstance(buf, np.ndarray): + buf = buf.as_py() + return buf -def add_one(batch: pyarrow.Table) -> pyarrow.Table: np_col = np.array( [ - np.ndarray((128, 128, 3), buffer=buf, dtype=np.uint8) - for buf in batch.column("image") + to_numpy(buf) for buf in batch.column("image") ] ) np_col += 1 @@ -384,43 +232,11 @@ def add_one(batch: pyarrow.Table) -> pyarrow.Table: # label: [["cat"]] # __consume_pyarrow_2_end__ -# __consume_numpy_begin__ -import ray - -# Read a single-column example dataset. 
-ds = ray.data.read_numpy("example://mnist_subset.npy") -# -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) - -def add_one(batch: np.ndarray) -> np.ndarray: - batch += 1 - return batch - -# This processes batches in np.ndarray format. -ds = ds.map_batches(add_one, batch_format="numpy") - -# This returns batches in np.ndarray format. -next(ds.iter_batches(batch_format="numpy")) -# -> array([[[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]], -# -# ..., -# -# [[1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1], -# ..., -# [1, 1, 1, ..., 1, 1, 1], -# [1, 1, 1, ..., 1, 1, 1]]], dtype=uint8) -# __consume_numpy_end__ - # __consume_numpy_2_begin__ -# Read a multi-column example dataset. ds = ray.data.read_parquet("example://parquet_images_mini") # -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), label: object}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), +# label: object}) def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: assert isinstance(batch, dict) @@ -456,7 +272,8 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: # Read a multi-column example dataset. ds = ray.data.read_parquet("example://parquet_images_mini") # -> Dataset(num_blocks=3, num_rows=3, -# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), label: object}) +# schema={image: numpy.ndarray(shape=(128, 128, 3), dtype=uint8), +# label: object}) # You can write the dataset to Parquet. ds.write_parquet("/tmp/some_path") @@ -475,15 +292,15 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: # Read a single-column example dataset. ds = ray.data.read_numpy("example://mnist_subset.npy") # -> Dataset(num_blocks=1, num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)}) +# schema={data: numpy.ndarray(shape=(28, 28), dtype=uint8)}) # You can write the dataset to Parquet. 
-ds.write_numpy("/tmp/some_path") +ds.write_numpy("/tmp/some_path", column="data") # And you can read it back. read_ds = ray.data.read_numpy("/tmp/some_path") print(read_ds.schema()) -# -> __value__: extension> +# -> data: extension> # __write_2_end__ # fmt: off @@ -493,8 +310,8 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: df = pd.DataFrame({"feature": ragged_array, "label": [1, 1]}) ds = ray.data.from_pandas([df, df]) # -> Dataset(num_blocks=2, num_rows=4, -# schema={feature: numpy.ndarray(shape=(None, None), dtype=float64), -# label: int64}) +# schema={feature: numpy.ndarray(shape=(None, None), dtype=float64), +# label: int64}) ds.take(2) # -> [{'feature': array([[1., 1.], @@ -508,7 +325,7 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: # fmt: off # __tf_variable_shaped_tensors_begin___ -# Convert Ray Dataset to a TensorFlow Dataset. +# Convert Dataset to a TensorFlow Dataset. tf_ds = ds.to_tf( batch_size=2, feature_columns="feature", diff --git a/doc/source/data/doc_code/tf_quick_start.py b/doc/source/data/doc_code/tf_quick_start.py new file mode 100644 index 000000000000..dd3cde9119dd --- /dev/null +++ b/doc/source/data/doc_code/tf_quick_start.py @@ -0,0 +1,86 @@ +# flake8: noqa +# isort: skip_file +# fmt: off + +# __tf_super_quick_start__ +import ray +import numpy as np +from typing import Dict + +ds = ray.data.from_numpy(np.ones((1, 100))) + +class TFPredictor: + def __init__(self): + from tensorflow import keras + + input_layer = keras.Input(shape=(100,)) + output_layer = keras.layers.Dense(1, activation="sigmoid") + self.model = keras.Sequential([input_layer, output_layer]) + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict: + return {"output": self.model(batch["data"]).numpy()} + +scale = ray.data.ActorPoolStrategy(size=2) +predictions = ds.map_batches(TFPredictor, compute=scale) +predictions.show(limit=1) +# {'output': array([0.45119727])} +# __tf_super_quick_end__ + + +# __tf_no_ray_start__ +from tensorflow import keras 
+import numpy as np +from typing import Dict + +batches = {"data": np.ones((1, 100))} + +input_layer = keras.Input(shape=(100,)) +output_layer = keras.layers.Dense(1, activation="sigmoid") +model = keras.Sequential([input_layer, output_layer]) + +def transform(batch: Dict[str, np.ndarray]): + return {"output": model(batch["data"]).numpy()} + +results = transform(batches) +# __tf_no_ray_end__ + + +# __tf_quickstart_load_start__ +import ray +import numpy as np +from typing import Dict + + +ds = ray.data.from_numpy(np.ones((1, 100))) +# __tf_quickstart_load_end__ + + +# __tf_quickstart_model_start__ +class TFPredictor: + def __init__(self): # <1> + from tensorflow import keras + + input_layer = keras.Input(shape=(100,)) + output_layer = keras.layers.Dense(1, activation="sigmoid") + self.model = keras.Sequential([input_layer, output_layer]) + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict: # <2> + return {"output": self.model(batch["data"]).numpy()} +# __tf_quickstart_model_end__ + + +# __tf_quickstart_prediction_test_start__ +tfp = TFPredictor() +batch = ds.take_batch(10) +test = tfp(batch) +# __tf_quickstart_prediction_test_end__ + + +# __tf_quickstart_prediction_start__ +scale = ray.data.ActorPoolStrategy(size=2) + +predictions = ds.map_batches(TFPredictor, compute=scale) +predictions.show(limit=1) +# {'output': array([0.45119727])} +# __tf_quickstart_prediction_end__ +# fmt: on diff --git a/doc/source/data/doc_code/torch_image_batch_trained.py b/doc/source/data/doc_code/torch_image_batch_trained.py new file mode 100644 index 000000000000..205245a54208 --- /dev/null +++ b/doc/source/data/doc_code/torch_image_batch_trained.py @@ -0,0 +1,60 @@ +# flake8: noqa +# isort: skip_file +# fmt: off + +# __pt_load_start__ +import ray + +ray.init(num_gpus=4) + +data_url = "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw" # <1> +ds = ray.data.read_images(data_url).limit(1000) # <2> +# __pt_load_end__ + +# __pt_preprocess_start__ +from typing import Dict 
+import numpy as np +from torchvision import transforms +from torchvision.models import ResNet18_Weights + +resnet_transforms = ResNet18_Weights.DEFAULT.transforms +transform = transforms.Compose([transforms.ToTensor(), resnet_transforms()]) # <1> + +def preprocess_images(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: # <2> + transformed_images = [transform(image) for image in batch["image"]] + return {"preprocessed": transformed_images} + +ds = ds.map_batches(preprocess_images) # <3> +# __pt_preprocess_end__ + + +# __pt_model_start__ +import torch +from torchvision.models import resnet18 + + +class TorchPredictor: + def __init__(self): # <1> + self.model = resnet18(pretrained=True).cuda() + self.model.eval() + + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: # <2> + torch_batch = torch.stack(batch["preprocessed"]) # <3> + with torch.inference_mode(): + prediction = self.model(torch_batch) + return {"class": prediction.argmax(dim=1).detach().numpy()} # <4> +# __pt_model_end__ + + +# __pt_prediction_start__ +predictions = ds.map_batches( + TorchPredictor, + compute=ray.data.ActorPoolStrategy(size=4), # <1> + num_gpus=1, # <2> + batch_size=8, +) + +predictions.show(limit=1) +# {'class': 258} +# __pt_prediction_end__ +# fmt: on diff --git a/doc/source/data/doc_code/transforming_data.py b/doc/source/data/doc_code/transforming_data.py new file mode 100644 index 000000000000..184d96b81b7f --- /dev/null +++ b/doc/source/data/doc_code/transforming_data.py @@ -0,0 +1,201 @@ +# flake8: noqa +# fmt: off + +# __map_batches_begin__ +import ray +import numpy as np +from typing import Dict + +# Load data. +ds = ray.data.from_items(["Test", "String", "Test String"]) +# -> Dataset(num_blocks=1, num_rows=3, schema={item: string}) + +# Define the transform function. 
+def to_lowercase(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + lowercase_batch = [b.lower() for b in batch["item"]] + return {"text": lowercase_batch} + +ds.map_batches(to_lowercase).show() +# -> {'text': 'test'} +# -> {'text': 'string'} +# -> {'text': 'test string'} +# __map_batches_end__ + + +# __map_begin__ +import ray +from typing import Dict, Any + +# Load data. +ds = ray.data.from_items(["Test", "String", "Test String"]) +# -> Dataset(num_blocks=1, num_rows=3, schema={item: string}) + +# Define the transform function. +def to_lowercase(row: Dict[str, Any]) -> Dict[str, Any]: + lowercase = row["item"].lower() + return {"text": lowercase} + +ds.map(to_lowercase).show() +# -> {'text': 'test'} +# -> {'text': 'string'} +# -> {'text': 'test string'} +# __map_end__ + +# __writing_numpy_udfs_begin__ +import ray +import numpy as np +from typing import Dict + +ds = ray.data.read_csv("example://iris.csv") + +def numpy_transform(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + new_col = batch["sepal.length"] / np.max(batch["sepal.length"]) + batch["normalized.sepal.length"] = new_col + del batch["sepal.length"] + return batch + +ds.map_batches(numpy_transform, batch_format="numpy").show(2) +# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, +# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} +# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, +# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} +# __writing_numpy_udfs_end__ + +# __writing_pandas_udfs_begin__ +import ray +import pandas as pd + +ds = ray.data.read_csv("example://iris.csv") + +def pandas_transform(df: pd.DataFrame) -> pd.DataFrame: + df.loc[:, "normalized.sepal.length"] = df["sepal.length"] / df["sepal.length"].max() + df = df.drop(columns=["sepal.length"]) + return df + +ds.map_batches(pandas_transform, batch_format="pandas").show(2) +# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, +# 'variety': 
'Versicolor', 'normalized.sepal.length': 1.0} +# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, +# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} +# __writing_pandas_udfs_end__ + +# __writing_arrow_udfs_begin__ +import ray +import pyarrow as pa +import pyarrow.compute as pac + +ds = ray.data.read_csv("example://iris.csv") + +def pyarrow_transform(batch: pa.Table) -> pa.Table: + batch = batch.append_column( + "normalized.sepal.length", + pac.divide(batch["sepal.length"], pac.max(batch["sepal.length"])), + ) + return batch.drop(["sepal.length"]) + +ds.map_batches(pyarrow_transform, batch_format="pyarrow").show(2) +# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, +# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} +# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, +# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} +# __writing_arrow_udfs_end__ + +# __dataset_compute_strategy_begin__ +import ray +import pandas as pd +import numpy as np +from ray.data import ActorPoolStrategy + +# Dummy model to predict Iris variety. +def predict_iris(df: pd.DataFrame) -> pd.DataFrame: + conditions = [ + (df["sepal.length"] < 5.0), + (df["sepal.length"] >= 5.0) & (df["sepal.length"] < 6.0), + (df["sepal.length"] >= 6.0) + ] + values = ["Setosa", "Versicolor", "Virginica"] + return pd.DataFrame({"predicted_variety": np.select(conditions, values)}) + +class IrisInferModel: + # Do any expensive model setup in the __init__ function. + def __init__(self): + self._model = predict_iris + + # This method is called repeatedly by Ray Data to process batches. + def __call__(self, batch: pd.DataFrame) -> pd.DataFrame: + return self._model(batch) + +ds = ray.data.read_csv("example://iris.csv").repartition(10) + +# Batch inference processing with Ray tasks (the default compute strategy). 
+predicted = ds.map_batches(predict_iris, batch_format="pandas") + +# Batch inference processing with Ray actors (pool of size 5). +predicted = ds.map_batches( + IrisInferModel, compute=ActorPoolStrategy(size=5), batch_size=10) +# __dataset_compute_strategy_end__ + +# __writing_generator_udfs_begin__ +import ray +from typing import Iterator + +# Load iris data. +ds = ray.data.read_csv("example://iris.csv") + +# UDF to repeat the dataframe 100 times, in chunks of 20. +def repeat_dataframe(df: pd.DataFrame) -> Iterator[pd.DataFrame]: + for _ in range(5): + yield pd.concat([df]*20) + +ds.map_batches(repeat_dataframe, batch_format="pandas").show(2) +# -> {'sepal.length': 5.1, 'sepal.width': 3.5, 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} +# -> {'sepal.length': 4.9, 'sepal.width': 3.0, 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} +# __writing_generator_udfs_end__ + +# __shuffle_begin__ +import ray + +# The dataset starts off with 1000 blocks. +ds = ray.data.range(10000, parallelism=1000) +# -> Dataset(num_blocks=1000, num_rows=10000, schema={id: int64}) + +# Repartition the data into 100 blocks. Since shuffle=False, Ray Data will minimize +# data movement during this operation by merging adjacent blocks. +ds = ds.repartition(100, shuffle=False).materialize() +# -> MaterializedDataset(num_blocks=100, num_rows=10000, schema={id: int64}) + +# Repartition the data into 200 blocks, and force a full data shuffle. +# This operation will be more expensive +ds = ds.repartition(200, shuffle=True).materialize() +# -> MaterializedDataset(num_blocks=200, num_rows=10000, schema={id: int64}) +# __shuffle_end__ + +# __map_groups_begin__ +import ray +import numpy as np +from typing import Dict + +# Load iris data. +ds = ray.data.read_csv("example://iris.csv") + +# The user function signature for `map_groups` is the same as that of `map_batches`. 
+# It takes in a batch representing the grouped data, and must return a batch of +# zero or more records as the result. +def custom_count(group: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + # Since we are grouping by variety, all elements in this batch are equal. + variety = group["variety"][0] + count = len(group["variety"]) + # Here we return a batch of a single record for the group (array of len 1). + return { + "variety": np.array([variety]), + "count": np.array([count]), + } + +ds = ds.groupby("variety").map_groups(custom_count) +ds.show() +# -> {'variety': 'Setosa', 'count': 50} +# {'variety': 'Versicolor', 'count': 50} +# {'variety': 'Virginica', 'count': 50} +# __map_groups_end__ + +# fmt: on diff --git a/doc/source/data/doc_code/transforming_datasets.py b/doc/source/data/doc_code/transforming_datasets.py deleted file mode 100644 index 648735f0de20..000000000000 --- a/doc/source/data/doc_code/transforming_datasets.py +++ /dev/null @@ -1,696 +0,0 @@ -# flake8: noqa - -# fmt: off -# __dataset_transformation_begin__ -import ray -import pandas - -# Create a dataset from file with Iris data. -# Tip: "example://" is a convenient protocol to access the -# python/ray/data/examples/data directory. -ds = ray.data.read_csv("example://iris.csv") -# Dataset(num_blocks=1, num_rows=150, -# schema={sepal.length: float64, sepal.width: float64, -# petal.length: float64, petal.width: float64, variety: object}) -ds.show(3) -# -> {'sepal.length': 5.1, 'sepal.width': 3.5, -# 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} -# -> {'sepal.length': 4.9, 'sepal.width': 3.0, -# 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} -# -> {'sepal.length': 4.7, 'sepal.width': 3.2, -# 'petal.length': 1.3, 'petal.width': 0.2, 'variety': 'Setosa'} - -# Repartition the dataset to 5 blocks. 
-ds = ds.repartition(5) -# -> Repartition -# +- Dataset(num_blocks=1, num_rows=150, -# schema={sepal.length: float64, sepal.width: float64, -# petal.length: float64, petal.width: float64, variety: object}) - -# Find rows with sepal.length < 5.5 and petal.length > 3.5. -def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame: - return df[(df["sepal.length"] < 5.5) & (df["petal.length"] > 3.5)] - -# Map processing the dataset. -ds.map_batches(transform_batch).show() -# -> {'sepal.length': 5.2, 'sepal.width': 2.7, -# 'petal.length': 3.9, 'petal.width': 1.4, 'variety': 'Versicolor'} -# -> {'sepal.length': 5.4, 'sepal.width': 3.0, -# 'petal.length': 4.5, 'petal.width': 1.5, 'variety': 'Versicolor'} -# -> {'sepal.length': 4.9, 'sepal.width': 2.5, -# 'petal.length': 4.5, 'petal.width': 1.7, 'variety': 'Virginica'} - -# Split the dataset into 2 datasets -ds.split(2) -# -> [Dataset(num_blocks=3, num_rows=90, -# schema={sepal.length: double, sepal.width: double, -# petal.length: double, petal.width: double, variety: string}), -# Dataset(num_blocks=2, num_rows=60, -# schema={sepal.length: double, sepal.width: double, -# petal.length: double, petal.width: double, variety: string})] - -# Sort the dataset by sepal.length. -ds = ds.sort("sepal.length") -ds.show(3) -# -> {'sepal.length': 4.3, 'sepal.width': 3.0, -# 'petal.length': 1.1, 'petal.width': 0.1, 'variety': 'Setosa'} -# -> {'sepal.length': 4.4, 'sepal.width': 2.9, -# 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} -# -> {'sepal.length': 4.4, 'sepal.width': 3.0, -# 'petal.length': 1.3, 'petal.width': 0.2, 'variety': 'Setosa'} - -# Shuffle the dataset. 
-ds = ds.random_shuffle() -ds.show(3) -# -> {'sepal.length': 6.7, 'sepal.width': 3.1, -# 'petal.length': 4.4, 'petal.width': 1.4, 'variety': 'Versicolor'} -# -> {'sepal.length': 6.7, 'sepal.width': 3.3, -# 'petal.length': 5.7, 'petal.width': 2.1, 'variety': 'Virginica'} -# -> {'sepal.length': 4.5, 'sepal.width': 2.3, -# 'petal.length': 1.3, 'petal.width': 0.3, 'variety': 'Setosa'} - -# Group by the variety. -ds.groupby("variety").count().show() -# -> {'variety': 'Setosa', 'count()': 50} -# -> {'variety': 'Versicolor', 'count()': 50} -# -> {'variety': 'Virginica', 'count()': 50} -# __dataset_transformation_end__ -# fmt: on - -# fmt: off -# __writing_default_udfs_tabular_begin__ -import ray -import pandas as pd - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -print(ds.default_batch_format()) -# - -# UDF as a function on Pandas DataFrame batches. -def pandas_transform(df_batch: pd.DataFrame) -> pd.DataFrame: - # Filter rows. - df_batch = df_batch[df_batch["variety"] == "Versicolor"] - # Add derived column. - # Notice here that `df["sepal.length"].max()` is only the max value of the column - # within a given batch (instead of globally)!! - df_batch.loc[:, "normalized.sepal.length"] = df_batch["sepal.length"] / df_batch["sepal.length"].max() - # Drop column. - df_batch = df_batch.drop(columns=["sepal.length"]) - return df_batch - -ds.map_batches(pandas_transform).show(2) -# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, -# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} -# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, -# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} -# __writing_default_udfs_tabular_end__ -# fmt: on - -# fmt: off -# __writing_default_udfs_tensor_begin__ -import ray -import numpy as np - -# Load dataset. -ds = ray.data.range_tensor(1000, shape=(2, 2)) -print(ds.default_batch_format()) -# - -# UDF as a function on NumPy ndarray batches. 
-def tensor_transform(arr: np.ndarray) -> np.ndarray: - # Notice here that the ndarray is of shape (batch_size, 2, 2) - # Multiply each element in the ndarray by a factor of 2 - return arr * 2 - -ds.map_batches(tensor_transform).show(2) -# [array([[0, 0], -# [0, 0]]), -# array([[2, 2], -# [2, 2]])] - -# __writing_default_udfs_tensor_end__ -# fmt: on - -# fmt: off -# __writing_default_udfs_list_begin__ -import ray - -# Load dataset. -ds = ray.data.range(1000) -print(ds.default_batch_format()) -# - -# UDF as a function on Python list batches. -def list_transform(list) -> list: - # Notice here that the list is of length batch_size - # Multiply each element in the list by a factor of 2 - return [x * 2 for x in list] - -ds.map_batches(list_transform).show(2) -# 0 -# 2 - -# __writing_default_udfs_list_end__ -# fmt: on - -# fmt: off -# __writing_pandas_udfs_begin__ -import ray -import pandas as pd - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") - -# UDF as a function on Pandas DataFrame batches. -def pandas_transform(df: pd.DataFrame) -> pd.DataFrame: - # Filter rows. - df = df[df["variety"] == "Versicolor"] - # Add derived column. - df.loc[:, "normalized.sepal.length"] = df["sepal.length"] / df["sepal.length"].max() - # Drop column. - df = df.drop(columns=["sepal.length"]) - return df - -ds.map_batches(pandas_transform).show(2) -# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, -# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} -# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, -# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} -# __writing_pandas_udfs_end__ -# fmt: on - -# fmt: off -# __writing_arrow_udfs_begin__ -import ray -import pyarrow as pa -import pyarrow.compute as pac - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") - -# UDF as a function on Arrow Table batches. 
-def pyarrow_transform(batch: pa.Table) -> pa.Table: - batch = batch.filter(pac.equal(batch["variety"], "Versicolor")) - batch = batch.append_column( - "normalized.sepal.length", - pac.divide(batch["sepal.length"], pac.max(batch["sepal.length"])), - ) - return batch.drop(["sepal.length"]) - -ds.map_batches(pyarrow_transform, batch_format="pyarrow").show(2) -# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, -# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} -# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, -# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} -# __writing_arrow_udfs_end__ -# fmt: on - -# fmt: off -# __writing_numpy_udfs_begin__ -import ray -import numpy as np - -# Load dataset. -ds = ray.data.read_numpy("example://mnist_subset.npy") - -# UDF as a function on NumPy ndarray batches. -def normalize(arr: np.ndarray) -> np.ndarray: - # Normalizes each image to [0, 1] range. - mins = arr.min((1, 2))[:, np.newaxis, np.newaxis] - maxes = arr.max((1, 2))[:, np.newaxis, np.newaxis] - range_ = maxes - mins - idx = np.where(range_ == 0) - mins[idx] = 0 - range_[idx] = 1 - return (arr - mins) / range_ - -ds = ds.map_batches(normalize, batch_format="numpy") -# -> MapBatches(normalize) -# +- Dataset(num_blocks=1, -# num_rows=3, -# schema={__value__: numpy.ndarray(shape=(28, 28), dtype=uint8)} -# ) -# __writing_numpy_udfs_end__ -# fmt: on - -# fmt: off -# __writing_callable_classes_udfs_begin__ -import ray - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") - -# UDF as a function on Pandas DataFrame batches. -class ModelUDF: - def __init__(self): - self.model = lambda df: df["sepal.length"] > 0.65 - - def __call__(self, df: pd.DataFrame) -> pd.DataFrame: - # Filter rows. - df = df[df["variety"] == "Versicolor"] - # Apply model. 
- df["output"] = self.model(df) - return df - -ds.map_batches(ModelUDF, compute="actors").show(2) -# -> {'sepal.length': 7.0, 'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, -# 'variety': 'Versicolor', 'output': True} -# -> {'sepal.length': 6.4, 'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, -# 'variety': 'Versicolor', 'output': False}` -# __writing_callable_classes_udfs_end__ -# fmt: on - -# fmt: off -# __writing_generator_udfs_begin__ -import ray -from typing import Iterator - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") - -# UDF to repeat the dataframe 100 times, in chunks of 20. -def repeat_dataframe(df: pd.DataFrame) -> Iterator[pd.DataFrame]: - for _ in range(5): - yield pd.concat([df]*20) - -ds.map_batches(repeat_dataframe).show(2) -# -> {'sepal.length': 5.1, 'sepal.width': 3.5, 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} -# -> {'sepal.length': 4.9, 'sepal.width': 3.0, 'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'} -# __writing_generator_udfs_end__ -# fmt: on - -# fmt: off -# __writing_pandas_out_udfs_begin__ -import ray -import pandas as pd -from typing import List - -# Load dataset. -ds = ray.data.from_items(["test", "string", "teststring"]) -# -> Dataset(num_blocks=1, num_rows=3, schema=) - -# Convert to Pandas. -def convert_to_pandas(text: List[str]) -> pd.DataFrame: - return pd.DataFrame({"text": text}, dtype="string") - -ds = ds.map_batches(convert_to_pandas) -# -> MapBatches(convert_to_pandas) -# +- Dataset(num_blocks=3, num_rows=3, schema=) - -ds.show(2) -# -> {'text': 'test'} -# -> {'text': 'string'} - -print(ds) -# -> Dataset(num_blocks=3, num_rows=3, schema={text: string}) -# __writing_pandas_out_udfs_end__ -# fmt: on - -# fmt: off -# __writing_arrow_out_udfs_begin__ -import ray -import pyarrow as pa -from typing import List - -# Load dataset. 
-ds = ray.data.from_items(["test", "string", "teststring"]) -# -> Dataset(num_blocks=1, num_rows=3, schema=) - -# Convert to Arrow. -def convert_to_arrow(text: List[str]) -> pa.Table: - return pa.table({"text": text}) - -ds = ds.map_batches(convert_to_arrow) -# -> MapBatches(convert_to_arrow) -# +- Dataset(num_blocks=1, num_rows=3, schema=) - -ds.show(2) -# -> {'text': 'test'} -# -> {'text': 'string'} - -print(ds) -# -> Dataset(num_blocks=3, num_rows=3, schema={text: string}) -# __writing_arrow_out_udfs_end__ -# fmt: on - -# fmt: off -# __writing_numpy_out_udfs_begin__ -import ray -import pandas as pd -import numpy as np -from typing import Dict - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -# -> Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -# Convert to NumPy. -def convert_to_numpy(df: pd.DataFrame) -> np.ndarray: - return df[["sepal.length", "sepal.width"]].to_numpy() - -ds = ds.map_batches(convert_to_numpy) -# -> MapBatches(convert_to_numpy) -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> [5.1 3.5] -# [4.9 3. ] -# __writing_numpy_out_udfs_end__ -# fmt: on - -# fmt: off -# __writing_numpy_dict_out_udfs_begin__ -import ray -import pandas as pd -import numpy as np -from typing import Dict - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -# -> Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -# Convert to dict of NumPy ndarrays. 
-def convert_to_numpy(df: pd.DataFrame) -> Dict[str, np.ndarray]: - return { - "sepal_len_and_width": df[["sepal.length", "sepal.width"]].to_numpy(), - "petal_len": df["petal.length"].to_numpy(), - "petal_width": df["petal.width"].to_numpy(), - } - -ds = ds.map_batches(convert_to_numpy) -# -> MapBatches(convert_to_numpy) -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> {'sepal_len_and_width': array([5.1, 3.5]), 'petal_len': 1.4, 'petal_width': 0.2} -# -> {'sepal_len_and_width': array([4.9, 3. ]), 'petal_len': 1.4, 'petal_width': 0.2} -# __writing_numpy_dict_out_udfs_end__ -# fmt: on - -# fmt: off -# __writing_simple_out_udfs_begin__ -import ray -import pandas as pd -from typing import List - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -# -> Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -# Convert to list of dicts. -def convert_to_list(df: pd.DataFrame) -> List[dict]: - return df.to_dict("records") - -ds = ds.map_batches(convert_to_list) -# -> MapBatches(convert_to_list) -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> {'sepal.length': 5.1, 'sepal.width': 3.5, 'petal.length': 1.4, 'petal.width': 0.2, -# 'variety': 'Setosa'} -# -> {'sepal.length': 4.9, 'sepal.width': 3.0, 'petal.length': 1.4, 'petal.width': 0.2, -# 'variety': 'Setosa'} -# __writing_simple_out_udfs_end__ -# fmt: on - -# fmt: off -# __writing_dict_out_row_udfs_begin__ -import ray -import pandas as pd -from typing import Dict - -# Load dataset. 
-ds = ray.data.range(10) -# -> Dataset(num_blocks=10, num_rows=10, schema=) - -# Convert row to dict. -def row_to_dict(row: int) -> Dict[str, int]: - return {"foo": row} - -ds = ds.map(row_to_dict) -# -> Map -# +- Dataset(num_blocks=10, num_rows=10, schema=) - -ds.show(2) -# -> {'foo': 0} -# -> {'foo': 1} -# __writing_dict_out_row_udfs_end__ -# fmt: on - -# fmt: off -# __writing_table_row_out_row_udfs_begin__ -import ray -from ray.data.row import TableRow -import pandas as pd -from typing import Dict - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -# -> Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -# Treat row as dict. -def map_row(row: TableRow) -> TableRow: - row = row.as_pydict() - row["sepal.area"] = row["sepal.length"] * row["sepal.width"] - return row - -ds = ds.map(map_row) -# -> Map -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> {'sepal.length': 5.1, 'sepal.width': 3.5, 'petal.length': 1.4, 'petal.width': 0.2, -# 'variety': 'Setosa', 'sepal.area': 17.849999999999998} -# -> {'sepal.length': 4.9, 'sepal.width': 3.0, 'petal.length': 1.4, 'petal.width': 0.2, -# 'variety': 'Setosa', 'sepal.area': 14.700000000000001} -# __writing_table_row_out_row_udfs_end__ -# fmt: on - -# fmt: off -# __writing_numpy_out_row_udfs_begin__ -import ray -import numpy as np -from typing import Dict - -# Load dataset. -ds = ray.data.range(10) -# -> Dataset(num_blocks=10, num_rows=10, schema=) - -# Convert row to NumPy ndarray. 
-def row_to_numpy(row: int) -> np.ndarray: - return np.full(shape=(2, 2), fill_value=row) - -ds = ds.map(row_to_numpy) -# -> Map -# +- Dataset(num_blocks=10, num_rows=10, schema=) - -ds.show(2) -# -> [[0 0] -# [0 0]] -# [[1 1] -# [1 1]] -# __writing_numpy_out_row_udfs_end__ -# fmt: on - -# fmt: off -# __writing_simple_out_row_udfs_begin__ -import ray -from ray.data.row import TableRow -from typing import List - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") -# -> Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -# Convert row to simple (opaque) row. -def map_row(row: TableRow) -> tuple: - return tuple(row.items()) - -ds = ds.map(map_row) -# -> Map -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> (('sepal.length', 5.1), ('sepal.width', 3.5), ('petal.length', 1.4), -# ('petal.width', 0.2), ('variety', 'Setosa')) -# -> (('sepal.length', 4.9), ('sepal.width', 3.0), ('petal.length', 1.4), -# ('petal.width', 0.2), ('variety', 'Setosa')) -# __writing_simple_out_row_udfs_end__ -# fmt: on - -# fmt: off -# __configuring_batch_size_begin__ -import ray -import pandas as pd - -# Load dataset. -ds = ray.data.read_csv("example://iris.csv") - -# UDF as a function on Pandas DataFrame batches. -def pandas_transform(df: pd.DataFrame) -> pd.DataFrame: - # Filter rows. - df = df[df["variety"] == "Versicolor"] - # Add derived column. - df.loc[:, "normalized.sepal.length"] = df["sepal.length"] / df["sepal.length"].max() - # Drop column. - df = df.drop(columns=["sepal.length"]) - return df - -# Have each batch that pandas_transform receives contain 10 rows. 
-ds = ds.map_batches(pandas_transform, batch_size=10) -# -> MapBatches(pandas_transform) -# +- Dataset( -# num_blocks=1, -# num_rows=150, -# schema={ -# sepal.length: double, -# sepal.width: double, -# petal.length: double, -# petal.width: double, -# variety: string, -# }, -# ) - -ds.show(2) -# -> {'sepal.width': 3.2, 'petal.length': 4.7, 'petal.width': 1.4, -# 'variety': 'Versicolor', 'normalized.sepal.length': 1.0} -# -> {'sepal.width': 3.2, 'petal.length': 4.5, 'petal.width': 1.5, -# 'variety': 'Versicolor', 'normalized.sepal.length': 0.9142857142857144} -# __configuring_batch_size_end__ -# fmt: on - -# fmt: off -# __dataset_compute_strategy_begin__ -import ray -import pandas -import numpy -from ray.data import ActorPoolStrategy - -# Dummy model to predict Iris variety. -def predict_iris(df: pandas.DataFrame) -> pandas.DataFrame: - conditions = [ - (df["sepal.length"] < 5.0), - (df["sepal.length"] >= 5.0) & (df["sepal.length"] < 6.0), - (df["sepal.length"] >= 6.0) - ] - values = ["Setosa", "Versicolor", "Virginica"] - return pandas.DataFrame({"predicted_variety": numpy.select(conditions, values)}) - -class IrisInferModel: - def __init__(self): - self._model = predict_iris - - def __call__(self, batch: pandas.DataFrame) -> pandas.DataFrame: - return self._model(batch) - -ds = ray.data.read_csv("example://iris.csv").repartition(10) - -# Batch inference processing with Ray tasks (the default compute strategy). -predicted = ds.map_batches(predict_iris) - -# Batch inference processing with Ray actors. Autoscale the actors between 3 and 10. -predicted = ds.map_batches( - IrisInferModel, compute=ActorPoolStrategy(min_size=3, max_size=10), batch_size=10) -# __dataset_compute_strategy_end__ -# fmt: on diff --git a/doc/source/data/examples/advanced-pipelines.rst b/doc/source/data/examples/advanced-pipelines.rst deleted file mode 100644 index 2b41a66d18e6..000000000000 --- a/doc/source/data/examples/advanced-pipelines.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. 
_data_pipeline_usage: - --------------------------- -Advanced Pipeline Examples --------------------------- - -This page covers more advanced examples for dataset pipelines. - -.. _dataset-pipeline-per-epoch-shuffle: - -Pre-repeat vs post-repeat transforms -==================================== - -Transformations prior to the call to ``.repeat()`` will be cached. However, note that the initial read will not be cached unless there is a subsequent transformation or ``.materialize()`` call. Transformations made to the DatasetPipeline after the repeat will always be executed once for each repetition of the Dataset. - -For example, in the following pipeline, the ``map(func)`` transformation only occurs once. However, the random shuffle is applied to each repetition in the pipeline. However, if we omitted the map transformation, then the pipeline would re-read from the base data on each repetition. - -.. note:: - Global per-epoch shuffling is an expensive operation that will slow down your ML - ingest pipeline, prevents you from using a fully-streaming ML ingest pipeline, and - can cause large increases in memory utilization and spilling to disk; only use - global per-epoch shuffling if your model benefits from it! If your model doesn't - benefit from global per-epoch shuffling and/or you run into performance or stability - issues, you should try out windowed or local per-epoch shuffling. - -**Code**: - -.. code-block:: python - - # Create a pipeline that loops over its source dataset indefinitely. - pipe: DatasetPipeline = ray.data \ - .read_datasource(...) \ - .map(func) \ - .repeat() \ - .random_shuffle_each_window() - - @ray.remote(num_gpus=1) - def train_func(pipe: DatasetPipeline): - model = MyModel() - for batch in pipe.iter_torch_batches(): - model.fit(batch) - - # Read from the pipeline in a remote training function. - ray.get(train_func.remote(pipe)) - - -**Pipeline**: - -.. image:: ../images/dataset-repeat-1.svg - -.. 
important:: - - Result caching only applies if there are *transformation* stages prior to the pipelining operation. If you ``repeat()`` or ``window()`` a Dataset right after the read call (e.g., ``ray.data.read_parquet(...).repeat()``), then the read will still be re-executed on each repetition. This optimization saves memory, at the cost of repeated reads from the datasource. To force result caching in all cases, use ``.materialize().repeat()``. - -Changing Pipeline Structure -=========================== - -Sometimes, you may want to change the structure of an existing pipeline. For example, after generating a pipeline with ``ds.window(k)``, you may want to repeat that windowed pipeline ``n`` times. This can be done with ``ds.window(k).repeat(n)``. As another example, suppose you have a repeating pipeline generated with ``ds.repeat(n)``. The windowing of that pipeline can be changed with ``ds.repeat(n).rewindow(k)``. Note the subtle difference in the two examples: the former is repeating a windowed pipeline that has a base window size of ``k``, while the latter is re-windowing a pipeline of initial window size of ``ds.num_blocks()``. The latter may produce windows that span multiple copies of the same original data if ``preserve_epoch=False`` is set: - -.. code-block:: python - - # Window followed by repeat. - ray.data.from_items([0, 1, 2, 3, 4]) \ - .window(blocks_per_window=2) \ - .repeat(2) \ - .show_windows() - # -> - # ------ Epoch 0 ------ - # === Window 0 === - # 0 - # 1 - # === Window 1 === - # 2 - # 3 - # === Window 2 === - # 4 - # ------ Epoch 1 ------ - # === Window 3 === - # 0 - # 1 - # === Window 4 === - # 2 - # 3 - # === Window 5 === - # 4 - - # Repeat followed by window. Since preserve_epoch=True, at epoch boundaries - # windows may be smaller than the target size. If it was set to False, all - # windows except the last would be the target size. 
- ray.data.from_items([0, 1, 2, 3, 4]) \ - .repeat(2) \ - .rewindow(blocks_per_window=2, preserve_epoch=True) \ - .show_windows() - # -> - # ------ Epoch 0 ------ - # === Window 0 === - # 0 - # 1 - # === Window 1 === - # 2 - # 3 - # === Window 2 === - # 4 - # ------ Epoch 1 ------ - # === Window 3 === - # 0 - # 1 - # === Window 4 === - # 2 - # 3 - # === Window 5 === - # 4 diff --git a/doc/source/data/examples/batch_training.ipynb b/doc/source/data/examples/batch_training.ipynb index c873dbd716e4..d4dcf959e536 100644 --- a/doc/source/data/examples/batch_training.ipynb +++ b/doc/source/data/examples/batch_training.ipynb @@ -6,7 +6,7 @@ "source": [ "(mmt-datasets)=\n", "\n", - "# Batch Training with Ray Datasets" + "# Batch Training with Ray Data" ] }, { @@ -15,7 +15,7 @@ "source": [ "**Batch training** and tuning are common tasks in simple machine learning use-cases such as time series forecasting. They require fitting of simple models on data batches corresponding to different locations, products, etc. Batch training can take less time to process all the data at once, but only if those batches can run in parallel!\n", "\n", - "This notebook showcases how to conduct batch training regression algorithms from [XGBoost](https://docs.ray.io/en/latest/tune/examples/tune-xgboost.html) and [Scikit-learn](https://docs.ray.io/en/latest/ray-more-libs/joblib.html) with **[Ray Datasets](https://docs.ray.io/en/latest/data/dataset.html)**. **XGBoost** is a popular open-source library used for regression and classification. **Scikit-learn** is a popular open-source library with a vast assortment of well-known ML algorithms.\n", + "This notebook showcases how to conduct batch training regression algorithms from [XGBoost](https://docs.ray.io/en/latest/tune/examples/tune-xgboost.html) and [Scikit-learn](https://docs.ray.io/en/latest/ray-more-libs/joblib.html) with **[Ray Data](data)**. **XGBoost** is a popular open-source library used for regression and classification. 
**Scikit-learn** is a popular open-source library with a vast assortment of well-known ML algorithms.\n", "\n", "```{tip}\n", "The workload showcased in this notebook can be expressed using different Ray components, such as Ray Data, Ray Tune and Ray Core.\n", @@ -37,11 +37,11 @@ "# Contents\n", "\n", "In this this tutorial, you will learn about:\n", - " 1. [Creating a Ray Dataset](#create_ds)\n", - " 2. [Filtering a Ray Dataset on Read](#filter_ds)\n", - " 3. [Inspecting a Ray Dataset](#inspect_ds)\n", - " 4. [Transforming a Ray Dataset in parallel](#transform_ds)\n", - " 5. [Batch training with Ray Datasets in parallel](#batch_train_ds)\n", + " 1. [Creating a Dataset](#create_ds)\n", + " 2. [Filtering a Dataset on Read](#filter_ds)\n", + " 3. [Inspecting a Dataset](#inspect_ds)\n", + " 4. [Transforming a Dataset in parallel](#transform_ds)\n", + " 5. [Batch training with Ray Data in parallel](#batch_train_ds)\n", " 6. [Load a saved model and perform batch prediction](#load_model)\n", "\n", "# Walkthrough\n", @@ -201,7 +201,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Creating Ray Dataset " + "## Creating a Dataset " ] }, { @@ -209,10 +209,10 @@ "metadata": {}, "source": [ "```{tip}\n", - "Ray Datasets uses PyArrow dataset and table for reading or writing large parquet files. Its native multithreaded C++ adpater is faster than pandas `read_parquet`, even using `engine='pyarrow'`. For more details see [Ray Datasets User Guide](https://docs.ray.io/en/latest/data/user-guide.html).\n", + "Ray Data uses PyArrow dataset and table for reading or writing large parquet files. Its native multithreaded C++ adpater is faster than pandas `read_parquet`, even using `engine='pyarrow'`. For more details see [Ray Data User Guide](https://docs.ray.io/en/latest/data/user-guide.html).\n", "```\n", "\n", - "[Ray Datasets](datasets) are the standard way to load and exchange data in Ray libraries and applications. 
We will use the [Ray Dataset APIs](dataset-api) to read the data and quickly inspect it.\n", + "[Ray Data](data) is the standard way to load and exchange data in Ray libraries and applications. We will use the [Ray Data APIs](data-api) to read the data and quickly inspect it.\n", "\n", "First, we will define some global variables we will use throughout the notebook, such as the list of S3 links to the files making up the dataset and the possible location IDs." ] @@ -289,7 +289,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Filtering a Ray Dataset on Read \n", + "### Filtering a Dataset on Read \n", "\n", "Normally there is some last-mile data processing required before training. Let's just assume we know the data processing steps are:\n", "- Drop negative trip distances, 0 fares, 0 passengers.\n", @@ -300,10 +300,10 @@ "Instead of blindly reading all the data, it would be better if we only read the data we needed. This is similar concept to SQL `SELECT only rows, columns you need` vs `SELECT *`.\n", "\n", "```{tip}\n", - "Best practice is to filter as much as you can directly in the Ray Dataset `read_parquet()`.\n", + "Best practice is to filter as much as you can directly in the Dataset `read_parquet()`.\n", "```\n", "\n", - "Note that Ray Datasets' Parquet reader supports projection (column selection) and row filter pushdown, where we can push the above column selection and the row-based filter to the Parquet read. If we specify column selection at Parquet read time, the unselected columns won't even be read from disk. This can save a lot of memory, especially with big datasets, and allow us to avoid OOM issues.\n", + "Note that Ray Data' Parquet reader supports projection (column selection) and row filter pushdown, where we can push the above column selection and the row-based filter to the Parquet read. If we specify column selection at Parquet read time, the unselected columns won't even be read from disk. 
This can save a lot of memory, especially with big datasets, and allow us to avoid OOM issues.\n", "\n", "The row-based filter is specified via [Arrow's dataset field expressions](https://arrow.apache.org/docs/6.0/python/generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression). \n" ] @@ -368,11 +368,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Inspecting a Ray Dataset \n", + "### Inspecting a Dataset \n", "\n", - "Let's get some basic statistics about our newly created Ray Dataset.\n", + "Let's get some basic statistics about our newly created Dataset.\n", "\n", - "As our Ray Dataset is backed by Parquet, we can obtain the number of rows from the metadata without triggering a full data read.\n" + "As our Dataset is backed by Parquet, we can obtain the number of rows from the metadata without triggering a full data read.\n" ] }, { @@ -456,9 +456,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Transforming a Ray Dataset in parallel using custom functions \n", + "### Transforming a Dataset in parallel using custom functions \n", "\n", - "Ray Datasets allows you to specify custom data transform functions. These [user defined functions (UDFs)](transforming_datasets) can be called using `Dataset.map_batches(my_function)`. The transformation will be conducted in parallel for each data batch.\n", + "Ray Data allows you to specify custom data transform functions. These [user defined functions (UDFs)](transforming_data) can be called using `Dataset.map_batches(my_function)`. The transformation will be conducted in parallel for each data batch.\n", "\n", "```{tip}\n", "You may need to call `Dataset.repartition(n)` first to split the Dataset into more blocks internally. By default, each block corresponds to one file. 
The upper bound of parallelism is the number of blocks.\n", @@ -554,7 +554,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Batch training with Ray Datasets " + "## Batch training with Ray Data " ] }, { @@ -685,7 +685,7 @@ "source": [ "The `train_and_evaluate` function contains the logic for train-test splitting and fitting of a model using the `fit_and_score_sklearn` function.\n", "\n", - "As an input, this function takes in a pandas DataFrame. When we call `Dataset.map_batches` or `Dataset.groupby().map_groups()`, the Dataset will be batched into multiple pandas DataFrames and this function will run for each batch in parallel. We will return the model and its error. Those results will be collected back into a Ray Dataset." + "As an input, this function takes in a pandas DataFrame. When we call `Dataset.map_batches` or `Dataset.groupby().map_groups()`, the Dataset will be batched into multiple pandas DataFrames and this function will run for each batch in parallel. We will return the model and its error. Those results will be collected back into a Dataset." ] }, { @@ -732,7 +732,7 @@ "Recall how we wrote a data transform `transform_batch` UDF? It was called with pattern:\n", "- `Dataset.map_batches(transform_batch, batch_format=\"pandas\")`\n", "\n", - "Similarly, we can write a custom groupy-aggregate function `agg_func` which will run for each [Ray Dataset *group-by*](datasets-groupbys) group in parallel. The usage pattern is:\n", + "Similarly, we can write a custom groupy-aggregate function `agg_func` which will run for each [Dataset *group-by*](data-groupbys) group in parallel. The usage pattern is:\n", "- `Dataset.groupby(column).map_groups(agg_func, batch_format=\"pandas\")`.\n", "\n", "In the cell below, we define our custom `agg_func`." 
@@ -745,7 +745,7 @@ "outputs": [], "source": [ "# A Pandas DataFrame aggregation function for processing\n", - "# grouped batches of Ray Dataset data.\n", + "# grouped batches of Dataset data.\n", "def agg_func(df: pd.DataFrame) -> pd.DataFrame:\n", " location_id = df[\"dropoff_location_id\"][0]\n", "\n", @@ -772,9 +772,9 @@ "source": [ "### Run batch training using `map_groups`\n", "\n", - "The main \"driver code\" reads each Parquet file (where each file corresponds to one month of NYC taxi data) into a Ray Dataset `ds`. \n", + "The main \"driver code\" reads each Parquet file (where each file corresponds to one month of NYC taxi data) into a Dataset `ds`. \n", "\n", - "Then we use Ray Dataset *group-by* to map each group into a batch of data and run `agg_func` on each grouping in parallel by calling `ds.groupby(\"dropoff_location_id\").map_groups(agg_func, batch_format=\"pandas\")`." + "Then we use Dataset *group-by* to map each group into a batch of data and run `agg_func` on each grouping in parallel by calling `ds.groupby(\"dropoff_location_id\").map_groups(agg_func, batch_format=\"pandas\")`." ] }, { @@ -813,12 +813,12 @@ "\n", "start = time.time()\n", "\n", - "# Read data into Ray Dataset\n", + "# Read data into Dataset\n", "# ds = pushdown_read_data(s3_files, sample_locations)\\\n", "# .repartition(14)\\\n", "# .ds.map_batches(transform_df, batch_format=\"pandas\")\n", "\n", - "# Use Ray Dataset groupby.map_groups() to process each group in parallel and return a Ray Dataset.\n", + "# Use Dataset groupby.map_groups() to process each group in parallel and return a Dataset.\n", "results = ds.groupby(\"dropoff_location_id\").map_groups(agg_func, batch_format=\"pandas\")\n", "\n", "total_time_taken = time.time() - start\n", diff --git a/doc/source/data/examples/index.rst b/doc/source/data/examples/index.rst index 3bcec2037a62..4a4fcb39feb0 100644 --- a/doc/source/data/examples/index.rst +++ b/doc/source/data/examples/index.rst @@ -1,74 +1,64 @@ -.. 
_datasets-examples-ref: +.. _data-examples-ref: ======== Examples ======== .. tip:: Check out the Datasets :ref:`User Guide ` to learn more about - Datasets' features in-depth. + Dataset features in-depth. -.. _datasets-recipes: +.. _data-recipes: Simple Data Processing Examples ------------------------------- -Ray Datasets is a data processing engine that supports multiple data +Ray Data is a data processing engine that supports multiple data modalities and types. Here you will find a few end-to-end examples of some basic data -processing with Ray Datasets on tabular data, text (coming soon!), and imagery (coming +processing with Ray Data on tabular data, text (coming soon!), and imagery (coming soon!). -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/taxi.png - - +++ - .. link-button:: nyc_taxi_basic_processing - :type: ref - :text: Processing the NYC taxi dataset - :classes: btn-link btn-block stretched-link - --- - :img-top: /images/taxi.png - - +++ - .. link-button:: batch_training - :type: ref - :text: Batch Training with Ray Datasets - :classes: btn-link btn-block stretched-link - --- - :img-top: /images/ocr.jpg - - +++ - .. link-button:: ocr_example - :type: ref - :text: Scaling OCR with Ray Datasets - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 3 + :gutter: 2 + :class-container: container pb-4 + + .. grid-item-card:: + :img-top: /images/taxi.png + :class-img-top: pt-5 w-75 d-block mx-auto + + .. button-ref:: nyc_taxi_basic_processing + + Processing the NYC taxi dataset + + .. grid-item-card:: + :img-top: /images/taxi.png + :class-img-top: pt-5 w-75 d-block mx-auto + + .. button-ref:: batch_training + + Batch Training with Ray Data + + .. grid-item-card:: + :img-top: /images/ocr.jpg + :class-img-top: pt-5 w-75 d-block mx-auto + + .. button-ref:: ocr_example + + Scaling OCR with Ray Data + Other Examples -------------- -.. 
panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/dataset-repeat-2.svg - - +++ - .. link-button:: advanced-pipelines - :type: ref - :text: Advanced Pipeline Examples - :classes: btn-link btn-block stretched-link - --- - :img-top: ../images/dataset-arch.svg - - +++ - .. link-button:: random-access - :type: ref - :text: Random Data Access (Experimental) - :classes: btn-link btn-block stretched-link + +.. grid:: 1 2 3 3 + :gutter: 2 + :class-container: container pb-4 + + .. grid-item-card:: + :img-top: ../images/dataset-arch.svg + :class-img-top: pt-5 w-75 d-block mx-auto + + .. button-ref:: random-access + + Random Data Access (Experimental) diff --git a/doc/source/data/examples/nyc_taxi_basic_processing.ipynb b/doc/source/data/examples/nyc_taxi_basic_processing.ipynb index 15b2983aaeae..599a479327c0 100644 --- a/doc/source/data/examples/nyc_taxi_basic_processing.ipynb +++ b/doc/source/data/examples/nyc_taxi_basic_processing.ipynb @@ -25,9 +25,9 @@ "id": "af627a74", "metadata": {}, "source": [ - "# Processing NYC taxi data using Ray Datasets\n", + "# Processing NYC taxi data using Ray Data\n", "\n", - "The [NYC Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) is a popular tabular dataset. In this example, we demonstrate some basic data processing on this dataset using Ray Datasets.\n", + "The [NYC Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) is a popular tabular dataset. 
In this example, we demonstrate some basic data processing on this dataset using Ray Data.\n", "\n", "## Overview\n", "\n", @@ -37,8 +37,8 @@ " - Calculating some common global and grouped statistics on the dataset\n", " - Dropping columns and rows\n", " - Adding a derived column\n", - " - Shuffling the dataset\n", - " - Sharding the dataset and feeding it to parallel consumers (trainers)\n", + " - Shuffling the data\n", + " - Sharding the data and feeding it to parallel consumers (trainers)\n", " - Applying batch (offline) inference to the data\n", "\n", "## Walkthrough\n", @@ -68,9 +68,7 @@ "source": [ "### Reading and Inspecting the Data\n", "\n", - "Next, we read a few of the files from the dataset. This read is lazy, where reading and all future transformations are delayed until a downstream operation triggers execution (e.g. consuming the data with {meth}`ds.take() `)\n", - "\n", - "We could process the entire Dataset in a streaming fashion using pipelining or all of it in parallel using a multi-node Ray cluster, but we save that for our large-scale examples." + "Next, we read a few of the files from the dataset. This read is lazy, where reading and all future transformations are delayed until a downstream operation triggers execution (e.g. consuming the data with {meth}`ds.take() `)\n" ] }, { @@ -332,7 +330,7 @@ "For the NYC taxi dataset, instead of reading individual per-month Parquet files, we can read the entire 2009 directory.\n", "\n", "```{warning}\n", - "This could be a lot of data (downsampled with 0.01 ratio leads to ~50.2 MB on disk, ~147 MB in memory), so be careful triggering full reads on a limited-memory machine! 
This is one place where Datasets' lazy reading comes in handy: Datasets will not execute any read tasks eagerly and will execute the minimum number of file reads to satisfy downstream operations, which allows us to inspect a subset of the data without having to read the entire dataset.\n", + "This could be a lot of data (downsampled with 0.01 ratio leads to ~50.2 MB on disk, ~147 MB in memory), so be careful triggering full reads on a limited-memory machine! This is one place where Dataset's lazy reading comes in handy: Dataset will not execute any read tasks eagerly and will execute the minimum number of file reads to satisfy downstream operations, which allows us to inspect a subset of the data without having to read the entire dataset.\n", "```" ] }, @@ -358,7 +356,7 @@ "id": "6616a15d", "metadata": {}, "source": [ - "The metadata that Datasets prints in its repr is guaranteed to not trigger reads of all files; data such as the row count and the schema is pulled directly from the Parquet metadata." + "The metadata that Dataset prints in its repr is guaranteed to not trigger reads of all files; data such as the row count and the schema is pulled directly from the Parquet metadata." ] }, { @@ -387,7 +385,7 @@ "id": "e61dd6d7", "metadata": {}, "source": [ - "That's a lot of rows! Since we're not going to use this full-year dataset, let's now delete this dataset to free up some memory in our Ray cluster." + "That's a lot of rows! Since we're not going to use this full-year data, let's now delete this dataset to free up some memory in our Ray cluster." ] }, { @@ -593,7 +591,7 @@ "id": "0ade2a72", "metadata": {}, "source": [ - "See {ref}`Transforming Datasets ` for more information on how we can process our data with Ray Datasets." + "See {ref}`Transforming Data ` for more information on how we can process our data with Ray Data." 
] }, { @@ -603,7 +601,7 @@ "source": [ "#### Advanced Aside - Projection and Filter Pushdown\n", "\n", - "Note that Ray Datasets' Parquet reader supports projection (column selection) and row filter pushdown, where we can push the above column selection and the row-based filter to the Parquet read. If we specify column selection at Parquet read time, the unselected columns won't even be read from disk!\n", + "Note that Ray Data' Parquet reader supports projection (column selection) and row filter pushdown, where we can push the above column selection and the row-based filter to the Parquet read. If we specify column selection at Parquet read time, the unselected columns won't even be read from disk!\n", "\n", "The row-based filter is specified via\n", "[Arrow's dataset field expressions](https://arrow.apache.org/docs/6.0/python/generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression). See the {ref}`feature guide for reading Parquet data ` for more information." @@ -832,7 +830,7 @@ "```{tip}\n", "Refer to the blog on [Model Batch Inference in Ray](https://www.anyscale.com/blog/model-batch-inference-in-ray-actors-actorpool-and-datasets) for an overview of batch inference strategies in Ray and additional examples.\n", "```\n", - "After we've trained a model, we may want to perform batch (offline) inference on such a tabular dataset. With Ray Datasets, this is as easy as a {meth}`ds.map_batches() ` call!\n", + "After we've trained a model, we may want to perform batch (offline) inference on such a tabular dataset. With Ray Data, this is as easy as a {meth}`ds.map_batches() ` call!\n", "\n", "First, we define a callable class that will cache the loading of the model in its constructor." 
] @@ -914,7 +912,7 @@ } ], "source": [ - "ds.map_batches(BatchInferModel, batch_size=2048, compute=\"actors\").take()" + "ds.map_batches(BatchInferModel, batch_size=2048, compute=ray.data.ActorPoolStrategy()).take()" ] }, { @@ -976,7 +974,7 @@ " BatchInferModel,\n", " batch_size=256,\n", " #num_gpus=1, # Uncomment this to run this on GPUs!\n", - " compute=\"actors\",\n", + " compute=ray.data.ActorPoolStrategy(),\n", ").take()" ] }, diff --git a/doc/source/data/examples/ocr_example.ipynb b/doc/source/data/examples/ocr_example.ipynb index c7bd6ebea7fd..c36825ba413d 100644 --- a/doc/source/data/examples/ocr_example.ipynb +++ b/doc/source/data/examples/ocr_example.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, - "id": "905f9cad", + "execution_count": null, + "id": "49fe2185", "metadata": { "tags": [ "remove-cell" @@ -22,19 +22,19 @@ }, { "cell_type": "markdown", - "id": "6945c179", + "id": "2a344178", "metadata": {}, "source": [ - "# Scaling OCR using Ray Datasets\n", + "# Scaling OCR using Ray Data\n", "\n", - "In this example, we will show you how to run optical character recognition (OCR) on a set of documents and analyze the resulting text with the natural language processing library [spaCy](https://spacy.io/). Running OCR on a large dataset is very computationally expensive, so using Ray for distributed processing can really speed up the analysis. Ray Datasets makes it easy to compose the different steps of the pipeline, namely the OCR and the natural language processing. Ray Datasets' actor support also allows us to be more efficient by sharing the spaCy NLP context between several datapoints.\n", + "In this example, we will show you how to run optical character recognition (OCR) on a set of documents and analyze the resulting text with the natural language processing library [spaCy](https://spacy.io/). 
Running OCR on a large dataset is very computationally expensive, so using Ray for distributed processing can really speed up the analysis. Ray Data makes it easy to compose the different steps of the pipeline, namely the OCR and the natural language processing. Ray Data' actor support also allows us to be more efficient by sharing the spaCy NLP context between several datapoints.\n", "\n", "To make it more interesting, we will run the analysis on the [LightShot](https://www.kaggle.com/datasets/datasnaek/lightshot) dataset. It is a large publicly available OCR dataset with a wide variety of different documents, all of them screenshots of various forms. It is easy to replace that dataset with your own data and adapt the example to your own use cases!\n", "\n", "## Overview\n", "\n", "This tutorial will cover:\n", - " - Creating a Ray Dataset that represents the images in the dataset\n", + " - Creating a Dataset that represents the images in the dataset\n", " - Running the computationally expensive OCR process on each image in the dataset in parallel\n", " - Filtering the dataset by keeping only images that contain text\n", " - Performing various NLP operations on the text\n", @@ -43,75 +43,42 @@ "\n", "Let's start by preparing the dependencies and downloading the dataset. First we install the OCR software `tesseract` and its Python client:\n", "\n", - "````{tabbed} macOS\n", + "``````{tab-set}\n", + "\n", + "````{tab-item} macOS\n", "```\n", "brew install tesseract\n", "pip install pytesseract\n", "```\n", "````\n", "\n", - "````{tabbed} linux\n", + "````{tab-item} linux\n", "```\n", "sudo apt-get install tesseract-ocr\n", "pip install pytesseract\n", "```\n", "````\n", "\n", + "``````\n", + "\n", "By default, the following example will run on a tiny dataset we provide. 
If you want to run it on the full dataset, we recommend to run it on a cluster since processing all the images with tesseract takes a lot of time.\n", "\n", "````{note}\n", "If you want to run the example on the full [LightShot](https://www.kaggle.com/datasets/datasnaek/lightshot) dataset, you need to download the dataset and extract it. You can extract the dataset by first running `unzip archive.zip` and then `unrar x LightShot13k.rar .` and then you can upload the dataset to S3 with `aws s3 cp LightShot13k/ s3:/// --recursive`.\n", - "````" - ] - }, - { - "cell_type": "markdown", - "id": "c08612ac", - "metadata": {}, - "source": [ - "Let's now import Ray and initialize a local Ray cluster. If you want to run OCR at a very large scale, you should run this workload on a multi-node cluster." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "37f22aa8", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-04 14:35:19,444\tINFO services.py:1476 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" - ] - }, - { - "data": { - "text/plain": [ - "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.7.4', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '127.0.0.1', 'raylet_ip_address': '127.0.0.1', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-07-04_14-35-16_950060_89285/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-07-04_14-35-16_950060_89285/sockets/raylet', 'webui_url': '127.0.0.1:8265', 'session_dir': '/tmp/ray/session_2022-07-04_14-35-16_950060_89285', 'metrics_export_port': 60416, 'gcs_address': '127.0.0.1:61663', 'address': '127.0.0.1:61663', 'node_id': 'b6c981243d51558d13e4290f0f63552a6126f8a8d9e472baafe9dd5b'})" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + 
"````\n", + "\n", + "\n", + "Let's now import Ray and initialize a local Ray cluster. If you want to run OCR at a very large scale, you should run this workload on a multi-node cluster.\n", + "\n", + "```python\n", "# Import ray and initialize a local Ray cluster.\n", "import ray\n", - "ray.init()" - ] - }, - { - "cell_type": "markdown", - "id": "ee90daa8", - "metadata": {}, - "source": [ + "ray.init()\n", + "```\n", + "\n", "### Running the OCR software on the data\n", "\n", - "We can now use the {meth}`ray.data.read_binary_files ` function to read all the images from S3. We set the `include_paths=True` option to create a dataset of the S3 paths and image contents. We then run the {meth}`ds.map ` function on this dataset to execute the actual OCR process on each file and convert the screen shots into text. This will create a tabular dataset with columns `path` and `text`, see also [](transform_datasets_row_output_types).\n", + "We can now use the {meth}`ray.data.read_binary_files ` function to read all the images from S3. We set the `include_paths=True` option to create a dataset of the S3 paths and image contents. We then run the {meth}`ds.map ` function on this dataset to execute the actual OCR process on each file and convert the screen shots into text. This will create a tabular dataset with columns `path` and `text`, see also [](transforming_data).\n", "\n", "````{note}\n", "If you want to load the data from a private bucket, you have to run\n", @@ -125,25 +92,9 @@ " secret_key=\"...\",\n", " session_token=\"...\"))\n", "```\n", - "````" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d31d3303", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-04 14:35:53,683\tWARNING read_api.py:256 -- The number of blocks in this dataset (3) limits its parallelism to 3 concurrent tasks. This is much less than the number of available CPU slots in the cluster. 
Use `.repartition(n)` to increase the number of dataset blocks.\n", - "Read->Map: 100%|██████████| 3/3 [00:07<00:00, 2.34s/it]\n" - ] - } - ], - "source": [ + "````\n", + "\n", + "```python\n", "from io import BytesIO\n", "from PIL import Image\n", "import pytesseract\n", @@ -159,46 +110,25 @@ " \"s3://anonymous@air-example-data/ocr_tiny_dataset\",\n", " include_paths=True)\n", "\n", - "results = ds.map(perform_ocr)" - ] - }, - { - "cell_type": "markdown", - "id": "e22e7cd7", - "metadata": {}, - "source": [ + "results = ds.map(perform_ocr)\n", + "```\n", + "\n", "Let us have a look at some of the data points with the {meth}`take ` function." ] }, { "cell_type": "code", - "execution_count": 6, - "id": "5518b831", + "execution_count": null, + "id": "45aa1983", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/gnome_screenshot.png',\n", - " 'text': '= Cancel\\n\\nTake Screenshot\\n© Grab the whole screen\\n\\nGrab the current window\\n\\n|_| eeeeeeter\\n\\nGrab after a delay of 0\\n\\nEffects\\nInclude pointer\\n\\n¥ Include the window border\\n\\nApply effect: None Sa\\n\\n+. seconds\\n'}),\n", - " ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/miranda_screenshot.png',\n", - " 'text': '© Viktor (Online) : Message Session\\n\\n“etto| © Whter | steno\\n\\nremus\\ntet? Fiviha\\n\\n17: dokonca to vie aj video @\\nViktor\\n\\n1818. 
55 samozrejme\\n\\n1818: len moj brat to skusal\\nremus\\n\\nWA\\n\\n098003 —\\n\\nseettsgmailcom [0]\\n\\nonline\\n\\nHacemen\\n@ Ce\\n\\nieFFo\\n169 6 je <>vin ©®\\n\\nBe 22\\n\\naway\\n\\nTue\\nhn\\n\\n& Wee\\n\\nYep, Tm here\\n\\n&\\nea\\na\\nLS]\\n\\n'}),\n", - " ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/qemu_screenshot.png',\n", - " 'text': 'File Edit View Bookmarks\\n\\n[i New Tab [If] split view ~\\n\\n43044 kousekip\\n\\nPlugins\\n\\nkousekip:ako-kaede-mirai(htop)\\n\\nkousekip:ako-kaede-mirai(qemu-system-x86)\\n\\nSettings\\n\\nHelp\\n\\nkousekip:ako-kaede-miral(htop) — Konsole vax\\n\\nFl Paste Q Find\\n\\nEMU vax\\n\\nMachine View\\n\\nApplications Places System @)C) Fri Feb 18, 13:56\\n\\nTerminal\\n\\nroot root\\nroot sys\\nroot sys\\nroot sys\\nroot sys\\nroot sys\\nroot root\\nroot sys\\nroot bin\\nroot root\\nroot sys\\nroot root\\nroot sys\\nroot sys\\nroot root\\nroot root\\nroot root\\nroot sys\\nroot root\\nroot sys\\nroot sys\\n2 root —sys\\nkousekip@ako-kaede-mirai-sun:~$ If\\n\\nbin -> ./usr/bin\\nboot\\ndev\\ndevices\\netc\\nexport\\nhome\\nkernel\\nlib\\nmedia\\nmnt\\n\\nnet\\nopt\\nplatform\\nproc\\nroot\\nrpool\\nsbin\\nsystem\\n‘tmp\\nusr\\nvar\\n\\n@kousekip\\nidesktop\\n\\n©\\n\\n©\\n\\nBUNwnSunennh SnuNaeon\\n\\n(Documents\\nDownloads\\nGaMusic\\n\\n5\\n\\nBitrash\\nDevices\\n(Floppy Drive\\nNetwork\\n\\n@ Browse Netw...\\n\\n9\\n9\\n6\\n4\\n9\\n\\n53\\n5\\n6\\n4\\n9\\n10\\n0\\n6\\n18\\n7\\n\\nfovey\\\\aliarel(elare)\\n\\n'})]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "results.take(10)" ] }, { "cell_type": "markdown", - "id": "67ed5a8d", + "id": "36741417", "metadata": {}, "source": [ "### Saving and loading the result of the OCR run\n", @@ -207,57 +137,29 @@ "Saving the dataset is optional, you can also continue with the in-memory data without persisting it to storage.\n", "````\n", "\n", - "We can save the result of running tesseract on the 
dataset on disk so we can read it out later if we want to re-run the NLP analysis without needing to re-run the OCR (which is very expensive on the whole dataset). This can be done with the {meth}`write_parquet ` function:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7c2d8abe", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Write Progress: 100%|██████████| 3/3 [00:00<00:00, 207.11it/s]\n" - ] - } - ], - "source": [ + "We can save the result of running tesseract on the dataset on disk so we can read it out later if we want to re-run the NLP analysis without needing to re-run the OCR (which is very expensive on the whole dataset). This can be done with the {meth}`write_parquet ` function:\n", + "\n", + "```python\n", "import os\n", - "results.write_parquet(os.path.expanduser(\"~/LightShot13k_results\"))" - ] - }, - { - "cell_type": "markdown", - "id": "7a387f42", - "metadata": {}, - "source": [ - "You can later reload the dataset with the {meth}`read_parquet ` function:" + "results.write_parquet(os.path.expanduser(\"~/LightShot13k_results\"))\n", + "```\n", + "\n", + "You can later reload the data with the {meth}`read_parquet ` function:" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "af63be93", + "execution_count": null, + "id": "c8d419fa", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-04 14:36:13,515\tWARNING read_api.py:256 -- The number of blocks in this dataset (6) limits its parallelism to 6 concurrent tasks. This is much less than the number of available CPU slots in the cluster. 
Use `.repartition(n)` to increase the number of dataset blocks.\n" - ] - } - ], + "outputs": [], "source": [ "results = ray.data.read_parquet(os.path.expanduser(\"~/LightShot13k_results\"))" ] }, { "cell_type": "markdown", - "id": "f6a7bf0f", + "id": "decffa3c", "metadata": {}, "source": [ "### Process the extracted text data with spaCy\n", @@ -274,7 +176,7 @@ { "cell_type": "code", "execution_count": null, - "id": "69321ee3", + "id": "1604b02f", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +187,7 @@ }, { "cell_type": "markdown", - "id": "b01d2add", + "id": "fc96fb8b", "metadata": {}, "source": [ "This is some code to determine the language of a piece of text:" @@ -293,21 +195,10 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "ee4cc430", + "execution_count": null, + "id": "3374fc47", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'language': 'en', 'score': 0.9999976594668697}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import spacy\n", "from spacy.language import Language\n", @@ -325,39 +216,20 @@ }, { "cell_type": "markdown", - "id": "95ab0646", + "id": "05d218ee", "metadata": {}, "source": [ "It gives both the language and a confidence score for that language.\n", "\n", - "In order to run the code on the dataset, we should use Ray Datasets' built in support for actors since the `nlp` object is not serializable and we want to avoid having to recreate it for each individual sentence. We also batch the computation with the {meth}`map_batches ` function to ensure spaCy can use more efficient vectorized operations where available:" + "In order to run the code on the dataset, we should use Ray Data' built in support for actors since the `nlp` object is not serializable and we want to avoid having to recreate it for each individual sentence. 
We also batch the computation with the {meth}`map_batches ` function to ensure spaCy can use more efficient vectorized operations where available:" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "85a4a414", + "execution_count": null, + "id": "30648ced", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Read progress: 100%|██████████| 6/6 [00:00<00:00, 485.55it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 6/6 [00:06<00:00, 1.04s/it]\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset(num_blocks=6, num_rows=6, schema={path: object, text: object, language: object, score: float64})" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import spacy\n", "from spacy.language import Language\n", @@ -379,12 +251,12 @@ " df[\"score\"] = [doc._.language[\"score\"] for doc in docs]\n", " return df\n", "\n", - "results.limit(10).map_batches(SpacyBatchInference, compute=\"actors\")" + "results.limit(10).map_batches(SpacyBatchInference, compute=ray.data.ActorPoolStrategy())" ] }, { "cell_type": "markdown", - "id": "ca995036", + "id": "490bca7c", "metadata": {}, "source": [ "We can now get language statistics over the whole dataset:" @@ -392,40 +264,18 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "f64f8b3c", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Read: 100%|██████████| 6/6 [00:00<00:00, 19.95it/s]\n", - "Map Progress (1 actors 1 pending): 100%|██████████| 6/6 [00:05<00:00, 1.09it/s]\n", - "Sort Sample: 100%|██████████| 6/6 [00:00<00:00, 919.27it/s]\n", - "Shuffle Map: 100%|██████████| 6/6 [00:00<00:00, 159.14it/s]\n", - "Shuffle Reduce: 100%|██████████| 6/6 [00:00<00:00, 364.59it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'language': 'af', 'count()': 2}\n", - "{'language': 'en', 'count()': 4}\n" 
- ] - } - ], + "execution_count": null, + "id": "346ac322", + "metadata": {}, + "outputs": [], "source": [ - "languages = results.map_batches(SpacyBatchInference, compute=\"actors\")\n", + "languages = results.map_batches(SpacyBatchInference, compute=ray.data.ActorPoolStrategy())\n", "languages.groupby(\"language\").count().show()" ] }, { "cell_type": "markdown", - "id": "0d638758", + "id": "c9453342", "metadata": {}, "source": [ "````{note}\n", @@ -452,68 +302,15 @@ "{'language': 'nl', 'count()': 982}\n", "{'language': 'no', 'count()': 56}\n", "```\n", - "````" - ] - }, - { - "cell_type": "markdown", - "id": "9cc5ca11", - "metadata": {}, - "source": [ - "We can now filter to include only the English documents and also sort them according to their score." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8c4bd03d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Filter: 100%|██████████| 6/6 [00:00<00:00, 561.84it/s]\n", - "Sort Sample: 100%|██████████| 6/6 [00:00<00:00, 1311.81it/s]\n", - "Shuffle Map: 100%|██████████| 6/6 [00:00<00:00, 319.24it/s]\n", - "Shuffle Reduce: 100%|██████████| 6/6 [00:00<00:00, 450.79it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/gnome_screenshot.png',\n", - " 'text': '= Cancel\\n\\nTake Screenshot\\n© Grab the whole screen\\n\\nGrab the current window\\n\\n|_| eeeeeeter\\n\\nGrab after a delay of 0\\n\\nEffects\\nInclude pointer\\n\\n¥ Include the window border\\n\\nApply effect: None Sa\\n\\n+. seconds\\n',\n", - " 'language': 'en',\n", - " 'score': 0.9999976791815426}),\n", - " ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/gnome_screenshot.png',\n", - " 'text': '= Cancel\\n\\nTake Screenshot\\n© Grab the whole screen\\n\\nGrab the current window\\n\\n|_| eeeeeeter\\n\\nGrab after a delay of 0\\n\\nEffects\\nInclude pointer\\n\\n¥ Include the window border\\n\\nApply effect: None Sa\\n\\n+. 
seconds\\n',\n", - " 'language': 'en',\n", - " 'score': 0.9999965244942747}),\n", - " ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/miranda_screenshot.png',\n", - " 'text': '© Viktor (Online) : Message Session\\n\\n“etto| © Whter | steno\\n\\nremus\\ntet? Fiviha\\n\\n17: dokonca to vie aj video @\\nViktor\\n\\n1818. 55 samozrejme\\n\\n1818: len moj brat to skusal\\nremus\\n\\nWA\\n\\n098003 —\\n\\nseettsgmailcom [0]\\n\\nonline\\n\\nHacemen\\n@ Ce\\n\\nieFFo\\n169 6 je <>vin ©®\\n\\nBe 22\\n\\naway\\n\\nTue\\nhn\\n\\n& Wee\\n\\nYep, Tm here\\n\\n&\\nea\\na\\nLS]\\n\\n',\n", - " 'language': 'en',\n", - " 'score': 0.8571411027551514}),\n", - " ArrowRow({'path': 'air-example-data/ocr_tiny_dataset/miranda_screenshot.png',\n", - " 'text': '© Viktor (Online) : Message Session\\n\\n“etto| © Whter | steno\\n\\nremus\\ntet? Fiviha\\n\\n17: dokonca to vie aj video @\\nViktor\\n\\n1818. 55 samozrejme\\n\\n1818: len moj brat to skusal\\nremus\\n\\nWA\\n\\n098003 —\\n\\nseettsgmailcom [0]\\n\\nonline\\n\\nHacemen\\n@ Ce\\n\\nieFFo\\n169 6 je <>vin ©®\\n\\nBe 22\\n\\naway\\n\\nTue\\nhn\\n\\n& Wee\\n\\nYep, Tm here\\n\\n&\\nea\\na\\nLS]\\n\\n',\n", - " 'language': 'en',\n", - " 'score': 0.5714285419353925})]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "languages.filter(lambda row: row[\"language\"] == \"en\").sort(\"score\", descending=True).take(1000)" - ] - }, - { - "cell_type": "markdown", - "id": "8c05df96", - "metadata": {}, - "source": [ + "````\n", + "\n", + "\n", + "We can now filter to include only the English documents and also sort them according to their score.\n", + "\n", + "```python\n", + "languages.filter(lambda row: row[\"language\"] == \"en\").sort(\"score\", descending=True).take(1000)\n", + "```\n", + "\n", "If you are interested in this example and want to extend it, you can do the following for the full dataset:\n", "- go throught these results in order\n", "- create labels on whether 
the text is a chat conversation and then train a model like [Huggingface Transformers](https://huggingface.co/docs/transformers/) on the data.\n", @@ -523,23 +320,10 @@ } ], "metadata": { - "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" } }, "nbformat": 4, diff --git a/doc/source/data/examples/random-access.rst b/doc/source/data/examples/random-access.rst index 908cd8948bae..3c28fc8d1c3f 100644 --- a/doc/source/data/examples/random-access.rst +++ b/doc/source/data/examples/random-access.rst @@ -4,22 +4,22 @@ Random Data Access (Experimental) --------------------------------- -Any Arrow-format dataset can be enabled for random access by calling ``dataset.to_random_access_dataset(key="col_name")``. This partitions the dataset across the cluster by the given sort key, providing efficient random access to records via binary search. A number of worker actors are created, each of which has zero-copy access to the underlying sorted data blocks of the Dataset. +Any Arrow-format dataset can be enabled for random access by calling ``ds.to_random_access_dataset(key="col_name")``. This partitions the data across the cluster by the given sort key, providing efficient random access to records via binary search. A number of worker actors are created, each of which has zero-copy access to the underlying sorted data blocks of the Dataset. .. code-block:: python # Generate a dummy embedding table as an example. 
- ds = ray.data.range_table(100) - ds = ds.add_column("embedding", lambda b: b["value"] ** 2) - # -> schema={value: int64, embedding: int64} + ds = ray.data.range(100) + ds = ds.add_column("embedding", lambda b: b["id"] ** 2) + # -> schema={id: int64, embedding: int64} # Enable random access on the dataset. This launches a number of actors # spread across the cluster that serve random access queries to the data. - rmap = ds.to_random_access_dataset(key="value", num_workers=4) + rmap = ds.to_random_access_dataset(key="id", num_workers=4) # Example of a point query by key. ray.get(rmap.get_async(2)) - # -> {"value": 2, "embedding": 4} + # -> {"id": 2, "embedding": 4} # Queries to missing keys return None. ray.get(rmap.get_async(-1)) @@ -27,14 +27,14 @@ Any Arrow-format dataset can be enabled for random access by calling ``dataset.t # Example of a multiget query. rmap.multiget([4, 2]) - # -> [{"value": 4, "embedding": 16}, {"value": 2, "embedding": 4}] + # -> [{"id": 4, "embedding": 16}, {"id": 2, "embedding": 4}] Similar to Dataset, a RandomAccessDataset can be passed to and used from any Ray actor or task. Architecture ------------ -RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source dataset found on its node. In addition, it is ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. +RandomAccessDataset spreads its workers evenly across the cluster. Each worker fetches and pins in shared memory all blocks of the sorted source data found on its node. In addition, it is ensured that each block is assigned to at least one worker. A central index of block to key-range assignments is computed, which is used to serve lookups. 
Lookups occur as follows: @@ -68,4 +68,4 @@ It is important to note that the client (Ray worker process) can also be a bottl Fault Tolerance --------------- -Currently, RandomAccessDataset is not fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source dataset. +Currently, RandomAccessDataset is not fault-tolerant. Losing any of the worker actors invalidates the dataset, and it must be re-created from the source data. diff --git a/doc/source/data/faq.rst b/doc/source/data/faq.rst index ad628f0cbadc..7f1af6ec80a7 100644 --- a/doc/source/data/faq.rst +++ b/doc/source/data/faq.rst @@ -1,13 +1,13 @@ -.. _datasets_faq: +.. _data_faq: === FAQ === -These are some Frequently Asked Questions that we've seen pop up for Ray Datasets. +These are some Frequently Asked Questions that we've seen pop up for Ray Data. .. note:: - For a general conceptual overview of Ray Datasets, see our + For a general conceptual overview of Ray Data, see our :ref:`Key Concepts docs `. If you still have questions after reading this FAQ, please reach out on @@ -18,10 +18,10 @@ If you still have questions after reading this FAQ, please reach out on :depth: 2 -What problems does Ray Datasets solve? +What problems does Ray Data solve? ====================================== -Ray Datasets aims to solve the problems of slow, resource-inefficient, unscalable data +Ray Data aims to solve the problems of slow, resource-inefficient, unscalable data loading and preprocessing pipelines for two core uses cases: 1. **Model training:** resulting in poor training throughput and low GPU utilization as @@ -29,38 +29,38 @@ loading and preprocessing pipelines for two core uses cases: 2. **Batch inference:** resulting in poor batch inference throughput and low GPU utilization. 
-In order to solve these problems without sacrificing usability, Ray Datasets simplifies +In order to solve these problems without sacrificing usability, Ray Data simplifies parallel and pipelined data processing on Ray, providing a higher-level API while internally handling data batching, task parallelism and pipelining, and memory management. -Who is using Ray Datasets? -========================== +Who is using Ray Data? +====================== -To give an idea of Datasets use cases, we list a few notable users running Datasets +To give an idea of Ray Data use cases, we list a few notable users running Ray Data integrations in production below: -* Predibase is using Ray Datasets for ML ingest and batch inference in their OSS +* Predibase is using Ray Data for ML ingest and batch inference in their OSS declarative ML framework, `Ludwig `__, and internally in their `AutoML product `__. -* Amazon is using Ray Datasets for large-scale I/O in their scalable data catalog, +* Amazon is using Ray Data for large-scale I/O in their scalable data catalog, `DeltaCAT `__. -* Shopify is using Ray Datasets for ML ingest and batch inference in their ML platform, +* Shopify is using Ray Data for ML ingest and batch inference in their ML platform, `Merlin `__. -* Ray Datasets is used as the data processing engine for the +* Ray Data is used as the data processing engine for the `Ray-based Apache Beam runner `__. -* Ray Datasets is used as the preprocessing and batch inference engine for +* Ray Data is used as the preprocessing and batch inference engine for :ref:`Ray AIR `. -If you're using Ray Datasets, please let us know about your experience on the +If you're using Ray Data, please let us know about your experience on the `Slack `__ or `Discourse `__; we'd love to hear from you! -What should I use Ray Datasets for? -=================================== +What should I use Ray Data for? 
+=============================== -Ray Datasets is the standard way to load, process, and exchange data in Ray libraries +Ray Data is the standard way to load, process, and exchange data in Ray libraries and applications, with a particular emphasis on ease-of-use, performance, and scalability in both data size and cluster size. Within that, Datasets is designed for two core uses cases: @@ -70,15 +70,15 @@ two core uses cases: * **Batch inference:** Loading, preprocessing, and performing parallel batch inference on data. -We have designed the Datasets APIs, data model, execution model, and +We have designed the Dataset APIs, data model, execution model, and integrations with these use cases in mind, and have captured these use cases in large-scale nightly tests to ensure that we're hitting our scalability, performance, and efficiency marks for these use cases. -What should I not use Ray Datasets for? -======================================= +What should I not use Ray Data for? +=================================== -Ray Datasets is not meant to be used for generic ETL pipelines (like Spark) or +Ray Data is not meant to be used for generic ETL pipelines (like Spark) or scalable data science (like Dask, Modin, or Mars). However, each of these frameworks are :ref:`runnable on Ray `, and Datasets integrates tightly with these frameworks, allowing for efficient exchange of distributed data partitions often @@ -90,7 +90,7 @@ Datasets is specifically targeting the ML ingest and batch inference use cases, with focus on data loading and last-mile preprocessing for ML pipelines. -For data loading for training, how does Ray Datasets compare to other solutions? +For data loading for training, how does Ray Data compare to other solutions? 
================================================================================ There are several ML framework-specific and general solutions for loading data into @@ -144,7 +144,7 @@ Petastorm ~~~~~~~~~ * **Supported data types:** `Petastorm `__ only supports Parquet data, while - Ray Datasets supports many file formats. + Ray Data supports many file formats. * **Lower overhead:** Datasets is lower overhead: it supports zero-copy exchange between processes, in contrast to the multi-processing-based pipelines used by Petastorm. * **No data processing:** Petastorm does not expose any data processing APIs. @@ -153,49 +153,49 @@ NVTabular ~~~~~~~~~ * **Supported data types:** `NVTabular `__ only supports tabular - (Parquet, CSV, Avro) data, while Ray Datasets supports many other file formats. + (Parquet, CSV, Avro) data, while Ray Data supports many other file formats. * **Lower overhead:** Datasets is lower overhead: it supports zero-copy exchange between processes, in contrast to the multi-processing-based pipelines used by Petastorm. * **Heterogeneous compute:** NVTabular doesn't support mixing heterogeneous resources in dataset transforms (e.g. - both CPU and GPU transformations), while Ray Datasets supports this. + both CPU and GPU transformations), while Ray Data supports this. * **ML-specific ops:** NVTabular has a bunch of great ML-specific preprocessing - operations; this is currently WIP for Ray Datasets: + operations; this is currently WIP for Ray Data: :ref:`Ray AIR preprocessors `. -.. _datasets_streaming_faq: +.. _streaming_faq: -For batch (offline) inference, why should I use Ray Datasets instead of an actor pool? +For batch (offline) inference, why should I use Ray Data instead of an actor pool? 
====================================================================================== -Ray Datasets provides its own autoscaling actor pool via the actor compute strategy for +Ray Data provides its own autoscaling actor pool via the actor compute strategy for :meth:`ds.map_batches() `, allowing you to perform CPU- or GPU-based batch inference on this actor pool. Using this instead of the `Ray actor pool `__ has a few advantages: -* Ray Datasets actor pool is autoscaling and supports easy-to-configure task dependency +* Ray Data actor pool is autoscaling and supports easy-to-configure task dependency prefetching, pipelining data transfer with compute. -* Ray Datasets takes care of orchestrating the tasks, batching the data, and managing +* Ray Data takes care of orchestrating the tasks, batching the data, and managing the memory. -* With Ray Datasets pipelining, you can - precisely configure pipelining of preprocessing with batch inference, allowing you to - easily tweak parallelism vs. pipelining to maximize your GPU utilization. -* Ray Datasets provides a broad and performant I/O layer, which you would otherwise have +* Ray Data provides a broad and performant I/O layer, which you would otherwise have to roll yourself. -How fast is Ray Datasets? +How fast is Ray Data? ========================= We're still working on open benchmarks, but we've done some benchmarking on synthetic data and have helped several users port from solutions using Petastorm, Torch multi-processing data loader, and TensorFlow datasets that have seen a big training throughput improvement (4-8x) and model accuracy improvement (due to global per-epoch -shuffling) using Ray Datasets. +shuffling) using Ray Data. -Please see our -`recent blog post on Ray Datasets `__ +Please see this +`blog post on Ray Data `__ for more information on this benchmarking. +The new streaming backend for Ray Data (Dataset) supports throughputs of up to +hundreds of gigabytes per second in a large cluster. 
+ Does all of my data need to fit into memory? ============================================ @@ -204,48 +204,22 @@ need to be able to fit your data into memory OR disk. However, keeping your data distributed memory may speed up your workload, which can be done on arbitrarily large datasets by windowing them, creating pipelines. -How much data can Ray Datasets handle? -====================================== +How much data can Ray Data handle? +================================== -Ray Datasets has been tested at multi-petabyte scale for I/O and multi-terabyte scale for +Ray Data has been tested at multi-petabyte scale for I/O and multi-terabyte scale for shuffling, and we're continuously working on improving this scalability. If you have a very large dataset that you'd like to process and you're running into scalability issues, please reach out to us on our `Discourse `__. -How do I get my data into Ray Datasets? -======================================= +How do I get my data into Ray Data? +=================================== -Ray Datasets supports creating a ``Dataset`` from local and distributed in-memory data +Ray Data supports creating a ``Dataset`` from local and distributed in-memory data via integrations with common data libraries, as well as from local and remote storage systems via our support for many common file formats and storage backends. -Check out our :ref:`feature guide for creating datasets ` for -details. - -How do I do streaming/online data loading and processing? -========================================================= - -Streaming data loading and data processing can be accomplished by using -dataset pipelines. By windowing a dataset, you can -stream data transformations across subsets of the data, even windowing down to the -reading of each file. - -When should I use pipelining? -============================= - -Pipelining is useful in a few scenarios: - -* You have two chained operations using different resources (e.g. 
CPU and GPU) that you - want to saturate; this is the case for both ML ingest (CPU-based preprocessing and - GPU-based training) and batch inference (CPU-based preprocessing and GPU-based batch - inference). -* You want to do streaming data loading and processing in order to keep the size of the - working set small; see previous FAQ on - :ref:`how to do streaming data loading and processing `. -* You want to decrease the time-to-first-batch (latency) for a certain operation at the - end of your workload. This is the case for training and inference since this prevents - GPUs from being idle (which is costly), and can be advantageous for some other - latency-sensitive consumers of datasets. +For more details, read :ref:`Loading Data `. When should I use global per-epoch shuffling? ============================================= @@ -263,7 +237,7 @@ learned weights in the wrong direction, shuffling before the next epoch lets you out of such a gradient rut. In the distributed data-parallel training case, the current status quo solution is typically to have a per-shard in-memory shuffle buffer that you fill up and pop random batches from, without mixing data across shards between epochs. -Ray Datasets also offers fully global random shuffling via +Ray Data also offers fully global random shuffling via :meth:`ds.random_shuffle() `, and doing so on an epoch-repeated dataset pipeline to provide global per-epoch shuffling is as simple as ``ray.data.read().repeat().random_shuffle_each_window()``. But when should you opt for @@ -296,28 +270,77 @@ loading + shuffling throughput is higher than your training throughput, your GPU be saturated, so we like to recommend users with shuffle-sensitive models to push their shuffle quality higher until this threshold is hit. -What is Arrow and how does Ray Datasets use it? +What is Arrow and how does Ray Data use it? 
=============================================== `Apache Arrow `__ is a columnar memory format and a -single-node data processing and I/O library that Ray Datasets leverages extensively. You -can think of Ray Datasets as orchestrating distributed processing of Arrow data. +single-node data processing and I/O library that Ray Data leverages extensively. You +can think of Ray Data as orchestrating distributed processing of Arrow data. -See our :ref:`key concepts ` for more information on how Ray Datasets +See our :ref:`key concepts ` for more information on how Ray Data uses Arrow. -How much performance tuning does Ray Datasets require? +How much performance tuning does Ray Data require? ====================================================== -Ray Datasets doesn't perform query optimization, so some manual performance +Ray Data doesn't perform query optimization, so some manual performance tuning may be necessary depending on your use case and data scale. Please see our :ref:`performance tuning guide ` for more information. -How can I contribute to Ray Datasets? +Migrating to strict mode +======================== + +In Ray 2.5, Ray Data by default always requires data schemas, dropping support for +standalone Python objects. In addition to unification and simplicity benefits, this +aligns the Ray Data API closer to industry-standard distributed data APIs like Apache +Spark and also emerging standards for machine learning datasets like HuggingFace. + +You can disable strict mode temporarily by setting the environment variable +``RAY_DATA_STRICT_MODE=0`` on all cluster processes. Strict mode will not be +possible to disable in future releases. + +Migrating existing code is straightforward. There are two common changes you may need +to make to your code to be compatible: + +1. Pass the ``batch_format="pandas"`` argument to ``map_batches`` or ``iter_batches``, + if your code assumes pandas is the default batch format. +2. 
Instead of returning standalone objects or numpy arrays from ``map`` or ``map_batches``, + return a dictionary that names the field. E.g., change function code from ``return object()`` to + ``return {"my_obj": object()}``, and ``return [1, 2, 3]`` to ``return {"my_values": [1, 2, 3]}``. + +List of strict mode changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In more detail, support for standalone Python objects is dropped. This means that +instead of directly storing, e.g., a Python ``Tuple[str, int]`` instance in Ray Data, +you must either give each field a name (i.e., ``{foo: str, bar: int}``), or +use a named object-type field (i.e., ``{foo: object}``). In addition, the ``default`` +batch format is replaced with ``numpy`` by default. This means that most users +just need to be aware of ``Dict[str, Any]`` (non-batched data records) and +``Dict[str, np.ndarray]`` (batched data) types when working with Ray Data. + +**Full list of changes**: + +* All read APIs return structured data, never standalone Python objects. +* Standalone Python objects are prohibited from being returned from map / map batches. +* Standalone Numpy arrays are prohibited from being returned from map / map batches. +* There is no more special interpretation of single-column schema containing just ``__value__`` as a column. +* The default batch format is ``numpy`` instead of ``default`` (pandas). +* ``schema()`` returns a unified Schema class instead of ``Union[pyarrow.lib.Schema, type]``. +* When lists of array-like objects are returned from map batches, they will be converted into a contiguous numpy array, rather than treated as a list of objects. + +**Datasource behavior changes**: + +* ``range_tensor``: create ``data`` column instead of ``__value__``. +* ``from_numpy`` / ``from_numpy_refs``: create ``data`` column instead of using ``__value__``. +* ``from_items``: create ``item`` column instead of using Python objects. +* ``range``: create ``id`` column instead of using Python objects. 
+ +How can I contribute to Ray Data? ===================================== We're always happy to accept external contributions! If you have a question, a feature -request, or want to contibute to Ray Datasets or tell us about your use case, please +request, or want to contribute to Ray Data or tell us about your use case, please reach out to us on `Discourse `__; if you're confident that you've found a bug, please open an issue on the `Ray GitHub repo `__. Please see our diff --git a/doc/source/data/getting-started.rst b/doc/source/data/getting-started.rst index 028ae66cf160..f1db53b5e81d 100644 --- a/doc/source/data/getting-started.rst +++ b/doc/source/data/getting-started.rst @@ -1,11 +1,12 @@ -.. _datasets_getting_started: +.. _data_getting_started: Getting Started =============== -A Ray :class:`Dataset ` is a distributed data collection. It holds -references to distributed data *blocks*, and exposes APIs for loading and processing -data. +Ray Data's main abstraction is a :class:`Dataset `, which +is a distributed data transformation pipeline. Dataset provides APIs for loading +external data into Ray in *blocks*, and it exposes APIs for streaming +processing of these data blocks in the cluster. Install Ray Data ---------------- @@ -20,19 +21,18 @@ To learn more about installing Ray and its libraries, read :ref:`Installing Ray `. Create a dataset ----------------- +------------------- Create datasets from on-disk files, Python objects, and cloud storage services like S3. -Ray reads from any `filesystem supported by Arrow +Ray Data can read from any `filesystem supported by Arrow `__. .. testcode:: import ray - dataset = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") - - dataset.show(limit=1) + ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") + ds.show(limit=1) .. testoutput:: @@ -40,113 +40,116 @@ Ray reads from any `filesystem supported by Arrow To learn more about creating datasets, read -:ref:`Creating datasets `. 
+:ref:`Loading data `. Transform the dataset ---------------------- +------------------------ Apply :ref:`user-defined functions ` (UDFs) to -transform datasets. Ray executes transformations in parallel for performance at scale. +transform datasets. Ray executes transformations in parallel for performance. .. testcode:: - import pandas as pd + from typing import Dict + import numpy as np - # Find rows with sepal length < 5.5 and petal length > 3.5. - def transform_batch(df: pd.DataFrame) -> pd.DataFrame: - return df[(df["sepal length (cm)"] < 5.5) & (df["petal length (cm)"] > 3.5)] + # Compute a "petal area" attribute. + def transform_batch(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + vec_a = batch["petal length (cm)"] + vec_b = batch["petal width (cm)"] + batch["petal area (cm^2)"] = vec_a * vec_b + return batch - transformed_dataset = dataset.map_batches(transform_batch) - print(transformed_dataset) + transformed_ds = ds.map_batches(transform_batch) + print(transformed_ds.materialize()) .. testoutput:: - MapBatches(transform_batch) - +- Datastream( - num_blocks=1, - num_rows=150, - schema={ - sepal length (cm): double, - sepal width (cm): double, - petal length (cm): double, - petal width (cm): double, - target: int64 - } - ) - + MaterializedDataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64, + petal area (cm^2): double + } + ) To learn more about transforming datasets, read -:ref:`Transforming datasets `. +:ref:`Transforming data `. Consume the dataset -------------------- +---------------------- Pass datasets to Ray tasks or actors, and access records with methods like -:meth:`~ray.data.Dataset.iter_batches`. +:meth:`~ray.data.Dataset.take_batch` and :meth:`~ray.data.Dataset.iter_batches`. -.. tabbed:: Local +.. tab-set:: - .. testcode:: + .. 
tab-item:: Local - batches = transformed_dataset.iter_batches(batch_size=8) - print(next(iter(batches))) + .. testcode:: - .. testoutput:: - :options: +NORMALIZE_WHITESPACE + print(transformed_ds.take_batch(batch_size=3)) - sepal length (cm) ... target - 0 5.2 ... 1 - 1 5.4 ... 1 - 2 4.9 ... 2 + .. testoutput:: + :options: +NORMALIZE_WHITESPACE - [3 rows x 5 columns] + {'sepal length (cm)': array([5.1, 4.9, 4.7]), + 'sepal width (cm)': array([3.5, 3. , 3.2]), + 'petal length (cm)': array([1.4, 1.4, 1.3]), + 'petal width (cm)': array([0.2, 0.2, 0.2]), + 'target': array([0, 0, 0]), + 'petal area (cm^2)': array([0.28, 0.28, 0.26])} -.. tabbed:: Tasks + .. tab-item:: Tasks - .. testcode:: + .. testcode:: - @ray.remote - def consume(dataset: ray.data.Dataset) -> int: - num_batches = 0 - for batch in dataset.iter_batches(batch_size=8): - num_batches += 1 - return num_batches + @ray.remote + def consume(ds: ray.data.Dataset) -> int: + num_batches = 0 + for batch in ds.iter_batches(batch_size=8): + num_batches += 1 + return num_batches - ray.get(consume.remote(transformed_dataset)) + ray.get(consume.remote(transformed_ds)) -.. tabbed:: Actors + .. tab-item:: Actors - .. testcode:: + .. testcode:: - @ray.remote - class Worker: + @ray.remote + class Worker: - def train(self, shard) -> int: - for batch in shard.iter_batches(batch_size=8): - pass - return shard.count() + def train(self, data_iterator): + for batch in data_iterator.iter_batches(batch_size=8): + pass - workers = [Worker.remote() for _ in range(4)] - shards = transformed_dataset.split(n=4, locality_hints=workers) - ray.get([w.train.remote(s) for w, s in zip(workers, shards)]) + workers = [Worker.remote() for _ in range(4)] + shards = transformed_ds.streaming_split(n=4, equal=True) + ray.get([w.train.remote(s) for w, s in zip(workers, shards)]) To learn more about consuming datasets, read -:ref:`Consuming datasets `. +:ref:`Consuming data `. 
Save the dataset ----------------- +------------------- -Call methods like :meth:`~ray.data.Dataset.write_parquet` to save datasets to local +Call methods like :meth:`~ray.data.Dataset.write_parquet` to save dataset contents to local or remote filesystems. .. testcode:: import os - transformed_dataset.write_parquet("iris") + transformed_ds.write_parquet("/tmp/iris") - print(os.listdir("iris")) + print(os.listdir("/tmp/iris")) .. testoutput:: :options: +ELLIPSIS @@ -154,9 +157,4 @@ or remote filesystems. ['..._000000.parquet'] -To learn more about saving datasets, read :ref:`Saving datasets `. - -Next Steps ----------- - -* To check how your application is doing, you can use the :ref:`Ray dashboard`. \ No newline at end of file +To learn more about saving dataset contents, read :ref:`Saving data `. diff --git a/doc/source/data/glossary.rst b/doc/source/data/glossary.rst index 31caf719191f..084398e1566d 100644 --- a/doc/source/data/glossary.rst +++ b/doc/source/data/glossary.rst @@ -1,7 +1,7 @@ .. _datasets_glossary: ===================== -Ray Datasets Glossary +Ray Data Glossary ===================== .. glossary:: @@ -20,26 +20,26 @@ Ray Datasets Glossary >>> # Dataset is executed by streaming executor by default, which doesn't >>> # preserve the order, so we explicitly set it here. >>> ray.data.context.DataContext.get_current().execution_options.preserve_order = True - >>> dataset = ray.data.range_table(10) + >>> dataset = ray.data.range(10) >>> next(iter(dataset.iter_batches(batch_format="numpy", batch_size=5))) - {'value': array([0, 1, 2, 3, 4])} + {'id': array([0, 1, 2, 3, 4])} >>> next(iter(dataset.iter_batches(batch_format="pandas", batch_size=5))) - value - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 + id + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 To learn more about batch formats, read - :ref:`UDF Input Batch Formats `. + :ref:`Configuring batch formats `. Block A processing unit of data. A :class:`~ray.data.Dataset` consists of a collection of blocks. 
- Under the hood, :term:`Datasets ` partition :term:`records ` - into a set of distributed data blocks. This allows Datasets to perform operations + Under the hood, :term:`Ray Data ` partition :term:`records ` + into a set of distributed data blocks. This allows it to perform operations in parallel. Unlike a batch, which is a user-facing object, a block is an internal abstraction. @@ -47,23 +47,21 @@ Ray Datasets Glossary Block format The way :term:`blocks ` are represented. - Blocks are represented as - `Arrow tables `_, - `pandas DataFrames `_, - and Python lists. To determine the block format, call - :meth:`Dataset.dataset_format() `. + Blocks are internally represented as + `Arrow tables `_ or + `pandas DataFrames `_. - Datasets (library) + Ray Data (library) A library for distributed data processing. - Datasets isn’t intended as a replacement for more general data processing systems. + Ray Data isn’t intended as a replacement for more general data processing systems. Its utility is as the last-mile bridge from ETL pipeline outputs to distributed ML applications and libraries in Ray. - To learn more about Ray Datasets, read :ref:`Key Concepts `. + To learn more about Ray Data, read :ref:`Key Concepts `. Dataset (object) - A class that represents a distributed collection of data. + A class that produces a sequence of distributed data blocks. :class:`~ray.data.Dataset` exposes methods to read, transform, and consume data at scale. @@ -82,73 +80,10 @@ Ray Datasets Glossary To learn more about Datasources, read :ref:`Creating a Custom Datasource `. Record - A single data item. - - If your dataset is :term:`tabular `, then records are :class:`TableRows `. - If your dataset is :term:`simple `, then records are arbitrary Python objects. - If your dataset is :term:`tensor `, then records are `NumPy ndarrays `_. + A single data item, which is always a ``Dict[str, Any]``. Schema - The data type of a dataset. 
- - If your dataset is :term:`tabular `, then the schema describes - the column names and data types. If your dataset is :term:`simple `, - then the schema describes the Python object type. If your dataset is - :term:`tensor `, then the schema describes the per-element - tensor shape and data type. + The name and type of the dataset fields. To determine a dataset's schema, call :meth:`Dataset.schema() `. - - Simple Dataset - A Dataset that represents a collection of arbitrary Python objects. - - .. doctest:: - - >>> import ray - >>> ray.data.from_items(["spam", "ham", "eggs"]) - MaterializedDatastream(num_blocks=3, num_rows=3, schema=) - - Tensor Dataset - A Dataset that represents a collection of ndarrays. - - :term:`Tabular datasets ` that contain tensor columns aren’t tensor datasets. - - .. doctest:: - - >>> import numpy as np - >>> import ray - >>> ray.data.from_numpy(np.zeros((100, 32, 32, 3))) - MaterializedDatastream( - num_blocks=1, - num_rows=100, - schema={__value__: numpy.ndarray(shape=(32, 32, 3), dtype=double)} - ) - - Tabular Dataset - A Dataset that represents columnar data. - - .. doctest:: - - >>> import ray - >>> ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") - Datastream( - num_blocks=1, - num_rows=150, - schema={ - sepal length (cm): double, - sepal width (cm): double, - petal length (cm): double, - petal width (cm): double, - target: int64 - } - ) - - User-defined function (UDF) - A callable that transforms batches or :term:`records ` of data. UDFs let you arbitrarily transform datasets. - - Call :meth:`Dataset.map_batches() `, - :meth:`Dataset.map() `, or - :meth:`Dataset.flat_map() ` to apply UDFs. - - To learn more about UDFs, read :ref:`Writing User-Defined Functions `. 
diff --git a/doc/source/data/images/actor_batch_prediction.png b/doc/source/data/images/actor_batch_prediction.png new file mode 100644 index 000000000000..5922dde5893b Binary files /dev/null and b/doc/source/data/images/actor_batch_prediction.png differ diff --git a/doc/source/data/images/actor_pool_batch_prediction.png b/doc/source/data/images/actor_pool_batch_prediction.png new file mode 100644 index 000000000000..bc8999aad58d Binary files /dev/null and b/doc/source/data/images/actor_pool_batch_prediction.png differ diff --git a/doc/source/data/images/air_batch_prediction.png b/doc/source/data/images/air_batch_prediction.png new file mode 100644 index 000000000000..7741431af463 Binary files /dev/null and b/doc/source/data/images/air_batch_prediction.png differ diff --git a/doc/source/data/images/batch_inference.png b/doc/source/data/images/batch_inference.png new file mode 100644 index 000000000000..aea38fb43b9d Binary files /dev/null and b/doc/source/data/images/batch_inference.png differ diff --git a/doc/source/data/images/batch_inference_overview.png b/doc/source/data/images/batch_inference_overview.png new file mode 100644 index 000000000000..5dd8536700f8 Binary files /dev/null and b/doc/source/data/images/batch_inference_overview.png differ diff --git a/doc/source/data/images/dataset-arch.svg b/doc/source/data/images/dataset-arch.svg index a56515610e8e..9f3bbea5596b 100644 --- a/doc/source/data/images/dataset-arch.svg +++ b/doc/source/data/images/dataset-arch.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/data/images/dataset-compute-1.png b/doc/source/data/images/dataset-compute-1.png deleted file mode 100644 index 2f5629b5db2f..000000000000 Binary files a/doc/source/data/images/dataset-compute-1.png and /dev/null differ diff --git a/doc/source/data/images/dataset-loading-1.png b/doc/source/data/images/dataset-loading-1.png index 9cfcd8c6bd2d..9673789bc3fe 100644 Binary files 
a/doc/source/data/images/dataset-loading-1.png and b/doc/source/data/images/dataset-loading-1.png differ diff --git a/doc/source/data/images/stream-example.png b/doc/source/data/images/stream-example.png new file mode 100644 index 000000000000..c0ab99e0b5c3 Binary files /dev/null and b/doc/source/data/images/stream-example.png differ diff --git a/doc/source/data/images/task_batch_prediction.png b/doc/source/data/images/task_batch_prediction.png new file mode 100644 index 000000000000..72328062a938 Binary files /dev/null and b/doc/source/data/images/task_batch_prediction.png differ diff --git a/doc/source/data/images/train_predict_pipeline.png b/doc/source/data/images/train_predict_pipeline.png new file mode 100644 index 000000000000..890b7346b7bf Binary files /dev/null and b/doc/source/data/images/train_predict_pipeline.png differ diff --git a/doc/source/data/integrations.rst b/doc/source/data/integrations.rst index 636966d496b5..55b6161896a6 100644 --- a/doc/source/data/integrations.rst +++ b/doc/source/data/integrations.rst @@ -4,7 +4,7 @@ Integrations ============ -If you’re new to Ray Datasets, we recommend starting with the :ref:`Ray Datasets Quick Start `. +If you’re new to Ray Data, we recommend starting with the :ref:`Ray Data Quick Start `. This is a guide on how to run Dask, Spark, Mars or Modin on Ray. diff --git a/doc/source/data/key-concepts.rst b/doc/source/data/key-concepts.rst index da3a10b2e88a..e430842e598f 100644 --- a/doc/source/data/key-concepts.rst +++ b/doc/source/data/key-concepts.rst @@ -6,38 +6,31 @@ Key Concepts .. _dataset_concept: --------- -Datasets --------- +---------- +Dataset +---------- -A :term:`Dataset ` contains a list of Ray object references to :term:`blocks `. -Each block holds a set of items in an `Arrow table `_, -`pandas DataFrame `_, or Python list. +A :term:`Dataset ` operates over a sequence of Ray object references to :term:`blocks `. +Each block holds a set of records in an `Arrow table `_ or +`pandas DataFrame `_. 
Having multiple blocks in a dataset allows for parallel transformation and ingest. -For ML use cases, Datasets also natively supports mixing :ref:`Tensors ` and tabular data. +For ML use cases, Dataset natively supports mixing tensors with tabular data. To +learn more, read :ref:`Working with tensor data `. -There are three types of datasets: - -* :term:`Simple datasets ` -- Datasets that represent a collection of Python objects -* :term:`Tabular datasets ` -- Datasets that represent columnar data -* :term:`Tensor datasets ` -- Datasets that represent a collection of ndarrays - -The following figure visualizes a tabular dataset with three blocks, each holding 1000 rows: +The following figure visualizes a dataset with three blocks, each holding 1000 rows. Note that certain blocks +may not be computed yet. Normally, callers iterate over dataset blocks in a streaming fashion, so that not all +blocks need to be materialized in the cluster memory at once. .. image:: images/dataset-arch.svg .. https://docs.google.com/drawings/d/1PmbDvHRfVthme9XD7EYM-LIHPXtHdOfjCbc1SCsM64k/edit -Since a Dataset is just a list of Ray object references, it can be freely passed between Ray tasks, -actors, and libraries like any other object reference. -This flexibility is a unique characteristic of Ray Datasets. - Reading Data ============ -Datasets uses Ray tasks to read data from remote storage in parallel. Each read task reads one or more files and produces an output block: +Dataset uses Ray tasks to read data from remote storage in parallel. Each read task reads one or more files and produces an output block: .. image:: images/dataset-read.svg :align: center @@ -47,12 +40,12 @@ Datasets uses Ray tasks to read data from remote storage in parallel. Each read You can manually specify the number of read tasks, but the final parallelism is always capped by the number of files in the underlying dataset. -For an in-depth guide on creating datasets, read :ref:`Creating Datasets `. 
+For an in-depth guide on creating datasets, read :ref:`Loading Data `. Transforming Data ================= -Datasets uses either Ray tasks or Ray actors to transform data blocks. By default, Datasets uses tasks. +Dataset uses either Ray tasks or Ray actors to transform data blocks. By default, it uses tasks. To use Actors, pass an :class:`ActorPoolStrategy` to ``compute`` in methods like :meth:`~ray.data.Dataset.map_batches`. :class:`ActorPoolStrategy` creates an autoscaling @@ -64,13 +57,13 @@ pool of Ray actors. This allows you to cache expensive state initialization .. https://docs.google.com/drawings/d/12STHGV0meGWfdWyBlJMUgw7a-JcFPu9BwSOn5BjRw9k/edit -For an in-depth guide on transforming datasets, read :ref:`Transforming Datasets `. +For an in-depth guide on transforming datasets, read :ref:`Transforming Data `. Shuffling Data ============== Operations like :meth:`~ray.data.Dataset.sort` and :meth:`~ray.data.Dataset.groupby` -require blocks to be partitioned by value or *shuffled*. Datasets uses tasks to shuffle blocks in a map-reduce +require blocks to be partitioned by value or *shuffled*. Dataset uses tasks to shuffle blocks in a map-reduce style: map tasks partition blocks by value and then reduce tasks merge co-partitioned blocks. @@ -86,26 +79,29 @@ Repartition has two modes: .. https://docs.google.com/drawings/d/132jhE3KXZsf29ho1yUdPrCHB9uheHBWHJhDQMXqIVPA/edit -Datasets can shuffle hundreds of terabytes of data. For an in-depth guide on shuffle performance, read :ref:`Performance Tips and Tuning `. +Dataset can shuffle multi-terabyte datasets, leveraging the Ray object store for disk spilling. For an in-depth guide on shuffle performance, read :ref:`Performance Tips and Tuning `. +Note that operations like shuffle materialize the entire Dataset prior to their execution (shuffle execution is not streamed through memory). -Execution mode -============== +Iteration and materialization +============================= -Most transformations are lazy. 
They don't execute until you consume a dataset or call -:meth:`Dataset.materialize() `. +Most transformations on a dataset are lazy. They don't execute until you iterate over the dataset or call +:meth:`Dataset.materialize() `. When a Dataset is materialized, its +type becomes a `MaterializedDataset`, which indicates that all its blocks are materialized in Ray +object store memory. -The transformations are executed in a streaming way, incrementally on the data and -with operators processed in parallel, see :ref:`Streaming Execution `. +Dataset transformations are executed in a streaming way, incrementally on the data and +with operators processed in parallel, see :ref:`Streaming Execution `. -For an in-depth guide on Datasets execution, read :ref:`Execution `. +Datasets and MaterializedDatasets can be freely passed between Ray tasks, actors, and libraries without +incurring copies of the underlying block data (pass by reference semantics). Fault tolerance =============== -Datasets performs *lineage reconstruction* to recover data. If an application error or -system failure occurs, Datasets recreates lost blocks by re-executing tasks. - -Fault tolerance isn't supported in two cases: +Dataset performs *lineage reconstruction* to recover data. If an application error or +system failure occurs, Dataset recreates lost blocks by re-executing tasks. If ``compute=ActorPoolStrategy(size=n)`` is used, then Ray +restarts the actor used for computing the block prior to re-executing the task. -* If the original worker process that created the Dataset dies. This is because the creator stores the metadata for the :ref:`objects ` that comprise the Dataset. -* If you specify ``compute=ActorPoolStrategy(size=n)`` for transformations. This is because Datasets relies on :ref:`task-based fault tolerance `. +Fault tolerance is not supported if the original worker process that created the Dataset dies. 
+This is because the creator stores the metadata for the :ref:`objects ` that comprise the Dataset. diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst new file mode 100644 index 000000000000..0c6684f6a3e6 --- /dev/null +++ b/doc/source/data/loading-data.rst @@ -0,0 +1,820 @@ +.. _loading_data: + +==================== +Loading Data +==================== + +:class:`Datasets ` can be created from: + +* generated synthetic data, +* local and distributed in-memory data, and +* local and external storage systems (local disk, cloud storage, HDFS, etc.). + +.. _dataset_generate_data: + +------------------------- +Generating Synthetic Data +------------------------- + +.. tab-set:: + + .. tab-item:: Int Range + + Create a ``Dataset`` from a range of integers, with a single column containing this integer range. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __gen_synth_tabular_range_begin__ + :end-before: __gen_synth_tabular_range_end__ + + .. tab-item:: Tensor Range + + Create a dataset from a range of integers, packing this integer range into + ndarrays of the provided shape. + + .. doctest:: + + >>> import ray + >>> ds = ray.data.range_tensor(100 * 64 * 64, shape=(64, 64)) + >>> ds.schema() + Column Type + ------ ---- + data numpy.ndarray(shape=(64, 64), dtype=int64) + >>> ds.show(1) + {'data': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]])} + +.. _dataset_reading_from_storage: + +-------------------------- +Reading Files From Storage +-------------------------- + +Using the ``ray.data.read_*()`` APIs, data can be loaded from files on local disk +or remote storage system such as S3, GCS, Azure Blob Storage, or HDFS. 
Any filesystem +`supported by pyarrow `__ +can be used to specify file locations, and many common file formats are supported: +Parquet, CSV, JSON, NPY, text, binary. + +Each of these APIs take a path or list of paths to files or directories. Any directories +provided will be walked in order to obtain concrete file paths, at which point all files +will be read in parallel. + +.. _dataset_supported_file_formats: + +Common File Formats +=================== + +.. tab-set:: + + .. tab-item:: Parquet + + Read Parquet files and directories. Partitioned parquet read support is also available. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_parquet_begin__ + :end-before: __read_parquet_end__ + + The Parquet reader also supports projection and filter pushdown, allowing column + selection and row filtering to be pushed down to the file scan. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_parquet_pushdown_begin__ + :end-before: __read_parquet_pushdown_end__ + + See the API docs for :func:`read_parquet() `. + + .. tab-item:: CSV + + Read CSV files and directories. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_csv_begin__ + :end-before: __read_csv_end__ + + See the API docs for :func:`read_csv() `. + + .. tab-item:: JSON + + Read JSON files and directories. + + Currently, only newline-delimited JSON (NDJSON) is supported. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_json_begin__ + :end-before: __read_json_end__ + + See the API docs for :func:`read_json() `. + + .. tab-item:: NumPy + + Read NumPy files and directories. + + This function represents NumPy data as ndarrays. To learn more, read + :ref:`Working with tensor data `. + + .. 
literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_numpy_begin__ + :end-before: __read_numpy_end__ + + See the API docs for :func:`read_numpy() `. + + .. tab-item:: Text + + Read text files and directories. Each line in each text file will be treated as a row in the dataset. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_text_begin__ + :end-before: __read_text_end__ + + See the API docs for :func:`read_text() `. + + .. tab-item:: Images + + Call :func:`~ray.data.read_images` to read images. + + This function represents images as ndarrays. To learn more, read + :ref:`Working with tensor data `. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_images_begin__ + :end-before: __read_images_end__ + + .. tab-item:: Binary + + Read binary files and directories. Each binary file will be converted to a record + containing opaque bytes. These bytes can be decoded into tensor, tabular, text, or any other + kind of data using :meth:`~ray.data.Dataset.map_batches` to apply a per-row decoding + :ref:`user-defined function `. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_binary_begin__ + :end-before: __read_binary_end__ + + See the API docs for :func:`read_binary_files() `. + + .. tab-item:: TFRecords + + Call :func:`~ray.data.read_tfrecords` to read TFRecord files into a + :class:`~ray.data.Dataset`. + + .. warning:: + Only `tf.train.Example `_ + records are supported. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_tfrecords_begin__ + :end-before: __read_tfrecords_end__ + +.. _dataset_reading_remote_storage: + + +Reading from Remote Storage +=========================== + +All of the file formats mentioned above can be read from remote storage, such as S3, +GCS, Azure Blob Storage, and HDFS. 
These storage systems are supported via Arrow's +filesystem APIs natively for S3 and HDFS, and as a wrapper around fsspec for GCS and +HDFS. All ``ray.data.read_*()`` APIs expose a ``filesystem`` argument that accepts both +`Arrow FileSystem `__ instances +and `fsspec FileSystem `__ instances, +allowing you to configure this connection to the remote storage system, such as +authn/authz and buffer/block size. + +For S3 and HDFS, the underlying `FileSystem +`__ +implementation will be inferred from the URL scheme (``"s3://"`` and ``"hdfs://"``); if +the default connection configuration suffices for your workload, you won't need to +specify a ``filesystem`` argument. + +We use Parquet files for the below examples, but all of the aforementioned file formats +are supported for each of these storage systems. + +.. tab-set:: + + .. tab-item:: S3 + + The AWS S3 storage system is inferred from the URI scheme (``s3://``), with required connection + configuration such as S3 credentials being pulled from the machine's environment + (e.g. the ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` environment variables). + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_parquet_s3_begin__ + :end-before: __read_parquet_s3_end__ + + If needing to customize this S3 storage system connection (credentials, region, + endpoint override, etc.), you can pass in an + `S3FileSystem `__ instance + to :func:`read_parquet() `. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __read_parquet_s3_with_fs_begin__ + :end-before: __read_parquet_s3_with_fs_end__ + + .. tab-item:: HDFS + + The HDFS storage system is inferred from the URI scheme (``hdfs://``), with required connection + configuration such as the host and the port being derived from the URI. + + .. note:: + + This example is not runnable as-is; you'll need to point it at your HDFS + cluster/data. + + .. 
literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __read_parquet_hdfs_begin__ + :end-before: __read_parquet_hdfs_end__ + + If needing to customize this HDFS storage system connection (host, port, user, kerb + ticket, etc.), you can pass in an `HDFSFileSystem + `__ + instance to :func:`read_parquet() `. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __read_parquet_hdfs_with_fs_begin__ + :end-before: __read_parquet_hdfs_with_fs_end__ + + .. tab-item:: GCS + + Data can be read from Google Cloud Storage by providing a configured + `gcsfs GCSFileSystem `__, where the + appropriate Google Cloud project and credentials can be specified. + + .. note:: + This example is not runnable as-is; you'll need to point it at your GCS bucket and + configure your GCP project and credentials. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __read_parquet_gcs_begin__ + :end-before: __read_parquet_gcs_end__ + + .. tip:: + To verify that your GCP project and credentials are set up, validate + that the GCS `filesystem` has permissions to read the input `path`. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __validate_parquet_gcs_begin__ + :end-before: __validate_parquet_gcs_end__ + + For more examples, see the `GCSFS Documentation `__. + + .. tab-item:: ADL/ABS (Azure) + + Data can be read from Azure Blob Storage by providing a configured + `adlfs AzureBlobFileSystem `__, where the appropriate + account name and account key can be specified. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __read_parquet_az_begin__ + :end-before: __read_parquet_az_end__ + +Reading from Local Storage +========================== + +In Ray Data, users often read from remote storage systems as described above. In +some use cases, users may want to read from local storage. 
There are three ways to read +from a local filesystem: + +* **Providing a raw filesystem path**: For example, in ``ray.data.read_csv("my_file.csv")``, + the given path will be resolved as a local filesystem path. If the file exists only on the + local node and you run this read operation in distributed cluster, this will fail as it + cannot access the file from remote nodes. +* **Using ``local://`` custom URI scheme**: Similarly, this will be resolved to local + filesystem, e.g. ``ray.data.read_csv("local://my_file.csv")`` will read the + same file as the approach above. The difference is that this scheme will ensure + all read tasks happen on the local node, so it's safe to run in a distributed + cluster. +* **Using ``example://`` custom URI scheme**: The paths with this scheme will be resolved + to ``ray/data/examples/data`` directory in the Ray package. This scheme is used + only for testing or demoing examples. + +Reading Compressed Files +======================== + +Ray Data supports reading compressed files using the ``arrow_open_stream_args`` arg. +`Codecs supported by Arrow `__ +(bz2, brotli, gzip, lz4 or zstd) are compatible with Ray Data. +For example: + +.. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __read_compressed_begin__ + :end-before: __read_compressed_end__ + +.. _dataset_from_in_memory_data: + +------------------- +From In-Memory Data +------------------- + +Datasets can be constructed from existing in-memory data. In addition to being able to +construct a ``Dataset`` from plain Python objects, Datasets also interoperates with popular +single-node libraries (`Pandas `__, +`NumPy `__, `Arrow `__) as well as +distributed frameworks (:ref:`Dask `, :ref:`Spark `, +:ref:`Modin `, :ref:`Mars `). + +.. _dataset_from_in_memory_data_single_node: + +From Single-Node Data Libraries +=============================== + +In this section, we demonstrate creating a ``Dataset`` from single-node in-memory data. + +.. 
tab-set:: + + .. tab-item:: Pandas + + Create a ``Dataset`` from a Pandas DataFrame. This constructs a ``Dataset`` + backed by a single block. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_pandas_begin__ + :end-before: __from_pandas_end__ + + We can also build a ``Dataset`` from more than one Pandas DataFrame, where each said + DataFrame will become a block in the ``Dataset``. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_pandas_mult_begin__ + :end-before: __from_pandas_mult_end__ + + .. tab-item:: NumPy + + Create a ``Dataset`` from a NumPy ndarray. This constructs a ``Dataset`` + backed by a single block; the outer dimension of the ndarray + will be treated as the row dimension, and the column will have name ``"data"``. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_numpy_begin__ + :end-before: __from_numpy_end__ + + We can also build a ``Dataset`` from more than one NumPy ndarray, where each said + ndarray will become a block in the ``Dataset``. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_numpy_mult_begin__ + :end-before: __from_numpy_mult_end__ + + .. tab-item:: Arrow + + Create a ``Dataset`` from an + `Arrow Table `__. + This constructs a ``Dataset`` backed by a single block. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_arrow_begin__ + :end-before: __from_arrow_end__ + + We can also build a ``Dataset`` from more than one Arrow Table, where each said + ``Table`` will become a block in the ``Dataset``. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_arrow_mult_begin__ + :end-before: __from_arrow_mult_end__ + + .. tab-item:: Python Objects + + Create a ``Dataset`` from a list of Python objects; which are interpreted as dict records. 
+ If the object is not a dict, it will be wrapped as ``{"item": item}``. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_items_begin__ + :end-before: __from_items_end__ + +.. _dataset_from_in_memory_data_distributed: + +From Distributed Data Processing Frameworks +=========================================== + +In addition to working with single-node in-memory data, Datasets can be constructed from +distributed (multi-node) in-memory data, interoperating with popular distributed +data processing frameworks such as :ref:`Dask `, :ref:`Spark `, +:ref:`Modin `, and :ref:`Mars `. + +Note that these data processing frameworks must be running on Ray in order for these +integrations to work. See how these frameworks can be run on Ray in our +:ref:`data processing integrations docs `. + +.. tab-set:: + + .. tab-item:: Dask + + Create a ``MaterializedDataset`` from a + `Dask DataFrame `__. This constructs a + ``Dataset`` backed by the distributed Pandas DataFrame partitions that underly the + Dask DataFrame. + + .. literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_dask_begin__ + :end-before: __from_dask_end__ + + .. tab-item:: Spark + + Create a ``MaterializedDataset`` from a `Spark DataFrame + `__. + This constructs a ``Dataset`` backed by the distributed Spark DataFrame partitions + that underly the Spark DataFrame. When this conversion happens, Spark-on-Ray (RayDP) + will save the Spark DataFrame partitions to Ray's object store in the Arrow format, + which Datasets will then interpret as its blocks. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __from_spark_begin__ + :end-before: __from_spark_end__ + + .. tab-item:: Modin + + Create a ``MaterializedDataset`` from a Modin DataFrame. This constructs a ``Dataset`` + backed by the distributed Pandas DataFrame partitions that underly the Modin DataFrame. + + .. 
literalinclude:: ./doc_code/loading_data.py + :language: python + :start-after: __from_modin_begin__ + :end-before: __from_modin_end__ + + .. tab-item:: Mars + + Create a ``MaterializedDataset`` from a Mars DataFrame. This constructs a ``Dataset`` + backed by the distributed Pandas DataFrame partitions that underly the Mars DataFrame. + + .. literalinclude:: ./doc_code/loading_data_untested.py + :language: python + :start-after: __from_mars_begin__ + :end-before: __from_mars_end__ + +.. _dataset_from_torch_tf: + +------------------------- +From Torch and TensorFlow +------------------------- + +.. tab-set:: + + .. tab-item:: PyTorch + + If you already have a Torch dataset available, you can create a Dataset using + :class:`~ray.data.from_torch`. + + .. warning:: + :class:`~ray.data.from_torch` doesn't support parallel + reads. You should only use this datasource for small datasets like MNIST or + CIFAR. + + .. code-block:: python + + import ray + import torchvision + + torch_ds = torchvision.datasets.MNIST("data", download=True) + dataset = ray.data.from_torch(torch_ds) + dataset.take(1) + # {"item": (, 5)} + + .. tab-item:: TensorFlow + + If you already have a TensorFlow dataset available, you can create a Dataset + using :class:`~ray.data.from_tf`. + + .. warning:: + :class:`~ray.data.from_tf` doesn't support parallel reads. You + should only use this function with small datasets like MNIST or CIFAR. + + .. code-block:: python + + import ray + import tensorflow_datasets as tfds + + tf_ds, _ = tfds.load("cifar10", split=["train", "test"]) + dataset = ray.data.from_tf(tf_ds) + + dataset + # -> MaterializedDataset(num_blocks=200, num_rows=50000, schema={id: binary, image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8), label: int64}) + +.. _dataset_from_huggingface: + +------------------------------- +From 🤗 (Hugging Face) Datasets +------------------------------- + +You can convert 🤗 Datasets into Ray Data by using +:py:class:`~ray.data.from_huggingface`. 
This function accesses the underlying Arrow table and +converts it into a Dataset directly. + +.. warning:: + :py:class:`~ray.data.from_huggingface` doesn't support parallel + reads. This will not usually be an issue with in-memory 🤗 Datasets, + but may fail with large memory-mapped 🤗 Datasets. 🤗 ``IterableDataset`` + objects are not supported. + +.. code-block:: python + + import ray.data + from datasets import load_dataset + + hf_ds = load_dataset("wikitext", "wikitext-2-raw-v1") + ray_ds = ray.data.from_huggingface(hf_ds) + ray_ds["train"].take(2) + # [{'text': ''}, {'text': ' = Valkyria Chronicles III = \n'}] + +.. _dataset_mongo_db: + +------------ +From MongoDB +------------ + +A Dataset can also be created from `MongoDB `__ with +:py:class:`~ray.data.read_mongo`. +This interacts with MongoDB similar to external filesystems, except here you will +need to specify the MongoDB source by its `uri `__, +`database and collection `__, +and specify a `pipeline `__ to run against +the collection. The execution results are then used to create a Dataset. + +.. note:: + + This example is not runnable as-is; you'll need to point it at your MongoDB + instance. + +.. code-block:: python + + import ray + + # Read a local MongoDB. + ds = ray.data.read_mongo( + uri="mongodb://localhost:27017", + database="my_db", + collection="my_collection", + pipeline=[{"$match": {"col": {"$gte": 0, "$lt": 10}}}, {"$sort": "sort_col"}], + ) + + # Reading a remote MongoDB is the same. + ds = ray.data.read_mongo( + uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", + database="my_db", + collection="my_collection", + pipeline=[{"$match": {"col": {"$gte": 0, "$lt": 10}}}, {"$sort": "sort_col"}], + ) + + # Write back to MongoDB. + ds.write_mongo( + MongoDatasource(), + uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", + database="my_db", + collection="my_collection", + ) + +.. 
_datasets_sql_databases: + +-------------------------- +Reading From SQL Databases +-------------------------- + +Call :func:`~ray.data.read_sql` to read data from a database that provides a +`Python DB API2-compliant `_ connector. + +.. tab-set:: + + .. tab-item:: MySQL + + To read from MySQL, install + `MySQL Connector/Python `_. It's the + first-party MySQL database connector. + + .. code-block:: console + + pip install mysql-connector-python + + Then, define your connection login and query the database. + + .. code-block:: python + + import mysql.connector + + import ray + + def create_connection(): + return mysql.connector.connect( + user="admin", + password=..., + host="example-mysql-database.c2c2k1yfll7o.us-west-2.rds.amazonaws.com", + connection_timeout=30, + database="example", + ) + + # Get all movies + dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) + # Get movies after the year 1980 + dataset = ray.data.read_sql( + "SELECT title, score FROM movie WHERE year >= 1980", create_connection + ) + # Get the number of movies per year + dataset = ray.data.read_sql( + "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection + ) + + + .. tab-item:: PostgreSQL + + To read from PostgreSQL, install `Psycopg 2 `_. It's + the most popular PostgreSQL database connector. + + .. code-block:: console + + pip install psycopg2-binary + + Then, define your connection login and query the database. + + .. 
code-block:: python + + import psycopg2 + + import ray + + def create_connection(): + return psycopg2.connect( + user="postgres", + password=..., + host="example-postgres-database.c2c2k1yfll7o.us-west-2.rds.amazonaws.com", + dbname="example", + ) + + # Get all movies + dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) + # Get movies after the year 1980 + dataset = ray.data.read_sql( + "SELECT title, score FROM movie WHERE year >= 1980", create_connection + ) + # Get the number of movies per year + dataset = ray.data.read_sql( + "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection + ) + + .. tab-item:: Snowflake + + To read from Snowflake, install the + `Snowflake Connector for Python `_. + + .. code-block:: console + + pip install snowflake-connector-python + + Then, define your connection logic and query the database. + + .. code-block:: python + + import snowflake.connector + + import ray + + def create_connection(): + return snowflake.connector.connect( + user=..., + password=..., + account="ZZKXUVH-IPB52023", + database="example", + ) + + # Get all movies + dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) + # Get movies after the year 1980 + dataset = ray.data.read_sql( + "SELECT title, score FROM movie WHERE year >= 1980", create_connection + ) + # Get the number of movies per year + dataset = ray.data.read_sql( + "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection + ) + + + .. tab-item:: Databricks + + To read from Databricks, install the + `Databricks SQL Connector for Python `_. + + .. code-block:: console + + pip install databricks-sql-connector + + + Then, define your connection logic and read from the Databricks SQL warehouse. + + .. 
code-block:: python + + from databricks import sql + + import ray + + def create_connection(): + return sql.connect( + server_hostname="dbc-1016e3a4-d292.cloud.databricks.com", + http_path="/sql/1.0/warehouses/a918da1fc0b7fed0", + access_token=..., + ) + + # Get all movies + dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) + # Get movies after the year 1980 + dataset = ray.data.read_sql( + "SELECT title, score FROM movie WHERE year >= 1980", create_connection + ) + # Get the number of movies per year + dataset = ray.data.read_sql( + "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection + ) + + .. tab-item:: BigQuery + + To read from BigQuery, install the + `Python Client for Google BigQuery `_. + This package includes a DB API2-compliant database connector. + + .. code-block:: console + + pip install google-cloud-bigquery + + Then, define your connection logic and query the dataset. + + .. code-block:: python + + from google.cloud import bigquery + from google.cloud.bigquery import dbapi + + import ray + + def create_connection(): + client = bigquery.Client(...) + return dbapi.Connection(client) + + # Get all movies + dataset = ray.data.read_sql("SELECT * FROM movie", create_connection) + # Get movies after the year 1980 + dataset = ray.data.read_sql( + "SELECT title, score FROM movie WHERE year >= 1980", create_connection + ) + # Get the number of movies per year + dataset = ray.data.read_sql( + "SELECT year, COUNT(*) FROM movie GROUP BY year", create_connection + ) + + +.. _data_custom_datasource: + +------------------ +Custom Datasources +------------------ + +Datasets can read and write in parallel to :ref:`custom datasources ` defined in Python. +Once you have implemented `YourCustomDatasource`, you can use it like any other source in Ray Data: + +.. code-block:: python + + # Read from a custom datasource. + ds = ray.data.read_datasource(YourCustomDatasource(), **read_args) + + # Write to a custom datasource. 
+ ds.write_datasource(YourCustomDatasource(), **write_args) + +For more details, check out :ref:`guide for implementing a custom datasource `. + +-------------------------- +Performance Considerations +-------------------------- + +The dataset ``parallelism`` determines the number of blocks the base data will be split into for parallel reads. Ray Data will decide internally how many read tasks to run concurrently to best utilize the cluster, ranging from ``1...parallelism`` tasks. In other words, the higher the parallelism, the smaller the data blocks in the Dataset and hence the more opportunity for parallel execution. + +.. image:: images/dataset-read.svg + :width: 650px + :align: center + +This default parallelism can be overridden via the ``parallelism`` argument; see the +:ref:`performance guide ` for more information on how to tune this read parallelism. diff --git a/doc/source/data/mars-on-ray.rst b/doc/source/data/mars-on-ray.rst index 252cea195f66..129fa94724f7 100644 --- a/doc/source/data/mars-on-ray.rst +++ b/doc/source/data/mars-on-ray.rst @@ -13,7 +13,7 @@ all mars scheduler optimizations. If ray tasks mode is used, all tasks will be s pipeline capabilities provided by ray futures. -.. _`Mars`: https://docs.pymars.org +.. _`Mars`: https://mars-project.readthedocs.io/en/latest/ Installation @@ -54,7 +54,7 @@ Or connecting to a Mars on Ray runtime which is already initialized: # perform computation -Interact with Ray Dataset: +Interact with Dataset: .. code-block:: python @@ -75,4 +75,4 @@ Interact with Ray Dataset: df2 = ds.to_mars() print(df2.head(5).execute()) -Refer to _`Mars on Ray`: https://docs.pymars.org/en/latest/installation/ray.html for more information. +Refer to _`Mars on Ray`: https://mars-project.readthedocs.io/en/latest/installation/ray.html#mars-ray for more information. 
diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst index 4b3ac00fb54c..6d9ec81d9bb5 100644 --- a/doc/source/data/performance-tips.rst +++ b/doc/source/data/performance-tips.rst @@ -3,10 +3,16 @@ Performance Tips and Tuning =========================== +Monitoring your application +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +View the Ray dashboard to monitor your application and troubleshoot issues. To learn +more about the Ray dashboard, read :ref:`Ray Dashboard `. + Debugging Statistics ~~~~~~~~~~~~~~~~~~~~ -You can view debug stats for your Dataset and DatasetPipeline executions via :meth:`ds.stats() `. +You can view debug stats for your Dataset executions via :meth:`ds.stats() `. These stats can be used to understand the performance of your Dataset workload and can help you debug problematic bottlenecks. Note that both execution and iterator statistics are available: .. code-block:: python @@ -20,57 +26,35 @@ These stats can be used to understand the performance of your Dataset workload a ds = ray.data.range(10000) ds = ds.map(lambda x: str(x + 1)) + ds = ds.map(pause) - pipe = ds.repeat(5).map(pause).random_shuffle_each_window() + for x in ds.iter_batches(): + pass - @ray.remote - def consume(p, stats=False): - for x in p.iter_batches(): - pass - if stats: - print(p.stats()) - - a, b = pipe.split(2) - ray.get([consume.remote(a), consume.remote(b, True)]) + print(ds.stats()) .. 
code-block:: - == Pipeline Window 4 == - Stage 0 read: [execution cached] - Stage 1 map: [execution cached] - Stage 2 map: 200/200 blocks executed in 0.37s - * Remote wall time: 8.08ms min, 15.82ms max, 9.36ms mean, 1.87s total - * Remote cpu time: 688.79us min, 3.63ms max, 977.38us mean, 195.48ms total - * Output num rows: 50 min, 50 max, 50 mean, 10000 total - * Output size bytes: 456 min, 456 max, 456 mean, 91200 total - * Tasks per node: 200 min, 200 max, 200 mean; 1 nodes used - - Stage 3 random_shuffle_map: 200/200 blocks executed in 0.63s - * Remote wall time: 550.98us min, 5.2ms max, 900.66us mean, 180.13ms total - * Remote cpu time: 550.79us min, 1.13ms max, 870.82us mean, 174.16ms total - * Output num rows: 50 min, 50 max, 50 mean, 10000 total - * Output size bytes: 456 min, 456 max, 456 mean, 91200 total - * Tasks per node: 200 min, 200 max, 200 mean; 1 nodes used - - Stage 3 random_shuffle_reduce: 200/200 blocks executed in 0.63s - * Remote wall time: 152.37us min, 322.96us max, 218.32us mean, 43.66ms total - * Remote cpu time: 151.9us min, 321.53us max, 217.96us mean, 43.59ms total - * Output num rows: 32 min, 69 max, 50 mean, 10000 total - * Output size bytes: 312 min, 608 max, 456 mean, 91200 total - * Tasks per node: 200 min, 200 max, 200 mean; 1 nodes used + Stage 1 ReadRange->Map->Map: 16/16 blocks executed in 0.37s + * Remote wall time: 101.55ms min, 331.39ms max, 135.24ms mean, 2.16s total + * Remote cpu time: 7.42ms min, 15.88ms max, 11.01ms mean, 176.15ms total + * Peak heap memory usage (MiB): 157.18 min, 157.73 max, 157 mean + * Output num rows: 625 min, 625 max, 625 mean, 10000 total + * Output size bytes: 3658 min, 4392 max, 4321 mean, 69150 total + * Tasks per node: 16 min, 16 max, 16 mean; 1 nodes used + * Extra metrics: {'obj_store_mem_alloc': 3658, 'obj_store_mem_freed': 5000, 'obj_store_mem_peak': 40000} Dataset iterator time breakdown: - * In ray.wait(): 1.15ms - * In ray.get(): 3.51ms - * In format_batch(): 6.83ms - * In user code: 
441.53us - * Total time: 12.92ms - - ##### Overall Pipeline Time Breakdown ##### - * Time stalled waiting for next dataset: 3.48ms min, 758.48ms max, 486.78ms mean, 1.95s total - * Time in dataset iterator: 270.66ms - * Time in user code: 1.38ms - * Total time: 4.47s + * Total time user code is blocked: 551.67ms + * Total time in user code: 144.97us + * Total time overall: 1.01s + * Num blocks local: 0 + * Num blocks remote: 0 + * Num blocks unknown location: 16 + * Batch iteration time breakdown (summed across prefetch threads): + * In ray.get(): 75.68us min, 220.26us max, 131.89us avg, 2.11ms total + * In batch creation: 326.58us min, 1.37ms max, 644.86us avg, 25.79ms total + * In batch formatting: 101.81us min, 898.73us max, 172.38us avg, 6.9ms total Batching Transforms ~~~~~~~~~~~~~~~~~~~ @@ -79,10 +63,45 @@ Mapping individual records using :meth:`.map(fn) ` can be Instead, consider using :meth:`.map_batches(batch_fn, batch_format="pandas") ` and writing your ``batch_fn`` to perform vectorized pandas operations. +.. _data_format_overheads: + +Format Overheads +~~~~~~~~~~~~~~~~ + +Converting between the internal block types (Arrow, Pandas) +and the requested batch format (``"numpy"``, ``"pandas"``, ``"pyarrow"``) +may incur data copies; which conversions cause data copying is given in the below table: + + +.. list-table:: Data Format Conversion Costs + :header-rows: 1 + :stub-columns: 1 + + * - Block Type x Batch Format + - ``"pandas"`` + - ``"numpy"`` + - ``"pyarrow"`` + - ``None`` + * - Pandas Block + - Zero-copy + - Copy* + - Copy* + - Zero-copy + * - Arrow Block + - Copy* + - Zero-copy* + - Zero-copy + - Zero-copy + +.. note:: + \* No copies occur when converting between Arrow, Pandas, and NumPy formats for columns + represented as ndarrays (except for bool arrays). + + Parquet Column Pruning ~~~~~~~~~~~~~~~~~~~~~~ -Current Datasets will read all Parquet columns into memory. +Current Dataset will read all Parquet columns into memory. 
If you only need a subset of the columns, make sure to specify the list of columns explicitly when calling :meth:`ray.data.read_parquet() ` to avoid loading unnecessary data (projection pushdown). @@ -103,14 +122,23 @@ This can be used in conjunction with column pruning when appropriate to get the Tuning Read Parallelism ~~~~~~~~~~~~~~~~~~~~~~~ +By default, Ray Data automatically selects the read ``parallelism`` according to the following procedure: + +1. The number of available CPUs is estimated. If in a placement group, the number of CPUs in the cluster is scaled by the size of the placement group compared to the cluster size. If not in a placement group, this is the number of CPUs in the cluster. +2. The parallelism is set to the estimated number of CPUs multiplied by 2. If the parallelism is less than 8, it is set to 8. +3. The in-memory data size is estimated. If the parallelism would create in-memory blocks that are larger on average than the target block size (512MiB), the parallelism is increased until the blocks are < 512MiB in size. +4. The parallelism is truncated to ``min(num_files, parallelism)``. + +Occasionally, it is advantageous to manually tune the parallelism to optimize the application. This can be done when loading data via the ``parallelism`` parameter. +For example, use ``ray.data.read_parquet(path, parallelism=1000)`` to force up to 1000 read tasks to be created. + +Tuning Read Resources +~~~~~~~~~~~~~~~~~~~~~ + By default, Ray requests 1 CPU per read task, which means one read task per CPU can execute concurrently. For data sources that can benefit from higher degrees of I/O parallelism, you can specify a lower ``num_cpus`` value for the read function via the ``ray_remote_args`` parameter. For example, use ``ray.data.read_parquet(path, ray_remote_args={"num_cpus": 0.25})`` to allow up to four read tasks per CPU. -By default, Datasets automatically selects the read parallelism based on the current cluster size and dataset size. 
-However, the number of read tasks can also be increased manually via the ``parallelism`` parameter. -For example, use ``ray.data.read_parquet(path, parallelism=1000)`` to force up to 1000 read tasks to be created. - .. _shuffle_performance_tips: Enabling Push-Based Shuffle diff --git a/doc/source/data/pipelining-compute.rst b/doc/source/data/pipelining-compute.rst index acab716c1611..b201d377da2f 100644 --- a/doc/source/data/pipelining-compute.rst +++ b/doc/source/data/pipelining-compute.rst @@ -1,15 +1,13 @@ .. _pipelining_datasets: -.. note:: +============================= +DatasetPipelines (deprecated) +============================= - The DatasetPipeline is expected to be deprecated in Ray 2.5. If your use case doesn't - need per-window shuffle, we recommend using just plain Datasets, which supports the - streaming execution by default in Ray 2.4. For more detail, see - :ref:`Streaming Execution `. +.. warning:: -================== -Pipelining Compute -================== + DatasetPipelines are deprecated now that Dataset provides pipelined execution + by default. For more detail, see :ref:`Streaming Execution `. Dataset pipelines allow Dataset transformations to be executed incrementally on *windows* of the base data, instead of on all of the data at once. This can be used for streaming data loading into ML training, or to execute batch transformations on large datasets without needing to load the entire dataset into cluster memory. diff --git a/doc/source/data/transforming-data.rst b/doc/source/data/transforming-data.rst new file mode 100644 index 000000000000..0b9305dabbef --- /dev/null +++ b/doc/source/data/transforming-data.rst @@ -0,0 +1,286 @@ +.. _transforming_data: + +================= +Transforming Data +================= + +Dataset transforms take in datasets and produce new datasets. For example, *map_batches* +is a transform that applies a +:ref:`user-defined function ` on each data record +and returns a new dataset as the result. 
Dataset transforms can be composed to +express a chain of computations. + +-------- +Overview +-------- + +There are two main types of supported transforms: + +* One-to-one: each input block will contribute to only one output + block, such as :meth:`ds.map_batches() `. +* All-to-all: input blocks can contribute to multiple output blocks, + such as :meth:`ds.random_shuffle() `. + +.. list-table:: Common Ray Data transforms. + :header-rows: 1 + + * - Transform + - Type + - Description + * - :meth:`ds.map() ` + - One-to-one + - Apply a given function to individual data records. + * - :meth:`ds.map_batches() ` + - One-to-one + - Apply a given function to batches of records. + * - :meth:`ds.repartition() ` + - All-to-all + - | Repartition the dataset into N blocks. + * - :meth:`ds.random_shuffle() ` + - All-to-all + - | Randomly shuffle the dataset. + * - :meth:`ds.groupby().\() ` + - All-to-all + - | Group data by column and aggregate each group. + * - :meth:`ds.groupby().map_groups() ` + - All-to-all + - | Group data by column and transform each group. + +.. _transform_datasets_writing_udfs: + +-------------- +Map transforms +-------------- + +Use ``map_batches`` to efficiently transform records in batches, or ``map`` to transform records individually: + +.. tab-set:: + + .. tab-item:: Map Batches + + Call ``map_batches`` to transform batches of records. Each batch has type ``Dict[str, np.ndarray]``. The below example shows how to use ``map_batches`` to convert text records to lowercase: + + .. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __map_batches_begin__ + :end-before: __map_batches_end__ + + .. tab-item:: Map + + Records can also be transformed one at a time using the ``map`` function, which takes records encoded as ``Dict[str, Any]``. The below example shows how to convert text records to lowercase: + + .. 
literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __map_begin__ + :end-before: __map_end__ + +Configuring CPUs and GPUs +========================= + +By default, each task used for transforms (e.g., `map` or `map_batches`) requests 1 CPU from Ray. +To increase the resources reserved per task, you can increase the CPU request by specifying +``.map_batches(..., num_cpus=N)``, which will instead reserve ``N`` CPUs per task. +Increasing the CPUs per task can help with avoiding out of memory (OOM) errors +for resource intensive tasks. + +.. code-block:: python + + # Run each function with 1 CPU each (default). + ds.map_batches(func) + + # Run each function with 4 CPUs each. + ds.map_batches(func, num_cpus=4) + +To request tasks be run on a GPU, use ``.map_batches(..., num_gpus=1)``, etc. In addition to +``num_cpus`` and ``num_gpus``, any kwarg from ``@ray.remote`` can be passed to customize +the resource scheduling of tasks: + +.. code-block:: python + + # Run each function with 1 GPU each. + ds.map_batches(func, num_gpus=1) + + # Can also customize other ray remote args such as `max_retries`. + ds.map_batches(func, num_gpus=1, max_retries=10) + +Configuring batch size +====================== + +An important parameter to set for :meth:`ds.map_batches() ` +is ``batch_size``, which controls the size of the batches provided to your transform function. The default +batch size is `4096` for CPU tasks. For GPU tasks, an explicit batch size is always required: + +.. code-block:: python + + # Each batch sent to `func` will have up to 4096 records (default). + ds.map_batches(func) + + # Reduce the batch size to 64 records per batch. + ds.map_batches(func, batch_size=64) + +Increasing ``batch_size`` can improve performance for transforms that take advantage of vectorization, but will also result in higher memory utilization, which can lead to out-of-memory (OOM) errors. If encountering OOMs, decreasing your ``batch_size`` may help. 
Note also that if the ``batch_size`` becomes larger than the number of records per block, multiple blocks will be bundled together into a single batch, potentially reducing the parallelism available. + +.. _transform_datasets_batch_formats: + +Configuring batch format +======================== + +Customize the format of data batches using the ``batch_format`` argument to :meth:`ds.map_batches() `. The following are examples in each available batch format. + +Transform functions do not have to return data in the same format as the input batch. For example, you could return a ``pd.DataFrame`` even if the input was in NumPy format. + +.. tab-set:: + + .. tab-item:: NumPy (default) + + The ``"numpy"`` option presents batches as ``Dict[str, np.ndarray]``, where the + `numpy.ndarray `__ + values represent a batch of record field values. + + .. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __writing_numpy_udfs_begin__ + :end-before: __writing_numpy_udfs_end__ + + .. tab-item:: Pandas + + The ``"pandas"`` batch format presents batches in + `pandas.DataFrame `__ + format. + + .. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __writing_pandas_udfs_begin__ + :end-before: __writing_pandas_udfs_end__ + + .. tab-item:: PyArrow + + The ``"pyarrow"`` batch format presents batches in + `pyarrow.Table `__ + format. + + .. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __writing_arrow_udfs_begin__ + :end-before: __writing_arrow_udfs_end__ + +.. _transforming_data_actors: + +Reduce setup overheads using actors +=================================== + +Data transforms can be executed by either :ref:`Ray tasks ` +or :ref:`Ray actors `. By default, ``map_batches`` uses tasks. +For transforms that require expensive setup, +it's preferrable to use actors, which are stateful and allow setup to be reused +for efficiency. 
For a fixed-size actor pool, specify ``compute=ActorPoolStrategy(size=n)``. +For an autoscaling actor pool, use ``compute=ray.data.ActorPoolStrategy(min_size=m, max_size=n)``. + +When using actors, you must also specify your transform as a callable class type instead of a plain function. The following is an example of using actors for batch inference: + +.. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __dataset_compute_strategy_begin__ + :end-before: __dataset_compute_strategy_end__ + +Reduce memory usage using generators +==================================== + +Transform functions can also be written as Python generators, yielding multiple outputs for a batch or row instead of a single item. Generator UDFs are useful when returning large objects. Instead of returning a very large output batch, ``fn`` can instead yield the output batch in chunks to avoid excessive heap memory usage. + +.. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __writing_generator_udfs_begin__ + :end-before: __writing_generator_udfs_end__ + +------------------ +Shuffle transforms +------------------ + +Shuffle transforms change the organization of the data, e.g., increasing the number of blocks, or the order of records in each block, without changing the record contents. + +Repartitioning data +=================== + +Call :meth:`Dataset.repartition() ` to change the +number of blocks of the dataset. This may be useful to break up your dataset into small +pieces to enable more fine-grained parallelization, or to reduce the number of files +produced as output of a write operation. + +.. literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __shuffle_begin__ + :end-before: __shuffle_end__ + +Random shuffle +============== + +Call :meth:`Dataset.random_shuffle() ` to +globally shuffle the order of data records. + +.. 
doctest:: + + >>> import ray + >>> dataset = ray.data.range(10) + >>> dataset.random_shuffle().take_batch() # doctest: +SKIP + {'id': array([7, 0, 9, 3, 5, 1, 4, 2, 8, 6])} + +For reduced overhead during training ingest, use local shuffles. Read +:ref:`Shuffling Data ` in the AIR user guide to learn more. + +.. _data-groupbys: + +------------------ +Grouped transforms +------------------ + +Ray Data supports grouping data by column and applying aggregations to each group. This is supported via the :meth:`ds.groupby() ` call. + +Aggregations +============ + +Aggregations can be performed per group: + +.. code-block:: python + + ds = ray.data.from_items([ + {"A": x % 3, "B": 2 * x, "C": 3 * x} + for x in range(10) + ]) + + # Group by the A column and calculate the per-group mean for B and C columns. + ds.groupby("A").mean(["B", "C"]).to_pandas() + # -> + # A mean(B) mean(C) + # 0 0 9.0 13.5 + # 1 1 8.0 12.0 + # 2 2 10.0 15.0 + +Aggregations can also be applied globally: + +.. code-block:: python + + from ray.data.aggregate import Mean, Std + + # Global mean on B and C columns. + ds.mean(["B", "C"]) + # -> {'mean(B)': 9.0, 'mean(C)': 13.5} + + # Multiple global aggregations on multiple columns. + ds.aggregate(Mean("B"), Std("B", ddof=0), Mean("C"), Std("C", ddof=0)) + # -> {'mean(A)': 0.9, 'std(A)': 0.8306623862918076, 'mean(B)': 9.0, 'std(B)': 5.744562646538029} + +Note that Ray Data currently only supports grouping by a single column. In order to group by multiple columns, you can first compute the grouping key using ``map_batches`` prior to calling ``groupby``. + +Map Groups +========== + +Custom processing can be applied to each group of records using :meth:`ds.groupby().map_groups() `. For example, this could be used to implement custom aggregations, train a model per group, etc. + +.. 
literalinclude:: ./doc_code/transforming_data.py + :language: python + :start-after: __map_groups_begin__ + :end-before: __map_groups_end__ + +Note that when using ``map_groups``, all records of the same group will be gathered into the same batch, +which may lead to out-of-memory errors if the group size exceeds the capacity of a single machine. diff --git a/doc/source/data/transforming-datasets.rst b/doc/source/data/transforming-datasets.rst deleted file mode 100644 index ab646b904eb2..000000000000 --- a/doc/source/data/transforming-datasets.rst +++ /dev/null @@ -1,563 +0,0 @@ -.. _transforming_datasets: - -===================== -Transforming Datasets -===================== - -Datasets transformations take in datasets and produce new datasets. For example, *map* -is a transformation that applies a -:ref:`user-defined function ` on each dataset record -and returns a new dataset as the result. Datasets transformations can be composed to -express a chain of computations. - -.. tip:: - - If you're performing common ML transformations like normalization and label - encoding, create a :class:`~ray.data.preprocessor.Preprocessor` instead. To learn - more, read :ref:`Using Preprocessors `. - -.. _transform_datasets_transformations: - ---------------- -Transformations ---------------- - -There are two main types of transformations: - -* One-to-one: each input block will contribute to only one output - block, such as :meth:`ds.map_batches() `. -* All-to-all: input blocks can contribute to multiple output blocks, - such as :meth:`ds.random_shuffle() `. - -Here is a table listing some common transformations supported by Ray Datasets. - -.. list-table:: Common Ray Datasets transformations. - :header-rows: 1 - - * - Transformation - - Type - - Description - * - :meth:`ds.map_batches() ` - - One-to-one - - Apply a given function to batches of records of this dataset. 
- * - :meth:`ds.add_column() ` - - One-to-one - - Apply a given function to batches of records to create a new column. - * - :meth:`ds.drop_columns() ` - - One-to-one - - Drop the given columns from the dataset. - * - :meth:`ds.split() ` - - One-to-one - - | Split the dataset into N disjoint pieces. - * - :meth:`ds.repartition(shuffle=False) ` - - One-to-one - - | Repartition the dataset into N blocks, without shuffling the data. - * - :meth:`ds.repartition(shuffle=True) ` - - All-to-all - - | Repartition the dataset into N blocks, shuffling the data during repartition. - * - :meth:`ds.random_shuffle() ` - - All-to-all - - | Randomly shuffle the elements of this dataset. - * - :meth:`ds.sort() ` - - All-to-all - - | Sort the dataset by a sortkey. - * - :meth:`ds.groupby() ` - - All-to-all - - | Group the dataset by a groupkey. - -.. tip:: - - Datasets also provides the convenience transformation methods :meth:`ds.map() `, - :meth:`ds.flat_map() `, and :meth:`ds.filter() `, - which are not vectorized (slower than :meth:`ds.map_batches() `), but - may be useful for development. - -The following is an example to make use of those transformation APIs for processing -the Iris dataset. - -.. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __dataset_transformation_begin__ - :end-before: __dataset_transformation_end__ - -.. _transform_datasets_writing_udfs: - -------------------------------------- -Writing User-defined Functions (UDFs) -------------------------------------- - -User-defined functions (UDFs) are routines that apply on one row (e.g. -:meth:`.map() `) or a batch of rows (e.g. -:meth:`.map_batches() `) of a dataset. UDFs let you -express your customized business logic in transformations. Here we will focus on -:meth:`.map_batches() ` as it's the primary mapping -API in Datasets. 
- -Here are the basics that you need to know about UDFs: - -* A UDF can be either a function, a generator, or if using the :ref:`actor compute strategy `, a :ref:`callable class `. -* Select the UDF input :ref:`batch format ` using the ``batch_format`` argument. -* The UDF output type determines the Dataset schema of the transformation result. - -.. _transform_datasets_callable_classes: - -Types of UDFs -============= -There are three types of UDFs that you can use with Ray Data: Function UDFs, Callable Class UDFs, and Generator UDFs. - -.. tabbed:: "Function UDFs" - - The most basic UDFs are functions that take in a batch or row as input, and returns a batch or row as output. See :ref:`transform_datasets_batch_formats` for the supported batch formats. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_default_udfs_tabular_begin__ - :end-before: __writing_default_udfs_tabular_end__ - -.. tabbed:: "Callable Class UDFs" - - With the actor compute strategy, you can use per-row and per-batch UDFs - *callable classes*, i.e., classes that implement the ``__call__`` magic method. You - can use the constructor of the class for stateful setup, and it is only invoked once - per worker actor. - - Callable classes are useful if you need to load expensive state (such as a model) for the UDF. By using an actor class, you only need to load the state once in the beginning, rather than for each batch. - - .. note:: - These transformation APIs take the uninstantiated callable class as an argument, - not an instance of the class. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_callable_classes_udfs_begin__ - :end-before: __writing_callable_classes_udfs_end__ - -.. tabbed:: "Generator UDFs" - - UDFs can also be written as Python generators, yielding multiple outputs for a batch or row instead of a single item. Generator UDFs are useful when returning large objects. 
Instead of returning a very large output batch, ``fn`` can instead yield the output batch in chunks to avoid excessive heap memory usage. - - .. warning:: - When applying a generator UDF on individual rows, make sure to use the :meth:`.flat_map() ` API and not the :meth:`.map() ` API. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_generator_udfs_begin__ - :end-before: __writing_generator_udfs_end__ - - -.. _transform_datasets_batch_formats: - -UDF Input Batch Format -====================== - -Choose the *batch format* of the data given to UDFs -by setting the ``batch_format`` option of :meth:`.map_batches() `. -Here is an overview of the available batch formats: - -.. tabbed:: "default" - - The "default" batch format presents data as follows for each Dataset type: - - * **Tabular Datasets**: Each batch will be a - `pandas.DataFrame `__. - This may incur a conversion cost if the underlying Dataset block is not - zero-copy convertible from an Arrow table. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_default_udfs_tabular_begin__ - :end-before: __writing_default_udfs_tabular_end__ - - * **Tensor Datasets** (single-column): Each batch will be a single - `numpy.ndarray `__ - containing the single tensor column for this batch. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_default_udfs_tensor_begin__ - :end-before: __writing_default_udfs_tensor_end__ - - * **Simple Datasets**: Each batch will be a Python list. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_default_udfs_list_begin__ - :end-before: __writing_default_udfs_list_end__ - -.. tabbed:: "pandas" - - The ``"pandas"`` batch format presents batches in - `pandas.DataFrame `__ - format. 
If converting a simple dataset to Pandas DataFrame batches, a single-column - dataframe with the column ``"__value__"`` will be created. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_pandas_udfs_begin__ - :end-before: __writing_pandas_udfs_end__ - -.. tabbed:: "pyarrow" - - The ``"pyarrow"`` batch format presents batches in - `pyarrow.Table `__ - format. If converting a simple dataset to Arrow Table batches, a single-column table - with the column ``"__value__"`` will be created. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_arrow_udfs_begin__ - :end-before: __writing_arrow_udfs_end__ - -.. tabbed:: "numpy" - - The ``"numpy"`` batch format presents batches in - `numpy.ndarray `__ - format as follows: - - * **Tabular Datasets**: Each batch will be a dictionary of NumPy - ndarrays (``Dict[str, np.ndarray]``), with each key-value pair representing a column - in the table. - - * **Tensor Datasets** (single-column): Each batch will be a single - `numpy.ndarray `__ - containing the single tensor column for this batch. - - * **Simple Datasets**: Each batch will be a single NumPy ndarray, where Datasets will - attempt to convert each list-batch to an ndarray. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_numpy_udfs_begin__ - :end-before: __writing_numpy_udfs_end__ - -Converting between the underlying Datasets data representations (Arrow, Pandas, and -Python lists) and the requested batch format (``"default"``, ``"pandas"``, -``"pyarrow"``, ``"numpy"``) may incur data copies; which conversions cause data copying -is given in the below table: - - -.. 
list-table:: Data Format Conversion Costs - :header-rows: 1 - :stub-columns: 1 - - * - Dataset Format x Batch Format - - ``"default"`` - - ``"pandas"`` - - ``"numpy"`` - - ``"pyarrow"`` - - ``None`` - * - ``"pandas"`` - - Zero-copy - - Zero-copy - - Copy* - - Copy* - - Zero-copy - * - ``"arrow"`` - - Copy* - - Copy* - - Zero-copy* - - Zero-copy - - Zero-copy - * - ``"simple"`` - - Copy - - Copy - - Copy - - Copy - - Copy - -.. note:: - \* No copies occur when converting between Arrow, Pandas, and NumPy formats for columns - represented in our tensor extension type (unless data is boolean). Copies **always** - occur when converting boolean data from/to Arrow to/from Pandas/NumPy, since Arrow - bitpacks boolean data while Pandas/NumPy does not. - -.. tip:: - - Prefer using vectorized operations on the ``pandas.DataFrame``, - ``pyarrow.Table``, and ``numpy.ndarray`` types for better performance. For - example, suppose you want to compute the sum of a column in ``pandas.DataFrame``: - instead of iterating over each row of a batch and summing up values of that column, - use ``df_batch["col_foo"].sum()``. - -.. tip:: - - If the UDF for :meth:`ds.map_batches() ` does **not** - mutate its input, we can prevent an unnecessary data batch copy by specifying - ``zero_copy_batch=True``, which will provide the UDF with zero-copy, read-only - batches. See the :meth:`ds.map_batches() ` docstring for - more information. - -.. _transform_datasets_batch_output_types: - -Batch UDF Output Types -====================== - -The following output types are allowed for batch UDFs (e.g., -:meth:`ds.map_batches() `). The following describes -how they are interpreted to create the transformation result: - -.. tabbed:: pd.DataFrame - - Returning ``pd.DataFrame`` creates a Tabular dataset as the transformation result: - - .. 
literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_pandas_out_udfs_begin__ - :end-before: __writing_pandas_out_udfs_end__ - -.. tabbed:: pa.Table - - Returning ``pa.Table`` creates a Tabular dataset as the transformation result: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_arrow_out_udfs_begin__ - :end-before: __writing_arrow_out_udfs_end__ - -.. tabbed:: np.ndarray - - Returning ``np.ndarray`` creates a single-column Tensor dataset as the transformation result: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_numpy_out_udfs_begin__ - :end-before: __writing_numpy_out_udfs_end__ - -.. tabbed:: Dict[str, np.ndarray] - - Returning ``Dict[str, np.ndarray]`` creates a multi-column Tensor dataset as the transformation result. - - If a column tensor is 1-dimensional, then the native Arrow 1D list - type is used; if a column tensor has 2 or more dimensions, then the Dataset - :ref:`tensor extension type ` to embed these - n-dimensional tensors in the Arrow table. - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_numpy_dict_out_udfs_begin__ - :end-before: __writing_numpy_dict_out_udfs_end__ - -.. tabbed:: list - - Returning ``list`` creates a simple Python object dataset as the transformation result: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_simple_out_udfs_begin__ - :end-before: __writing_simple_out_udfs_end__ - -.. _transform_datasets_row_output_types: - -Row UDF Output Types -==================== - -The following output types are allowed for per-row UDFs (e.g., -:meth:`ds.map() `): - -.. tabbed:: dict - - Returning a ``dict`` of Arrow-compatible data types creates a Tabular dataset - as the transformation result. 
If any dict values are not Arrow-compatible, then - a simple Python object dataset will be created: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_dict_out_row_udfs_begin__ - :end-before: __writing_dict_out_row_udfs_end__ - -.. tabbed:: np.ndarray - - Returning ``np.ndarray`` creates a single-column Tensor dataset as the transformation result: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_numpy_out_row_udfs_begin__ - :end-before: __writing_numpy_out_row_udfs_end__ - -.. tabbed:: object - - Other return row types will create a simple Python object dataset as the transformation result: - - .. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __writing_simple_out_row_udfs_begin__ - :end-before: __writing_simple_out_row_udfs_end__ - -.. _transform_datasets_configuring_batch_size: - ----------------------- -Configuring Batch Size ----------------------- - -:meth:`ds.map_batches() ` is the canonical parallel -transformation API for Datasets: it launches parallel tasks over the underlying Datasets -blocks and maps UDFs over data batches within those tasks, allowing the UDF to -implement vectorized operations on batches. An important parameter to -set is ``batch_size``, which controls the size of the batches provided to the UDF. - -.. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __configuring_batch_size_begin__ - :end-before: __configuring_batch_size_end__ - -Increasing ``batch_size`` can result in faster execution by better leveraging vectorized -operations and hardware, reducing batch slicing and concatenation overhead, and overall -saturation of CPUs/GPUs, but will also result in higher memory utilization, which can -lead to out-of-memory failures. If encountering OOMs, decreasing your ``batch_size`` may -help. - -.. 
note:: - The default ``batch_size`` of ``4096`` may be too large for datasets with large rows - (e.g. tables with many columns or a collection of large images). - -If you specify a ``batch_size`` that's larger than your ``Dataset`` blocks, Datasets -will bundle multiple blocks together for a single task in order to better satisfy -``batch_size``. If ``batch_size`` is a lot larger than your ``Dataset`` blocks (e.g. if -your dataset was created with too large of a ``parallelism`` and/or the ``batch_size`` -is set to too large of a value for your dataset), the number of parallel tasks -may be less than expected. - -If your ``Dataset`` blocks are smaller than your ``batch_size`` and you want to increase -:meth:`ds.map_batches() ` parallelism, decrease your -``batch_size`` to prevent this block bundling. If you think that your ``Dataset`` blocks -are too small, try decreasing ``parallelism`` during the read to create larger blocks. - -.. note:: - The size of the batches provided to the UDF may be smaller than the provided - ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent to a given - task. - -.. note:: - Block bundling (processing multiple blocks in a single task) will not occur if - ``batch_size`` is not set; instead, each task will receive a single block. If a block - is smaller than the default ``batch_size`` (4096), then the batch provided to the UDF - in that task will the same size as the block, and will therefore be smaller than the - default ``batch_size``. - -.. _transform_datasets_compute_strategy: - ----------------- -Compute Strategy ----------------- - -Datasets transformations are executed by either :ref:`Ray tasks ` -or :ref:`Ray actors ` across a Ray cluster. By default, Ray tasks are -used (with ``compute="tasks"``). For transformations that require expensive setup, -it's preferrable to use Ray actors, which are stateful and allow setup to be reused -for efficiency. 
For a fixed-size actor pool, specify ``compute=ActorPoolStrategy(size=n)``. -For an autoscaling actor pool, use ``compute=ray.data.ActorPoolStrategy(min_size=m, max_size=n)``. - -The following is an example of using the Ray tasks and actors compute strategy -for batch inference: - -.. literalinclude:: ./doc_code/transforming_datasets.py - :language: python - :start-after: __dataset_compute_strategy_begin__ - :end-before: __dataset_compute_strategy_end__ - -.. _datasets-groupbys: - --------------------------- -Group-bys and aggregations --------------------------- - -Unlike mapping operations, groupbys and aggregations are global. Grouped aggregations -are executed lazily. Global aggregations are executed *eagerly* and block until the -aggregation has been computed. - -.. code-block:: python - - ds: ray.data.Dataset = ray.data.from_items([ - {"A": x % 3, "B": 2 * x, "C": 3 * x} - for x in range(10)]) - - # Group by the A column and calculate the per-group mean for B and C columns. - agg_ds: ray.data.Dataset = ds.groupby("A").mean(["B", "C"]).materialize() - # -> Sort Sample: 100%|███████████████████████████████████████| 10/10 [00:01<00:00, 9.04it/s] - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 23.66it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 10/10 [00:00<00:00, 937.21it/s] - # -> Dataset(num_blocks=10, num_rows=3, schema={}) - agg_ds.to_pandas() - # -> - # A mean(B) mean(C) - # 0 0 9.0 13.5 - # 1 1 8.0 12.0 - # 2 2 10.0 15.0 - - # Global mean on B column. - ds.mean("B") - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 2851.91it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 319.69it/s] - # -> 9.0 - - # Global mean on multiple columns. 
- ds.mean(["B", "C"]) - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 1730.32it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 231.41it/s] - # -> {'mean(B)': 9.0, 'mean(C)': 13.5} - - # Multiple global aggregations on multiple columns. - from ray.data.aggregate import Mean, Std - ds.aggregate(Mean("B"), Std("B", ddof=0), Mean("C"), Std("C", ddof=0)) - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 1568.73it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 133.51it/s] - # -> {'mean(A)': 0.9, 'std(A)': 0.8306623862918076, 'mean(B)': 9.0, 'std(B)': 5.744562646538029} - -Combine aggreations with batch mapping to transform datasets using computed statistics. -For example, you can efficiently standardize feature columns and impute missing values -with calculated column means. - -.. code-block:: python - - # Impute missing values with the column mean. - b_mean = ds.mean("B") - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 4054.03it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 359.22it/s] - # -> 9.0 - - def impute_b(df: pd.DataFrame): - df["B"].fillna(b_mean) - return df - - ds = ds.map_batches(impute_b, batch_format="pandas") - # -> MapBatches(impute_b) - # +- Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: int64, C: int64}) - - # Standard scaling of all feature columns. 
- stats = ds.aggregate(Mean("B"), Std("B"), Mean("C"), Std("C")) - # -> MapBatches(impute_b): 100%|██████████████████████████████| 10/10 [00:01<00:00, 7.16it/s] - # -> GroupBy Map: 100%|███████████████████████████████████████| 10/10 [00:00<00:00, 1260.99it/s] - # -> GroupBy Reduce: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 128.77it/s] - # -> {'mean(B)': 9.0, 'std(B)': 6.0553007081949835, 'mean(C)': 13.5, 'std(C)': 9.082951062292475} - - def batch_standard_scaler(df: pd.DataFrame): - def column_standard_scaler(s: pd.Series): - s_mean = stats[f"mean({s.name})"] - s_std = stats[f"std({s.name})"] - return (s - s_mean) / s_std - - cols = df.columns.difference(["A"]) - df.loc[:, cols] = df.loc[:, cols].transform(column_standard_scaler) - return df - - ds = ds.map_batches(batch_standard_scaler, batch_format="pandas") - ds.materialize() - # -> Map Progress: 100%|██████████████████████████████████████| 10/10 [00:00<00:00, 144.79it/s] - # -> Dataset(num_blocks=10, num_rows=10, schema={A: int64, B: double, C: double}) - --------------- -Shuffling data --------------- - -Call :meth:`Dataset.random_shuffle() ` to -perform a global shuffle. - -.. doctest:: - - >>> import ray - >>> dataset = ray.data.range(10) - >>> dataset.random_shuffle().take_all() # doctest: +SKIP - [7, 0, 9, 3, 5, 1, 4, 2, 8, 6] - -For better performance, perform a local shuffle. Read -:ref:`Shuffling Data ` in the AIR user guide to learn more. diff --git a/doc/source/data/user-guide.rst b/doc/source/data/user-guide.rst index 0d38e670ab3b..029cda45dac5 100644 --- a/doc/source/data/user-guide.rst +++ b/doc/source/data/user-guide.rst @@ -1,20 +1,23 @@ -.. _data_user_guide : +.. _data_user_guide: =========== User Guides =========== -If you’re new to Ray Datasets, we recommend starting with the :ref:`Ray Datasets Quick Start `. -This user guide will help you navigate the Ray Datasets project and show you how achieve several tasks. 
+If you’re new to Ray Data, we recommend starting with the +:ref:`Ray Data Quick Start `. +This user guide will help you navigate the Ray Data project and +show you how achieve several tasks. .. toctree:: :maxdepth: 2 - creating-datasets - transforming-datasets - consuming-datasets - dataset-tensor-support + loading-data + transforming-data + consuming-data + batch_inference + working-with-tensors custom-datasource - pipelining-compute - dataset-internals + data-internals performance-tips + pipelining-compute diff --git a/doc/source/data/working-with-tensors.rst b/doc/source/data/working-with-tensors.rst new file mode 100644 index 000000000000..6c064f941073 --- /dev/null +++ b/doc/source/data/working-with-tensors.rst @@ -0,0 +1,141 @@ +.. _working_with_tensors: + +Working with Tensors +==================== + +N-dimensional arrays (i.e., tensors) are ubiquitous in ML workloads. This guide +describes the limitations and best practices of working with such data. + +Tensor data representation +-------------------------- + +Ray Data represents tensors as +`NumPy ndarrays `__. + +.. testcode:: + + import ray + + ds = ray.data.read_images("s3://anonymous@air-example-data/digits") + print(ds) + +.. testoutput:: + :options: +ELLIPSIS + + Dataset( + num_blocks=..., + num_rows=100, + schema={image: numpy.ndarray(shape=(28, 28), dtype=uint8)} + ) + +Batches of fixed-shape tensors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your tensors have a fixed shape, Ray Data represents batches as regular ndarrays. + +.. doctest:: + + >>> import ray + >>> ds = ray.data.read_images("s3://anonymous@air-example-data/digits") + >>> batch = ds.take_batch(batch_size=32) + >>> batch["image"].shape + (32, 28, 28) + >>> batch["image"].dtype + dtype('uint8') + +Batches of variable-shape tensors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your tensors vary in shape, Ray Data represents batches as arrays of object dtype. + +.. 
doctest:: + + >>> import ray + >>> ds = ray.data.read_images("s3://anonymous@air-example-data/AnimalDetection") + >>> batch = ds.take_batch(batch_size=32) + >>> batch["image"].shape + (32,) + >>> batch["image"].dtype + dtype('O') + +The individual elements of these object arrays are regular ndarrays. + +.. doctest:: + + >>> batch["image"][0].dtype + dtype('uint8') + >>> batch["image"][0].shape # doctest: +SKIP + (375, 500, 3) + >>> batch["image"][3].shape # doctest: +SKIP + (333, 465, 3) + +.. _transforming_tensors: + +Transforming tensor data +------------------------ + +Call :meth:`~ray.data.Dataset.map` or :meth:`~ray.data.Dataset.map_batches` to transform tensor data. + +.. testcode:: + + from typing import Any, Dict + + import ray + import numpy as np + + ds = ray.data.read_images("s3://anonymous@air-example-data/AnimalDetection") + + def increase_brightness(row: Dict[str, Any]) -> Dict[str, Any]: + row["image"] = np.clip(row["image"] + 4, 0, 255) + return row + + # Increase the brightness, record at a time. + ds.map(increase_brightness) + + def batch_increase_brightness(batch: Dict[str, np.ndarray]) -> Dict: + batch["image"] = np.clip(batch["image"] + 4, 0, 255) + return batch + + # Increase the brightness, batch at a time. + ds.map_batches(batch_increase_brightness) + +In this example, we return ``np.ndarray`` directly as the output. Ray Data will also treat +returned lists of ``np.ndarray`` and objects implementing ``__array__`` (e.g., ``torch.Tensor``) +as tensor data. + +For more information on transforming data, read +:ref:`Transforming data `. + + +Saving tensor data +------------------ + +Save tensor data in Parquet or Numpy files. Other formats aren't supported. + +.. tab-set:: + + .. tab-item:: Parquet + + Call :meth:`~ray.data.Dataset.write_parquet` to save data in Parquet files. + + .. testcode:: + + import ray + + ds = ray.data.read_images("example://image-datasets/simple") + ds.write_parquet("/tmp/simple") + + + .. 
tab-item:: NumPy + + Call :meth:`~ray.data.Dataset.write_numpy` to save an ndarray column in a NumPy + file. + + .. testcode:: + + import ray + + ds = ray.data.read_images("example://image-datasets/simple") + ds.write_numpy("/tmp/simple.npy", column="image") + +For more information on saving data, read :ref:`Saving data `. diff --git a/doc/source/images/llm-stack.png b/doc/source/images/llm-stack.png new file mode 100644 index 000000000000..110c1339b81f Binary files /dev/null and b/doc/source/images/llm-stack.png differ diff --git a/doc/source/index.md b/doc/source/index.md index cb33f3860e65..87bb228668fb 100644 --- a/doc/source/index.md +++ b/doc/source/index.md @@ -75,7 +75,7 @@

    pip install "ray[default]"

    Installation guide >

    -
    @@ -133,7 +133,7 @@ dataset_transformed = preprocessor.fit_transform(dataset=dataset)
    @@ -166,7 +166,7 @@ result = trainer.fit()

    Scale generic Python code with simple, foundational primitives that enable a high degree of control for building distributed applications or custom platforms.

    - Learn more about Core > + Learn more about Core >
    @@ -391,7 +391,7 @@ ppo_algo.evaluate()

    Contribute to Ray

    - -``` \ No newline at end of file +``` diff --git a/doc/source/ray-air/api/dataset-ingest.rst b/doc/source/ray-air/api/dataset-ingest.rst index 22dfcdc53062..fe3b350db779 100644 --- a/doc/source/ray-air/api/dataset-ingest.rst +++ b/doc/source/ray-air/api/dataset-ingest.rst @@ -1,9 +1,9 @@ -Ray Dataset Ingest into AIR Trainers -===================================== +Ray Data Ingest into AIR Trainers +================================= .. seealso:: - See this :ref:`AIR Dataset ingest guide ` for usage examples. + See this :ref:`AIR Data ingest guide ` for usage examples. .. currentmodule:: ray diff --git a/doc/source/ray-air/api/predictor.rst b/doc/source/ray-air/api/predictor.rst index 1c438fbbd54c..34ab11a484a2 100644 --- a/doc/source/ray-air/api/predictor.rst +++ b/doc/source/ray-air/api/predictor.rst @@ -80,6 +80,7 @@ Batch Prediction API batch_predictor.BatchPredictor.predict batch_predictor.BatchPredictor.predict_pipelined +.. _air_framework_predictors: Built-in Predictors for Library Integrations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -91,6 +92,6 @@ Built-in Predictors for Library Integrations ~lightgbm.LightGBMPredictor ~tensorflow.TensorflowPredictor ~torch.TorchPredictor - ~huggingface.HuggingFacePredictor + ~hf_transformers.TransformersPredictor ~sklearn.SklearnPredictor ~rl.RLPredictor diff --git a/doc/source/ray-air/benchmarks.rst b/doc/source/ray-air/benchmarks.rst index b5e30f905c21..46a4b0d0f0c4 100644 --- a/doc/source/ray-air/benchmarks.rst +++ b/doc/source/ray-air/benchmarks.rst @@ -259,13 +259,13 @@ overhead of a few seconds for both methods that is negligible for longer trainin .. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24 .. 
_`GPU image batch prediction script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/gpu_batch_prediction.py#L18-L49 .. _`GPU image training script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py#L95-L106 -.. _`GPU prediction small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1_g4_8xl.yaml#L6-L15 -.. _`GPU prediction large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4_g4_12xl.yaml#L6-L15 -.. _`GPU training small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1.yaml#L6-L24 -.. _`GPU training large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_16.yaml#L5-L25 +.. _`GPU prediction small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_aws.yaml#L6-L15 +.. _`GPU prediction large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml#L6-L15 +.. _`GPU training small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml#L6-L24 +.. _`GPU training large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml#L5-L25 .. _`Pytorch comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/torch_benchmark.py -.. _`Pytorch comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4.yaml -.. 
_`Pytorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml +.. _`Pytorch comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml +.. _`Pytorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml .. _`Tensorflow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py -.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4.yaml -.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml +.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml +.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml diff --git a/doc/source/ray-air/check-ingest.rst b/doc/source/ray-air/check-ingest.rst index f6969103c62e..3c74e6aa1509 100644 --- a/doc/source/ray-air/check-ingest.rst +++ b/doc/source/ray-air/check-ingest.rst @@ -3,7 +3,7 @@ Configuring Training Datasets ============================= -AIR builds its training data pipeline on :ref:`Ray Datasets `, which is a scalable, framework-agnostic data loading and preprocessing library. Datasets enables AIR to seamlessly load data for local and distributed training with Train. +AIR builds its training data pipeline on :ref:`Ray Data `, which is a scalable, framework-agnostic data loading and preprocessing library. 
Ray Data enables AIR to seamlessly load data for local and distributed training with Train. This page describes how to setup and configure these datasets in Train under different scenarios and scales. @@ -74,57 +74,59 @@ Here are some examples of configuring Dataset ingest options and what they do: Enabling Streaming Ingest ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: Bulk Ingest +.. tab-set:: - By default, AIR loads all datasets into the Ray object store at the start of training. - This provides the best performance if the cluster can fit the datasets - entirely in memory, or if the preprocessing step is expensive to run more than once. + .. tab-item:: Bulk Ingest - .. literalinclude:: doc_code/air_ingest.py - :language: python - :start-after: __config_4__ - :end-before: __config_4_end__ + By default, AIR loads all datasets into the Ray object store at the start of training. + This provides the best performance if the cluster can fit the datasets + entirely in memory, or if the preprocessing step is expensive to run more than once. - You should use bulk ingest when: + .. literalinclude:: doc_code/air_ingest.py + :language: python + :start-after: __config_4__ + :end-before: __config_4_end__ - * you have enough memory to fit data blocks in cluster object store; or - * your preprocessing transform is expensive to recompute on each epoch + You should use bulk ingest when: -.. tabbed:: Streaming Ingest (experimental) + * you have enough memory to fit data blocks in cluster object store; or + * your preprocessing transform is expensive to recompute on each epoch - In streaming ingest mode, instead of loading the entire dataset into the - Ray object store at once, AIR will load a fraction of the dataset at a - time. This can be desirable when the dataset is very large, and caching it - all at once would cause expensive disk spilling. The downside is that the - dataset will have to be preprocessed on each epoch, which may be more - expensive. 
Preprocessing is overlapped with training computation, but - overall training throughput may still decrease if preprocessing is more - expensive than the training computation (forward pass, backward pass, - gradient sync). + .. tab-item:: Streaming Ingest (experimental) - To enable this mode, use the :py:meth:`max_object_store_memory_fraction - ` argument. This argument defaults to -1, - meaning that bulk ingest should be used and the entire dataset should be - computed and cached before training starts. + In streaming ingest mode, instead of loading the entire dataset into the + Ray object store at once, AIR will load a fraction of the dataset at a + time. This can be desirable when the dataset is very large, and caching it + all at once would cause expensive disk spilling. The downside is that the + dataset will have to be preprocessed on each epoch, which may be more + expensive. Preprocessing is overlapped with training computation, but + overall training throughput may still decrease if preprocessing is more + expensive than the training computation (forward pass, backward pass, + gradient sync). - Use a float value 0 or greater to indicate the "window" size, i.e. the - maximum fraction of object store memory that should be used at once. A - reasonable value is 0.2, meaning 20% of available object store memory. - Larger window sizes can improve performance by increasing parallelism. A - window size of 1 or greater will likely result in spilling. + To enable this mode, use the :py:meth:`max_object_store_memory_fraction + ` argument. This argument defaults to -1, + meaning that bulk ingest should be used and the entire dataset should be + computed and cached before training starts. - .. literalinclude:: doc_code/air_ingest.py - :language: python - :start-after: __config_5__ - :end-before: __config_5_end__ + Use a float value 0 or greater to indicate the "window" size, i.e. the + maximum fraction of object store memory that should be used at once. 
A + reasonable value is 0.2, meaning 20% of available object store memory. + Larger window sizes can improve performance by increasing parallelism. A + window size of 1 or greater will likely result in spilling. - Use streaming ingest when: + .. literalinclude:: doc_code/air_ingest.py + :language: python + :start-after: __config_5__ + :end-before: __config_5_end__ - * you have large datasets that don't fit into memory; and - * re-executing the preprocessing step on each epoch is faster than caching the preprocessed dataset on disk and reloading from disk on each epoch + Use streaming ingest when: - Note that this feature is experimental and the actual object store memory - usage may vary. Please file a `GitHub issue `_ if you run into problems. + * you have large datasets that don't fit into memory; and + * re-executing the preprocessing step on each epoch is faster than caching the preprocessed dataset on disk and reloading from disk on each epoch + + Note that this feature is experimental and the actual object store memory + usage may vary. Please file a `GitHub issue `_ if you run into problems. .. _air-shuffle: @@ -138,50 +140,52 @@ By default, AIR shuffles the assignment of data blocks (files) to dataset shards To randomize data records within a file, perform a local or global shuffle. -.. tabbed:: Local Shuffling +.. tab-set:: + + .. tab-item:: Local Shuffling - Local shuffling is the recommended approach for randomizing data order. To use local shuffle, - simply specify a non-zero ``local_shuffle_buffer_size`` as an argument to :meth:`~ray.data.DataIterator.iter_batches`. - The iterator will then use a local buffer of the given size to randomize record order. The - larger the buffer size, the more randomization will be applied, but it will also use more - memory. + Local shuffling is the recommended approach for randomizing data order. 
To use local shuffle, + simply specify a non-zero ``local_shuffle_buffer_size`` as an argument to :meth:`~ray.data.DataIterator.iter_batches`. + The iterator will then use a local buffer of the given size to randomize record order. The + larger the buffer size, the more randomization will be applied, but it will also use more + memory. - See :meth:`~ray.data.DataIterator.iter_batches` for more details. + See :meth:`~ray.data.DataIterator.iter_batches` for more details. - .. literalinclude:: doc_code/air_ingest.py - :language: python - :start-after: __local_shuffling_start__ - :end-before: __local_shuffling_end__ + .. literalinclude:: doc_code/air_ingest.py + :language: python + :start-after: __local_shuffling_start__ + :end-before: __local_shuffling_end__ - You should use local shuffling when: + You should use local shuffling when: - * a small in-memory buffer provides enough randomization; or - * you want the highest possible ingest performance; or - * your model is not overly sensitive to shuffle quality + * a small in-memory buffer provides enough randomization; or + * you want the highest possible ingest performance; or + * your model is not overly sensitive to shuffle quality -.. tabbed:: Global Shuffling (slower) + .. tab-item:: Global Shuffling (slower) - Global shuffling provides more uniformly random (decorrelated) samples and is carried - out via a distributed map-reduce operation. This higher quality shuffle can often lead - to more precision gain per training step, but it is also an expensive distributed - operation and will decrease the ingest throughput. The shuffle step is overlapped with - training computation, so as long as the shuffled ingest throughput matches - or exceeds the model training (forward pass, backward pass, gradient sync) - throughput, this higher-quality shuffle shouldn't slow down the overall - training. 
+ Global shuffling provides more uniformly random (decorrelated) samples and is carried + out via a distributed map-reduce operation. This higher quality shuffle can often lead + to more precision gain per training step, but it is also an expensive distributed + operation and will decrease the ingest throughput. The shuffle step is overlapped with + training computation, so as long as the shuffled ingest throughput matches + or exceeds the model training (forward pass, backward pass, gradient sync) + throughput, this higher-quality shuffle shouldn't slow down the overall + training. - If global shuffling *is* causing the ingest throughput to become the training - bottleneck, local shuffling may be a better option. + If global shuffling *is* causing the ingest throughput to become the training + bottleneck, local shuffling may be a better option. - .. literalinclude:: doc_code/air_ingest.py - :language: python - :start-after: __global_shuffling_start__ - :end-before: __global_shuffling_end__ + .. literalinclude:: doc_code/air_ingest.py + :language: python + :start-after: __global_shuffling_start__ + :end-before: __global_shuffling_end__ - You should use global shuffling when: + You should use global shuffling when: - * you suspect high-quality shuffles may significantly improve model quality; and - * absolute ingest performance is less of a concern + * you suspect high-quality shuffles may significantly improve model quality; and + * absolute ingest performance is less of a concern .. _air-per-epoch-preprocessing: @@ -240,43 +244,45 @@ Dataset Resources Datasets uses Ray tasks to execute data processing operations. These tasks use CPU resources in the cluster during execution, which may compete with resources needed for Training. -.. tabbed:: Unreserved CPUs +.. tab-set:: + + .. tab-item:: Unreserved CPUs - By default, Dataset tasks use cluster CPU resources for execution. This can sometimes - conflict with Trainer resource requests. 
For example, if Trainers allocate all CPU resources - in the cluster, then no Datasets tasks can run. + By default, Dataset tasks use cluster CPU resources for execution. This can sometimes + conflict with Trainer resource requests. For example, if Trainers allocate all CPU resources + in the cluster, then no Datasets tasks can run. - .. literalinclude:: ./doc_code/air_ingest.py - :language: python - :start-after: __resource_allocation_1_begin__ - :end-before: __resource_allocation_1_end__ + .. literalinclude:: ./doc_code/air_ingest.py + :language: python + :start-after: __resource_allocation_1_begin__ + :end-before: __resource_allocation_1_end__ - Unreserved CPUs work well when: + Unreserved CPUs work well when: - * you are running only one Trainer and the cluster has enough CPUs; or - * your Trainers are configured to use GPUs and not CPUs + * you are running only one Trainer and the cluster has enough CPUs; or + * your Trainers are configured to use GPUs and not CPUs -.. tabbed:: Using Reserved CPUs (experimental) + .. tab-item:: Using Reserved CPUs (experimental) - The ``_max_cpu_fraction_per_node`` option can be used to exclude CPUs from placement - group scheduling. In the below example, setting this parameter to ``0.8`` enables Tune - trials to run smoothly without risk of deadlock by reserving 20% of node CPUs for - Dataset execution. + The ``_max_cpu_fraction_per_node`` option can be used to exclude CPUs from placement + group scheduling. In the below example, setting this parameter to ``0.8`` enables Tune + trials to run smoothly without risk of deadlock by reserving 20% of node CPUs for + Dataset execution. - .. literalinclude:: ./doc_code/air_ingest.py - :language: python - :start-after: __resource_allocation_2_begin__ - :end-before: __resource_allocation_2_end__ + .. 
literalinclude:: ./doc_code/air_ingest.py + :language: python + :start-after: __resource_allocation_2_begin__ + :end-before: __resource_allocation_2_end__ - You should use reserved CPUs when: + You should use reserved CPUs when: - * you are running multiple concurrent CPU Trainers using Tune; or - * you want to ensure predictable Datasets performance + * you are running multiple concurrent CPU Trainers using Tune; or + * you want to ensure predictable Datasets performance - .. warning:: + .. warning:: - ``_max_cpu_fraction_per_node`` is experimental and not currently recommended for use with - autoscaling clusters (scale-up will not trigger properly). + ``_max_cpu_fraction_per_node`` is experimental and not currently recommended for use with + autoscaling clusters (scale-up will not trigger properly). Debugging Ingest with the ``DummyTrainer`` ------------------------------------------ @@ -426,9 +432,9 @@ FAQ How do I pass in a :py:class:`~ray.data.dataset_pipeline.DatasetPipeline` to my ``Trainer``? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Trainer interface only accepts a standard :py:class:`~ray.data.Datastream` and not a :py:class:`~ray.data.dataset_pipeline.DatasetPipeline`. +The Trainer interface only accepts a standard :py:class:`~ray.data.Dataset` and not a :py:class:`~ray.data.dataset_pipeline.DatasetPipeline`. Instead, you can configure the ingest via the ``dataset_config`` that is passed to your ``Trainer``. Internally, Ray AIR will -convert the provided :py:class:`~ray.data.Datastream` into a :py:class:`~ray.data.dataset_pipeline.DatasetPipeline` with the specified configurations. +convert the provided :py:class:`~ray.data.Dataset` into a :py:class:`~ray.data.dataset_pipeline.DatasetPipeline` with the specified configurations. See the :ref:`Enabling Streaming Ingest ` and :ref:`Shuffling Data ` sections for full examples. 
diff --git a/doc/source/ray-air/computer-vision.rst b/doc/source/ray-air/computer-vision.rst index ea35dcf4438c..64542835e6d5 100644 --- a/doc/source/ray-air/computer-vision.rst +++ b/doc/source/ray-air/computer-vision.rst @@ -14,107 +14,109 @@ This guide explains how to perform common computer vision tasks like: Reading image data ------------------ -.. tabbed:: Raw images +.. tab-set:: - Datasets like ImageNet store files like this: + .. tab-item:: Raw images - .. code-block:: + Datasets like ImageNet store files like this: - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png + .. code-block:: - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png - To load images stored in this layout, read the raw images and include the - class names. + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_images1_start__ - :end-before: __read_images1_stop__ - :dedent: + To load images stored in this layout, read the raw images and include the + class names. - Then, apply a :ref:`user-defined function ` to - encode the class names as integer targets. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_images1_start__ + :end-before: __read_images1_stop__ + :dedent: - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_images2_start__ - :end-before: __read_images2_stop__ - :dedent: + Then, apply a :ref:`user-defined function ` to + encode the class names as integer targets. - .. tip:: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_images2_start__ + :end-before: __read_images2_stop__ + :dedent: - You can also use :class:`~ray.data.preprocessors.LabelEncoder` to encode labels. + .. tip:: -.. tabbed:: NumPy + You can also use :class:`~ray.data.preprocessors.LabelEncoder` to encode labels. 
- To load NumPy arrays into a :class:`~ray.data.Datastream`, separately read the image and label arrays. + .. tab-item:: NumPy - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_numpy1_start__ - :end-before: __read_numpy1_stop__ - :dedent: + To load NumPy arrays into a :class:`~ray.data.Dataset`, separately read the image and label arrays. - Then, combine the datasets and rename the columns. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_numpy1_start__ + :end-before: __read_numpy1_stop__ + :dedent: - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_numpy2_start__ - :end-before: __read_numpy2_stop__ - :dedent: + Then, combine the datasets and rename the columns. -.. tabbed:: TFRecords + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_numpy2_start__ + :end-before: __read_numpy2_stop__ + :dedent: - Image datasets often contain ``tf.train.Example`` messages that look like this: + .. tab-item:: TFRecords - .. code-block:: + Image datasets often contain ``tf.train.Example`` messages that look like this: - features { - feature { - key: "image" - value { - bytes_list { - value: ... # Raw image bytes + .. code-block:: + + features { + feature { + key: "image" + value { + bytes_list { + value: ... # Raw image bytes + } } } - } - feature { - key: "label" - value { - int64_list { - value: 3 + feature { + key: "label" + value { + int64_list { + value: 3 + } } } } - } - To load examples stored in this format, read the TFRecords into a :class:`~ray.data.Datastream`. + To load examples stored in this format, read the TFRecords into a :class:`~ray.data.Dataset`. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_tfrecords1_start__ - :end-before: __read_tfrecords1_stop__ - :dedent: + .. 
literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_tfrecords1_start__ + :end-before: __read_tfrecords1_stop__ + :dedent: - Then, apply a :ref:`user-defined function ` to - decode the raw image bytes. + Then, apply a :ref:`user-defined function ` to + decode the raw image bytes. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_tfrecords2_start__ - :end-before: __read_tfrecords2_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_tfrecords2_start__ + :end-before: __read_tfrecords2_stop__ + :dedent: -.. tabbed:: Parquet + .. tab-item:: Parquet - To load image data stored in Parquet files, call :func:`ray.data.read_parquet`. + To load image data stored in Parquet files, call :func:`ray.data.read_parquet`. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __read_parquet_start__ - :end-before: __read_parquet_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __read_parquet_start__ + :end-before: __read_parquet_stop__ + :dedent: -For more information on creating datasets, see :ref:`Creating Datasets `. +For more information on creating datasets, see :ref:`Loading Data `. Transforming images @@ -123,81 +125,85 @@ Transforming images To transform images, create a :class:`~ray.data.preprocessor.Preprocessor`. They're the standard way to preprocess data with Ray. -.. tabbed:: Torch +.. tab-set:: + + .. tab-item:: Torch - To apply TorchVision transforms, create a :class:`~ray.data.preprocessors.TorchVisionPreprocessor`. + To apply TorchVision transforms, create a :class:`~ray.data.preprocessors.TorchVisionPreprocessor`. - Create two :class:`TorchVisionPreprocessors ` - -- one to normalize images, and another to augment images. Later, you'll pass the preprocessors to :class:`Trainers `, - :class:`Predictors `, and - :class:`PredictorDeployments `. 
+ Create two :class:`TorchVisionPreprocessors ` + -- one to normalize images, and another to augment images. Later, you'll pass the preprocessors to :class:`Trainers `, + :class:`Predictors `, and + :class:`PredictorDeployments `. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_preprocessors_start__ - :end-before: __torch_preprocessors_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_preprocessors_start__ + :end-before: __torch_preprocessors_stop__ + :dedent: -.. tabbed:: TensorFlow + .. tab-item:: TensorFlow - To apply TorchVision transforms, create a :class:`~ray.data.preprocessors.BatchMapper`. + To apply TorchVision transforms, create a :class:`~ray.data.preprocessors.BatchMapper`. - Create two :class:`~ray.data.preprocessors.BatchMapper` -- one to normalize images, and another to - augment images. Later, you'll pass the preprocessors to :class:`Trainers `, - :class:`Predictors `, and - :class:`PredictorDeployments `. + Create two :class:`~ray.data.preprocessors.BatchMapper` -- one to normalize images, and another to + augment images. Later, you'll pass the preprocessors to :class:`Trainers `, + :class:`Predictors `, and + :class:`PredictorDeployments `. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_preprocessors_start__ - :end-before: __tensorflow_preprocessors_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_preprocessors_start__ + :end-before: __tensorflow_preprocessors_stop__ + :dedent: For more information on transforming data, see :ref:`Using Preprocessors ` and -:ref:`Transforming Datasets `. +:ref:`Transforming Data `. Training vision models ---------------------- :class:`Trainers ` let you train models in parallel. -.. tabbed:: Torch +.. tab-set:: - To train a vision model, define the training loop per worker. + .. tab-item:: Torch - .. 
literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_training_loop_start__ - :end-before: __torch_training_loop_stop__ - :dedent: + To train a vision model, define the training loop per worker. - Then, create a :class:`~ray.train.torch.TorchTrainer` and call - :meth:`~ray.train.torch.TorchTrainer.fit`. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_training_loop_start__ + :end-before: __torch_training_loop_stop__ + :dedent: - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_trainer_start__ - :end-before: __torch_trainer_stop__ - :dedent: + Then, create a :class:`~ray.train.torch.TorchTrainer` and call + :meth:`~ray.train.torch.TorchTrainer.fit`. - For more in-depth examples, read :doc:`/ray-air/examples/torch_image_example` and - :ref:`Using Trainers `. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_trainer_start__ + :end-before: __torch_trainer_stop__ + :dedent: -.. tabbed:: TensorFlow + For more in-depth examples, read :doc:`/ray-air/examples/torch_image_example` and + :ref:`Using Trainers `. - To train a vision model, define the training loop per worker. + .. tab-item:: TensorFlow - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_training_loop_start__ - :end-before: __tensorflow_training_loop_stop__ - :dedent: + To train a vision model, define the training loop per worker. - Then, create a :class:`~ray.train.tensorflow.TensorflowTrainer` and call - :meth:`~ray.train.tensorflow.TensorflowTrainer.fit`. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_training_loop_start__ + :end-before: __tensorflow_training_loop_stop__ + :dedent: - .. 
literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_trainer_start__ - :end-before: __tensorflow_trainer_stop__ - :dedent: + Then, create a :class:`~ray.train.tensorflow.TensorflowTrainer` and call + :meth:`~ray.train.tensorflow.TensorflowTrainer.fit`. - For more information, read :ref:`Using Trainers `. + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_trainer_start__ + :end-before: __tensorflow_trainer_stop__ + :dedent: + + For more information, read :ref:`Using Trainers `. Creating checkpoints -------------------- @@ -210,27 +216,29 @@ If you're going from training to prediction, don't create a new checkpoint. :class:`~ray.air.result.Result` object. Use :attr:`Result.checkpoint ` instead. -.. tabbed:: Torch +.. tab-set:: + + .. tab-item:: Torch - To create a :class:`~ray.train.torch.TorchCheckpoint`, pass a Torch model and - the :class:`~ray.data.preprocessor.Preprocessor` you created in `Transforming images`_ - to :meth:`TorchCheckpoint.from_model() `. + To create a :class:`~ray.train.torch.TorchCheckpoint`, pass a Torch model and + the :class:`~ray.data.preprocessor.Preprocessor` you created in `Transforming images`_ + to :meth:`TorchCheckpoint.from_model() `. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_checkpoint_start__ - :end-before: __torch_checkpoint_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_checkpoint_start__ + :end-before: __torch_checkpoint_stop__ + :dedent: -.. tabbed:: TensorFlow + .. tab-item:: TensorFlow - To create a :class:`~ray.train.tensorflow.TensorflowCheckpoint`, pass a TensorFlow model and - the :class:`~ray.data.preprocessor.Preprocessor` you created in `Transforming images`_ - to :meth:`TensorflowCheckpoint.from_model() `. 
+ To create a :class:`~ray.train.tensorflow.TensorflowCheckpoint`, pass a TensorFlow model and + the :class:`~ray.data.preprocessor.Preprocessor` you created in `Transforming images`_ + to :meth:`TensorflowCheckpoint.from_model() `. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_checkpoint_start__ - :end-before: __tensorflow_checkpoint_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_checkpoint_start__ + :end-before: __tensorflow_checkpoint_stop__ + :dedent: Batch predicting images @@ -239,32 +247,34 @@ Batch predicting images :class:`~ray.train.batch_predictor.BatchPredictor` lets you perform inference on large image datasets. -.. tabbed:: Torch +.. tab-set:: + + .. tab-item:: Torch - To create a :class:`~ray.train.batch_predictor.BatchPredictor`, call - :meth:`BatchPredictor.from_checkpoint ` and pass the checkpoint - you created in `Creating checkpoints`_. + To create a :class:`~ray.train.batch_predictor.BatchPredictor`, call + :meth:`BatchPredictor.from_checkpoint ` and pass the checkpoint + you created in `Creating checkpoints`_. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_batch_predictor_start__ - :end-before: __torch_batch_predictor_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_batch_predictor_start__ + :end-before: __torch_batch_predictor_stop__ + :dedent: - For more in-depth examples, read :doc:`/ray-air/examples/pytorch_resnet_batch_prediction` - and :ref:`Using Predictors for Inference `. + For more in-depth examples, read :doc:`/ray-air/examples/pytorch_resnet_batch_prediction` + and :ref:`Using Predictors for Inference `. -.. tabbed:: TensorFlow + .. tab-item:: TensorFlow - To create a :class:`~ray.train.batch_predictor.BatchPredictor`, call - :meth:`BatchPredictor.from_checkpoint ` and pass the checkpoint - you created in `Creating checkpoints`_. 
+ To create a :class:`~ray.train.batch_predictor.BatchPredictor`, call + :meth:`BatchPredictor.from_checkpoint ` and pass the checkpoint + you created in `Creating checkpoints`_. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_batch_predictor_start__ - :end-before: __tensorflow_batch_predictor_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_batch_predictor_start__ + :end-before: __tensorflow_batch_predictor_stop__ + :dedent: - For more information, read :ref:`Using Predictors for Inference `. + For more information, read :ref:`Using Predictors for Inference `. Serving vision models --------------------- @@ -286,44 +296,45 @@ To NumPy ndarrays like this: array([[1., 2.], [3., 4.]]) +.. tab-set:: -.. tabbed:: Torch + .. tab-item:: Torch - To deploy a Torch model to an endpoint, pass the checkpoint you created in `Creating checkpoints`_ - to :meth:`PredictorDeployment.bind ` and specify - :func:`~ray.serve.http_adapters.json_to_ndarray` as the HTTP adapter. + To deploy a Torch model to an endpoint, pass the checkpoint you created in `Creating checkpoints`_ + to :meth:`PredictorDeployment.bind ` and specify + :func:`~ray.serve.http_adapters.json_to_ndarray` as the HTTP adapter. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_serve_start__ - :end-before: __torch_serve_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_serve_start__ + :end-before: __torch_serve_stop__ + :dedent: - Then, make a request to classify an image. + Then, make a request to classify an image. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __torch_online_predict_start__ - :end-before: __torch_online_predict_stop__ - :dedent: + .. 
literalinclude:: ./doc_code/computer_vision.py + :start-after: __torch_online_predict_start__ + :end-before: __torch_online_predict_stop__ + :dedent: - For more in-depth examples, read :doc:`/ray-air/examples/torch_image_example` - and :doc:`/ray-air/examples/serving_guide`. + For more in-depth examples, read :doc:`/ray-air/examples/torch_image_example` + and :doc:`/ray-air/examples/serving_guide`. -.. tabbed:: TensorFlow + .. tab-item:: TensorFlow - To deploy a TensorFlow model to an endpoint, pass the checkpoint you created in `Creating checkpoints`_ - to :meth:`PredictorDeployment.bind ` and specify - :func:`~ray.serve.http_adapters.json_to_multi_ndarray` as the HTTP adapter. + To deploy a TensorFlow model to an endpoint, pass the checkpoint you created in `Creating checkpoints`_ + to :meth:`PredictorDeployment.bind ` and specify + :func:`~ray.serve.http_adapters.json_to_multi_ndarray` as the HTTP adapter. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_serve_start__ - :end-before: __tensorflow_serve_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_serve_start__ + :end-before: __tensorflow_serve_stop__ + :dedent: - Then, make a request to classify an image. + Then, make a request to classify an image. - .. literalinclude:: ./doc_code/computer_vision.py - :start-after: __tensorflow_online_predict_start__ - :end-before: __tensorflow_online_predict_stop__ - :dedent: + .. literalinclude:: ./doc_code/computer_vision.py + :start-after: __tensorflow_online_predict_start__ + :end-before: __tensorflow_online_predict_stop__ + :dedent: - For more information, read :doc:`/ray-air/examples/serving_guide`. + For more information, read :doc:`/ray-air/examples/serving_guide`. 
diff --git a/doc/source/ray-air/doc_code/accelerate_trainer.py b/doc/source/ray-air/doc_code/accelerate_trainer.py index ccefa8776ae8..ebd4c148817b 100644 --- a/doc/source/ray-air/doc_code/accelerate_trainer.py +++ b/doc/source/ray-air/doc_code/accelerate_trainer.py @@ -5,7 +5,7 @@ import ray from ray.air import session, Checkpoint -from ray.train.huggingface.accelerate import AccelerateTrainer +from ray.train.hf_accelerate import AccelerateTrainer from ray.air.config import ScalingConfig diff --git a/doc/source/ray-air/doc_code/computer_vision.py b/doc/source/ray-air/doc_code/computer_vision.py index d409103154c7..e2c2905786f3 100644 --- a/doc/source/ray-air/doc_code/computer_vision.py +++ b/doc/source/ray-air/doc_code/computer_vision.py @@ -73,9 +73,8 @@ def read_numpy(): # __read_numpy2_start__ dataset = images.zip(labels) dataset = dataset.map_batches( - lambda batch: batch.rename( - columns={"__value__": "image", "__value___1": "label"} - ) + lambda batch: batch.rename(columns={"data": "image", "data_1": "label"}), + batch_format="pandas", ) # __read_numpy2_stop__ return dataset diff --git a/doc/source/ray-air/doc_code/hf_trainer.py b/doc/source/ray-air/doc_code/hf_trainer.py index 1d81d36dc35c..36b3b23164f1 100644 --- a/doc/source/ray-air/doc_code/hf_trainer.py +++ b/doc/source/ray-air/doc_code/hf_trainer.py @@ -9,7 +9,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer import ray -from ray.train.huggingface import HuggingFaceTrainer +from ray.train.hf_transformers import TransformersTrainer from ray.air.config import ScalingConfig @@ -81,7 +81,7 @@ def trainer_init_per_worker(train_dataset, eval_dataset, **config): scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu) -trainer = HuggingFaceTrainer( +trainer = TransformersTrainer( trainer_init_per_worker=trainer_init_per_worker, scaling_config=scaling_config, datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds}, diff --git 
a/doc/source/ray-air/doc_code/preprocessors.py b/doc/source/ray-air/doc_code/preprocessors.py index 3cdc2e4b7bc7..acfad704862f 100644 --- a/doc/source/ray-air/doc_code/preprocessors.py +++ b/doc/source/ray-air/doc_code/preprocessors.py @@ -8,36 +8,36 @@ from ray.data.preprocessors.scaler import StandardScaler # Generate two simple datasets. -dataset = ray.data.range_table(8) +dataset = ray.data.range(8) dataset1, dataset2 = dataset.split(2) print(dataset1.take()) -# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}] +# [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}] print(dataset2.take()) -# [{'value': 4}, {'value': 5}, {'value': 6}, {'value': 7}] +# [{'id': 4}, {'id': 5}, {'id': 6}, {'id': 7}] # __preprocessor_setup_end__ # __preprocessor_fit_transform_start__ # Fit the preprocessor on dataset1, and transform both dataset1 and dataset2. -preprocessor = MinMaxScaler(["value"]) +preprocessor = MinMaxScaler(["id"]) dataset1_transformed = preprocessor.fit_transform(dataset1) print(dataset1_transformed.take()) -# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}] +# [{'id': 0.0}, {'id': 0.3333333333333333}, {'id': 0.6666666666666666}, {'id': 1.0}] dataset2_transformed = preprocessor.transform(dataset2) print(dataset2_transformed.take()) -# [{'value': 1.3333333333333333}, {'value': 1.6666666666666667}, {'value': 2.0}, {'value': 2.3333333333333335}] +# [{'id': 1.3333333333333333}, {'id': 1.6666666666666667}, {'id': 2.0}, {'id': 2.3333333333333335}] # __preprocessor_fit_transform_end__ # __preprocessor_transform_batch_start__ -batch = pd.DataFrame({"value": list(range(8, 12))}) +batch = pd.DataFrame({"id": list(range(8, 12))}) batch_transformed = preprocessor.transform_batch(batch) print(batch_transformed) -# value +# id # 0 2.666667 # 1 3.000000 # 2 3.333333 @@ -110,16 +110,16 @@ # Generate one simple dataset. 
dataset = ray.data.from_items( - [{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}, {"value": None}] + [{"id": 0}, {"id": 1}, {"id": 2}, {"id": 3}, {"id": None}] ) print(dataset.take()) -# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': None}] +# [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': None}] -preprocessor = Chain(SimpleImputer(["value"]), MinMaxScaler(["value"])) +preprocessor = Chain(SimpleImputer(["id"]), MinMaxScaler(["id"])) dataset_transformed = preprocessor.fit_transform(dataset) print(dataset_transformed.take()) -# [{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}, {'value': 0.5}] +# [{'id': 0.0}, {'id': 0.3333333333333333}, {'id': 0.6666666666666666}, {'id': 1.0}, {'id': 0.5}] # __chain_end__ @@ -128,15 +128,15 @@ from ray.data.preprocessors import BatchMapper # Generate a simple dataset. -dataset = ray.data.range_table(4) +dataset = ray.data.range(4) print(dataset.take()) -# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}] +# [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}] -# Create a stateless preprocess that multiplies values by 2. +# Create a stateless preprocess that multiplies ids by 2. preprocessor = BatchMapper(lambda df: df * 2, batch_size=2, batch_format="pandas") dataset_transformed = preprocessor.transform(dataset) print(dataset_transformed.take()) -# [{'value': 0}, {'value': 2}, {'value': 4}, {'value': 6}] +# [{'id': 0}, {'id': 2}, {'id': 4}, {'id': 6}] # __custom_stateless_end__ @@ -151,22 +151,22 @@ class CustomPreprocessor(Preprocessor): def _fit(self, dataset: Dataset) -> Preprocessor: - self.stats_ = dataset.aggregate(Max("value")) + self.stats_ = dataset.aggregate(Max("id")) def _transform_pandas(self, df: DataFrame) -> DataFrame: - return df * self.stats_["max(value)"] + return df * self.stats_["max(id)"] # Generate a simple dataset. 
-dataset = ray.data.range_table(4) +dataset = ray.data.range(4) print(dataset.take()) -# [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}] +# [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}] -# Create a stateful preprocessor that finds the max value and scales each value by it. +# Create a stateful preprocessor that finds the max id and scales each id by it. preprocessor = CustomPreprocessor() dataset_transformed = preprocessor.fit_transform(dataset) print(dataset_transformed.take()) -# [{'value': 0}, {'value': 3}, {'value': 6}, {'value': 9}] +# [{'id': 0}, {'id': 3}, {'id': 6}, {'id': 9}] # __custom_stateful_end__ @@ -174,14 +174,14 @@ def _transform_pandas(self, df: DataFrame) -> DataFrame: from ray.data.preprocessors import SimpleImputer # Generate a simple dataset. -dataset = ray.data.from_items([{"value": 1.0}, {"value": None}, {"value": 3.0}]) +dataset = ray.data.from_items([{"id": 1.0}, {"id": None}, {"id": 3.0}]) print(dataset.take()) -# [{'value': 1.0}, {'value': None}, {'value': 3.0}] +# [{'id': 1.0}, {'id': None}, {'id': 3.0}] -imputer = SimpleImputer(columns=["value"], strategy="mean") +imputer = SimpleImputer(columns=["id"], strategy="mean") dataset_transformed = imputer.fit_transform(dataset) print(dataset_transformed.take()) -# [{'value': 1.0}, {'value': 2.0}, {'value': 3.0}] +# [{'id': 1.0}, {'id': 2.0}, {'id': 3.0}] # __simple_imputer_end__ diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD index ff2f4930cdb5..1fcc19fd24b7 100644 --- a/doc/source/ray-air/examples/BUILD +++ b/doc/source/ray-air/examples/BUILD @@ -51,6 +51,8 @@ py_test_run_all_notebooks( "gptj_serving.ipynb", # Requires GPUs "stablediffusion_batch_prediction.ipynb", # Requires GPUs "gptj_deepspeed_fine_tuning.ipynb", # Requires release test + "opt_deepspeed_batch_inference.ipynb", # Requires release test + "dolly_lightning_fsdp_finetuning.ipynb", # Requires release test ], data = ["//doc/source/ray-air/examples:air_examples"], tags = ["exclusive", 
"team:ml", "ray_air"], diff --git a/doc/source/ray-air/examples/analyze_tuning_results.ipynb b/doc/source/ray-air/examples/analyze_tuning_results.ipynb index 038b9c77f9d3..a38261c0a4de 100644 --- a/doc/source/ray-air/examples/analyze_tuning_results.ipynb +++ b/doc/source/ray-air/examples/analyze_tuning_results.ipynb @@ -21,7 +21,7 @@ "id": "41abda7b", "metadata": {}, "source": [ - "We'll use the [Covertype dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn-datasets-fetch-covtype) provided from sklearn to train a multiclass classification task using XGBoost.\n", + "We'll use the [Covertype dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html#sklearn-datasets-fetch-covtype) provided from sklearn to train a multiclass classification task using XGBoost.\n", "\n", "In this dataset, we try to predict the forst cover type (e.g. \"lodgehole pine\") from cartographic variables, like the distance to the closest road, or the hillshade at different times of the day. 
The features are binary, discrete and continuous and thus well suited for a decision-tree based classification task.\n", "\n", @@ -42,13 +42,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\r\n", - "You should consider upgrading via the '/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 -m pip install --upgrade pip' command.\u001b[0m\r\n" + "\u001B[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\r\n", + "You should consider upgrading via the '/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 -m pip install --upgrade pip' command.\u001B[0m\r\n" ] } ], "source": [ - "!pip install -q \"ray[air]\" sklearn" + "!pip install -q \"ray[air]\" scikit-learn" ] }, { @@ -82,7 +82,7 @@ "id": "a93b242c", "metadata": {}, "source": [ - "We'll define a utility function to create a Ray Datastream from the Sklearn dataset. We expect the target column to be in the dataframe, so we'll add it to the dataframe manually." + "We'll define a utility function to create a Dataset from the Sklearn dataset. We expect the target column to be in the dataframe, so we'll add it to the dataframe manually." 
] }, { @@ -95,12 +95,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-13 12:31:51,444\tINFO services.py:1484 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-05-13 12:31:51,444\tINFO services.py:1484 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265\u001B[39m\u001B[22m\n" ] } ], "source": [ - "def get_training_data() -> ray.data.Datastream:\n", + "def get_training_data() -> ray.data.Dataset:\n", " data_raw = fetch_covtype()\n", " df = pd.DataFrame(data_raw[\"data\"], columns=data_raw[\"feature_names\"])\n", " df[\"target\"] = data_raw[\"target\"]\n", @@ -128,7 +128,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Datastream(num_blocks=1, num_rows=581012, schema={Elevation: float64, Aspect: float64, Slope: float64, Horizontal_Distance_To_Hydrology: float64, Vertical_Distance_To_Hydrology: float64, Horizontal_Distance_To_Roadways: float64, Hillshade_9am: float64, Hillshade_Noon: float64, Hillshade_3pm: float64, Horizontal_Distance_To_Fire_Points: float64, Wilderness_Area_0: float64, Wilderness_Area_1: float64, Wilderness_Area_2: float64, Wilderness_Area_3: float64, Soil_Type_0: float64, Soil_Type_1: float64, Soil_Type_2: float64, Soil_Type_3: float64, Soil_Type_4: float64, Soil_Type_5: float64, Soil_Type_6: float64, Soil_Type_7: float64, Soil_Type_8: float64, Soil_Type_9: float64, Soil_Type_10: float64, Soil_Type_11: float64, Soil_Type_12: float64, Soil_Type_13: float64, Soil_Type_14: float64, Soil_Type_15: float64, Soil_Type_16: float64, Soil_Type_17: float64, Soil_Type_18: float64, Soil_Type_19: float64, Soil_Type_20: float64, Soil_Type_21: float64, Soil_Type_22: float64, Soil_Type_23: float64, Soil_Type_24: float64, Soil_Type_25: float64, Soil_Type_26: float64, Soil_Type_27: float64, Soil_Type_28: float64, Soil_Type_29: float64, Soil_Type_30: float64, Soil_Type_31: float64, Soil_Type_32: float64, Soil_Type_33: float64, Soil_Type_34: float64, Soil_Type_35: 
float64, Soil_Type_36: float64, Soil_Type_37: float64, Soil_Type_38: float64, Soil_Type_39: float64, target: int32})\n" + "Dataset(num_blocks=1, num_rows=581012, schema={Elevation: float64, Aspect: float64, Slope: float64, Horizontal_Distance_To_Hydrology: float64, Vertical_Distance_To_Hydrology: float64, Horizontal_Distance_To_Roadways: float64, Hillshade_9am: float64, Hillshade_Noon: float64, Hillshade_3pm: float64, Horizontal_Distance_To_Fire_Points: float64, Wilderness_Area_0: float64, Wilderness_Area_1: float64, Wilderness_Area_2: float64, Wilderness_Area_3: float64, Soil_Type_0: float64, Soil_Type_1: float64, Soil_Type_2: float64, Soil_Type_3: float64, Soil_Type_4: float64, Soil_Type_5: float64, Soil_Type_6: float64, Soil_Type_7: float64, Soil_Type_8: float64, Soil_Type_9: float64, Soil_Type_10: float64, Soil_Type_11: float64, Soil_Type_12: float64, Soil_Type_13: float64, Soil_Type_14: float64, Soil_Type_15: float64, Soil_Type_16: float64, Soil_Type_17: float64, Soil_Type_18: float64, Soil_Type_19: float64, Soil_Type_20: float64, Soil_Type_21: float64, Soil_Type_22: float64, Soil_Type_23: float64, Soil_Type_24: float64, Soil_Type_25: float64, Soil_Type_26: float64, Soil_Type_27: float64, Soil_Type_28: float64, Soil_Type_29: float64, Soil_Type_30: float64, Soil_Type_31: float64, Soil_Type_32: float64, Soil_Type_33: float64, Soil_Type_34: float64, Soil_Type_35: float64, Soil_Type_36: float64, Soil_Type_37: float64, Soil_Type_38: float64, Soil_Type_39: float64, target: int32})\n" ] } ], @@ -256,81 +256,81 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:32:02,793\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). 
Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m 2022-05-13 12:32:05,102\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:32:05,204\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m 2022-05-13 12:32:05,338\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:32:07,164\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). 
Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:32:10,549\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62495)\u001b[0m [12:32:10] task [xgboost.ray]:6975277392 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62494)\u001b[0m [12:32:10] task [xgboost.ray]:4560390352 got new rank 0\n", - "\u001b[2m\u001b[36m(raylet)\u001b[0m Spilled 2173 MiB, 22 objects, write throughput 402 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m 2022-05-13 12:32:17,848\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62523)\u001b[0m [12:32:18] task [xgboost.ray]:4441524624 got new rank 0\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62524)\u001b[0m [12:32:18] task [xgboost.ray]:6890641808 got new rank 1\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:32:21,253\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:32:21,529\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62563)\u001b[0m [12:32:21] task [xgboost.ray]:4667801680 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62562)\u001b[0m [12:32:21] task [xgboost.ray]:6856360848 got new rank 0\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62530)\u001b[0m [12:32:21] task [xgboost.ray]:6971527824 got new rank 0\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62532)\u001b[0m [12:32:21] task [xgboost.ray]:4538321232 got new rank 1\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m 2022-05-13 12:32:21,937\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor 
pid=62544)\u001b[0m [12:32:21] task [xgboost.ray]:7005661840 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62543)\u001b[0m [12:32:21] task [xgboost.ray]:4516088080 got new rank 0\n", - "\u001b[2m\u001b[36m(raylet)\u001b[0m Spilled 4098 MiB, 83 objects, write throughput 347 MiB/s.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:32:41,289\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m 2022-05-13 12:32:48,617\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:32:52,110\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:32:52,448\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m 2022-05-13 12:32:52,692\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:33:11,960\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m 2022-05-13 12:33:19,076\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m 2022-05-13 12:33:23,409\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:33:23,420\tINFO main.py:1109 -- Training in progress (62 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:33:23,541\tINFO main.py:1109 -- Training in progress (62 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62463)\u001b[0m 2022-05-13 
12:33:23,693\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 78.74 seconds (65.79 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62464)\u001b[0m 2022-05-13 12:33:24,802\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 79.62 seconds (62.85 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62651)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m 2022-05-13 12:33:38,788\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62651)\u001b[0m 2022-05-13 12:33:38,766\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). 
Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:33:42,168\tINFO main.py:1109 -- Training in progress (92 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62456)\u001b[0m 2022-05-13 12:33:46,177\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 103.54 seconds (95.60 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62651)\u001b[0m 2022-05-13 12:33:51,825\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62670)\u001b[0m [12:33:51] task [xgboost.ray]:4623186960 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62669)\u001b[0m [12:33:51] task [xgboost.ray]:4707639376 got new rank 0\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m 2022-05-13 12:33:52,036\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62672)\u001b[0m [12:33:52] task [xgboost.ray]:4530073552 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62671)\u001b[0m [12:33:52] task [xgboost.ray]:6824757200 got new rank 0\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:33:54,229\tINFO main.py:1109 -- Training in progress (92 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:33:54,355\tINFO main.py:1109 -- Training in progress (93 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m 2022-05-13 12:34:04,708\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). 
Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62466)\u001b[0m 2022-05-13 12:34:11,126\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 126.08 seconds (109.48 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m 2022-05-13 12:34:15,175\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62753)\u001b[0m [12:34:15] task [xgboost.ray]:4468564048 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=62752)\u001b[0m [12:34:15] task [xgboost.ray]:6799468304 got new rank 0\n" + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:32:02,793\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m 2022-05-13 12:32:05,102\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:32:05,204\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m 2022-05-13 12:32:05,338\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:32:07,164\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:32:10,549\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62495)\u001B[0m [12:32:10] task [xgboost.ray]:6975277392 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62494)\u001B[0m [12:32:10] task [xgboost.ray]:4560390352 got new rank 0\n", + "\u001B[2m\u001B[36m(raylet)\u001B[0m Spilled 2173 MiB, 22 objects, write throughput 402 MiB/s. 
Set RAY_verbose_spill_logs=0 to disable this message.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m 2022-05-13 12:32:17,848\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62523)\u001B[0m [12:32:18] task [xgboost.ray]:4441524624 got new rank 0\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62524)\u001B[0m [12:32:18] task [xgboost.ray]:6890641808 got new rank 1\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:32:21,253\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:32:21,529\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62563)\u001B[0m [12:32:21] task [xgboost.ray]:4667801680 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62562)\u001B[0m [12:32:21] task [xgboost.ray]:6856360848 got new rank 0\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62530)\u001B[0m [12:32:21] task [xgboost.ray]:6971527824 got new rank 0\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62532)\u001B[0m [12:32:21] task [xgboost.ray]:4538321232 got new rank 1\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m 2022-05-13 12:32:21,937\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62544)\u001B[0m [12:32:21] task [xgboost.ray]:7005661840 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62543)\u001B[0m [12:32:21] task [xgboost.ray]:4516088080 got new rank 0\n", + "\u001B[2m\u001B[36m(raylet)\u001B[0m Spilled 4098 MiB, 83 objects, write throughput 347 MiB/s.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:32:41,289\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m 2022-05-13 
12:32:48,617\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:32:52,110\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:32:52,448\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m 2022-05-13 12:32:52,692\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:33:11,960\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m 2022-05-13 12:33:19,076\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m 2022-05-13 12:33:23,409\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:33:23,420\tINFO main.py:1109 -- Training in progress (62 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:33:23,541\tINFO main.py:1109 -- Training in progress (62 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62463)\u001B[0m 2022-05-13 12:33:23,693\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 78.74 seconds (65.79 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62464)\u001B[0m 2022-05-13 12:33:24,802\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 79.62 seconds (62.85 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62651)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m 2022-05-13 12:33:38,788\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62651)\u001B[0m 2022-05-13 12:33:38,766\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:33:42,168\tINFO main.py:1109 -- Training in progress (92 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62456)\u001B[0m 2022-05-13 12:33:46,177\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 103.54 seconds (95.60 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62651)\u001B[0m 2022-05-13 12:33:51,825\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62670)\u001B[0m [12:33:51] task [xgboost.ray]:4623186960 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62669)\u001B[0m [12:33:51] task [xgboost.ray]:4707639376 got new rank 0\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m 2022-05-13 12:33:52,036\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62672)\u001B[0m [12:33:52] task [xgboost.ray]:4530073552 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62671)\u001B[0m [12:33:52] task [xgboost.ray]:6824757200 got new rank 0\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:33:54,229\tINFO main.py:1109 -- Training in progress 
(92 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:33:54,355\tINFO main.py:1109 -- Training in progress (93 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m 2022-05-13 12:34:04,708\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62466)\u001B[0m 2022-05-13 12:34:11,126\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 126.08 seconds (109.48 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m 2022-05-13 12:34:15,175\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62753)\u001B[0m [12:34:15] task [xgboost.ray]:4468564048 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=62752)\u001B[0m [12:34:15] task [xgboost.ray]:6799468304 got new rank 0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m 2022-05-13 12:34:22,167\tINFO main.py:1109 -- Training in progress (30 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62651)\u001b[0m 2022-05-13 12:34:22,147\tINFO main.py:1109 -- Training in progress (30 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:34:24,646\tINFO main.py:1109 -- Training in progress (123 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62465)\u001b[0m 2022-05-13 12:34:24,745\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 137.75 seconds (123.36 pure 
XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62651)\u001b[0m 2022-05-13 12:34:40,173\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 61.63 seconds (48.34 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m 2022-05-13 12:34:45,745\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m 2022-05-13 12:34:52,543\tINFO main.py:1109 -- Training in progress (60 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62648)\u001b[0m 2022-05-13 12:35:14,888\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 96.35 seconds (82.83 pure XGBoost training time).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m 2022-05-13 12:35:16,197\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=62730)\u001b[0m 2022-05-13 12:35:33,441\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 88.89 seconds (78.26 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m 2022-05-13 12:34:22,167\tINFO main.py:1109 -- Training in progress (30 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62651)\u001B[0m 2022-05-13 12:34:22,147\tINFO main.py:1109 -- Training in progress (30 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:34:24,646\tINFO main.py:1109 -- Training in progress (123 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62465)\u001B[0m 2022-05-13 12:34:24,745\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 137.75 seconds (123.36 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable 
pid=62651)\u001B[0m 2022-05-13 12:34:40,173\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 61.63 seconds (48.34 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m 2022-05-13 12:34:45,745\tINFO main.py:1109 -- Training in progress (31 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m 2022-05-13 12:34:52,543\tINFO main.py:1109 -- Training in progress (60 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62648)\u001B[0m 2022-05-13 12:35:14,888\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 96.35 seconds (82.83 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m 2022-05-13 12:35:16,197\tINFO main.py:1109 -- Training in progress (61 seconds since last restart).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=62730)\u001B[0m 2022-05-13 12:35:33,441\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=581,012 in 88.89 seconds (78.26 pure XGBoost training time).\n", "2022-05-13 12:35:33,610\tINFO tune.py:753 -- Total run time: 218.52 seconds (217.48 seconds for the tuning loop).\n" ] } diff --git a/doc/source/ray-air/examples/batch_forecasting.ipynb b/doc/source/ray-air/examples/batch_forecasting.ipynb index 5b954a770caa..c8fbae69c9a5 100644 --- a/doc/source/ray-air/examples/batch_forecasting.ipynb +++ b/doc/source/ray-air/examples/batch_forecasting.ipynb @@ -53,9 +53,29 @@ "Prerequisite for this notebook: Read the [Key Concepts](tune-60-seconds) page for Ray Tune.\n", "```\n", "\n", - "Let us start by importing a few required libraries, including open-source [Ray](https://github.com/ray-project/ray) itself!" + "First, let's make sure we have all Python packages we need installed." 
] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install -q \"ray[air]\" scikit-learn prophet statsmodels" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Next, let's import a few required libraries, including open-source [Ray](https://github.com/ray-project/ray) itself!" + ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", "execution_count": 1, @@ -708,82 +728,82 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n" + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Initial log joint probability = -24.6903\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 56.7318. Improved by 81.4221.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 74.9096. Improved by 18.1778.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 116.738. 
Improved by 41.8283.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 121.485. Improved by 4.74745.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 123.373. Improved by 1.88806.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 6. Log joint probability = 123.877. Improved by 0.503922.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 124.063. Improved by 0.185315.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 124.083. Improved by 0.0205245.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 124.187. Improved by 0.103934.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 124.3. Improved by 0.11302.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 124.316. Improved by 0.0161654.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 12. Log joint probability = 124.375. Improved by 0.0588467.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 124.406. Improved by 0.0307753.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 124.414. Improved by 0.00790605.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 15. Log joint probability = 124.421. Improved by 0.00744155.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 124.428. Improved by 0.00688068.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 17. 
Log joint probability = 124.444. Improved by 0.0160026.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 124.45. Improved by 0.00550397.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 124.45. Improved by 0.000490096.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 124.45. Improved by 9.73771e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 124.456. Improved by 0.00539044.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 124.462. Improved by 0.00667823.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 124.464. Improved by 0.00138419.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 124.466. Improved by 0.00192804.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 124.47. Improved by 0.00406199.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 124.47. Improved by 0.000535657.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 124.471. Improved by 0.000549635.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 124.474. Improved by 0.00299757.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 124.475. Improved by 0.000802363.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 124.475. 
Improved by 0.000302488.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 124.476. Improved by 0.000657009.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 124.476. Improved by 5.99847e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 124.476. Improved by 9.36055e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 124.476. Improved by 0.000110802.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 124.476. Improved by 0.000323327.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 124.476. Improved by 0.000124956.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 124.476. Improved by 1.69834e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 124.476. Improved by 2.1557e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 124.476. Improved by 2.41295e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 124.476. Improved by 7.22567e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 124.476. Improved by 4.47652e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 124.476. Improved by 7.65725e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 124.476. 
Improved by 3.42432e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 124.476. Improved by 3.72182e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 124.476. Improved by 3.8856e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 124.476. Improved by 6.05641e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 124.476. Improved by 9.84136e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 124.476. Improved by 6.66388e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 124.476. Improved by 1.34989e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 124.476. Improved by 7.44078e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 124.476. Improved by 5.28681e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 124.476. Improved by 6.72879e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 124.476. Improved by 3.58152e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 124.476. Improved by 1.52185e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 124.476. Improved by 4.81723e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 124.476. 
Improved by 6.24187e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 124.476. Improved by 1.10699e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 124.476. Improved by 3.56434e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 124.476. Improved by 7.01115e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 124.476. Improved by 1.28068e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 124.476. Improved by 1.27551e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 124.476. Improved by 1.5548e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 124.476. Improved by 5.52294e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 124.476. Improved by 3.71382e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 65. Log joint probability = 124.476. Improved by 2.87695e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=569, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 124.476. Improved by 8.95623e-09.\n" + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Initial log joint probability = -24.6903\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 56.7318. Improved by 81.4221.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 74.9096. 
Improved by 18.1778.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 116.738. Improved by 41.8283.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 121.485. Improved by 4.74745.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 123.373. Improved by 1.88806.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 123.877. Improved by 0.503922.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 124.063. Improved by 0.185315.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 124.083. Improved by 0.0205245.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 124.187. Improved by 0.103934.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 124.3. Improved by 0.11302.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 124.316. Improved by 0.0161654.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 124.375. Improved by 0.0588467.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 124.406. Improved by 0.0307753.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 124.414. Improved by 0.00790605.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 124.421. Improved by 0.00744155.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 16. 
Log joint probability = 124.428. Improved by 0.00688068.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 124.444. Improved by 0.0160026.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 124.45. Improved by 0.00550397.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 124.45. Improved by 0.000490096.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 124.45. Improved by 9.73771e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 124.456. Improved by 0.00539044.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 124.462. Improved by 0.00667823.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 124.464. Improved by 0.00138419.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 124.466. Improved by 0.00192804.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 124.47. Improved by 0.00406199.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 124.47. Improved by 0.000535657.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 124.471. Improved by 0.000549635.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 124.474. Improved by 0.00299757.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 124.475. 
Improved by 0.000802363.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 124.475. Improved by 0.000302488.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 124.476. Improved by 0.000657009.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 124.476. Improved by 5.99847e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 124.476. Improved by 9.36055e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 124.476. Improved by 0.000110802.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 124.476. Improved by 0.000323327.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 124.476. Improved by 0.000124956.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 124.476. Improved by 1.69834e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 124.476. Improved by 2.1557e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 124.476. Improved by 2.41295e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 124.476. Improved by 7.22567e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 124.476. Improved by 4.47652e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 124.476. 
Improved by 7.65725e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 124.476. Improved by 3.42432e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 124.476. Improved by 3.72182e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 124.476. Improved by 3.8856e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 124.476. Improved by 6.05641e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 124.476. Improved by 9.84136e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 124.476. Improved by 6.66388e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 124.476. Improved by 1.34989e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 124.476. Improved by 7.44078e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 124.476. Improved by 5.28681e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 124.476. Improved by 6.72879e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 124.476. Improved by 3.58152e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 124.476. Improved by 1.52185e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 124.476. 
Improved by 4.81723e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 124.476. Improved by 6.24187e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 124.476. Improved by 1.10699e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 124.476. Improved by 3.56434e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 124.476. Improved by 7.01115e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 124.476. Improved by 1.28068e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 124.476. Improved by 1.27551e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 124.476. Improved by 1.5548e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 124.476. Improved by 5.52294e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 124.476. Improved by 3.71382e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 124.476. Improved by 2.87695e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=569, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 124.476. 
Improved by 8.95623e-09.\n" ] }, { @@ -833,519 +853,519 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Initial log joint probability = -24.6903\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 55.3662. Improved by 80.0565.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 95.8737. Improved by 40.5075.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 120.379. Improved by 24.5055.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 122.813. Improved by 2.43399.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 123.073. Improved by 0.259582.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 6. Log joint probability = 123.074. Improved by 0.00165627.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 123.112. Improved by 0.0373812.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 123.133. Improved by 0.0215269.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 123.216. Improved by 0.0827413.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 123.274. Improved by 0.0580866.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 123.275. Improved by 0.000726338.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 12. 
Log joint probability = 123.287. Improved by 0.0124071.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 123.354. Improved by 0.0669767.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 123.532. Improved by 0.177947.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 15. Log joint probability = 123.537. Improved by 0.00465327.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 123.567. Improved by 0.0304046.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 17. Log joint probability = 123.626. Improved by 0.0586984.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 123.717. Improved by 0.0906553.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 123.767. Improved by 0.0503912.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 123.794. Improved by 0.0270009.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 123.809. Improved by 0.0150776.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 123.819. Improved by 0.00949975.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 123.826. Improved by 0.00746779.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 123.83. Improved by 0.00414592.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 123.835. 
Improved by 0.00493402.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 123.836. Improved by 0.000572895.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 123.837. Improved by 0.00107582.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 123.839. Improved by 0.00219839.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 123.84. Improved by 0.000507895.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 123.841. Improved by 0.00153871.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 123.842. Improved by 0.000513638.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 123.842. Improved by 0.000147151.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 123.842. Improved by 0.000274432.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 123.842. Improved by 0.000105308.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 123.842. Improved by 0.000105348.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 123.842. Improved by 8.63243e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 123.842. Improved by 5.25735e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 123.842. 
Improved by 2.12369e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 123.842. Improved by 9.84594e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 123.842. Improved by 7.66574e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 123.842. Improved by 1.93305e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 123.842. Improved by 6.82331e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 123.842. Improved by 2.44574e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 123.842. Improved by 3.12753e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 123.842. Improved by 5.82608e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 123.842. Improved by 4.6484e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 123.842. Improved by 1.3307e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 123.843. Improved by 2.23967e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 123.843. Improved by 4.8155e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 123.843. Improved by 3.33246e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 123.843. 
Improved by 2.56905e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 123.843. Improved by 2.44229e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 123.843. Improved by 4.22397e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 123.843. Improved by 9.91746e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 123.843. Improved by 1.89293e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 123.843. Improved by 7.36958e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 123.843. Improved by 1.30557e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 123.843. Improved by 2.02889e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 123.843. Improved by 8.04966e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 123.843. Improved by 8.67718e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 123.843. Improved by 1.47952e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 123.843. Improved by 3.63641e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 123.843. Improved by 2.15615e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 123.843. 
Improved by 1.3613e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 65. Log joint probability = 123.843. Improved by 2.43754e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 123.843. Improved by 3.49743e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 67. Log joint probability = 123.843. Improved by 6.23249e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 68. Log joint probability = 123.843. Improved by 1.42323e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 69. Log joint probability = 123.843. Improved by 2.71484e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 70. Log joint probability = 123.843. Improved by 1.82188e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 71. Log joint probability = 123.843. Improved by 2.51761e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 72. Log joint probability = 123.843. Improved by 1.31146e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 73. Log joint probability = 123.843. Improved by 1.40753e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=799, ip=172.31.136.199)\u001b[0m Iteration 74. Log joint probability = 123.843. Improved by 2.03943e-09.\n" + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Initial log joint probability = -24.6903\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 55.3662. Improved by 80.0565.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 95.8737. 
Improved by 40.5075.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 120.379. Improved by 24.5055.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 122.813. Improved by 2.43399.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 123.073. Improved by 0.259582.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 123.074. Improved by 0.00165627.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 123.112. Improved by 0.0373812.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 123.133. Improved by 0.0215269.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 123.216. Improved by 0.0827413.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 123.274. Improved by 0.0580866.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 123.275. Improved by 0.000726338.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 123.287. Improved by 0.0124071.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 123.354. Improved by 0.0669767.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 123.532. Improved by 0.177947.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 123.537. Improved by 0.00465327.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 16. 
Log joint probability = 123.567. Improved by 0.0304046.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 123.626. Improved by 0.0586984.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 123.717. Improved by 0.0906553.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 123.767. Improved by 0.0503912.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 123.794. Improved by 0.0270009.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 123.809. Improved by 0.0150776.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 123.819. Improved by 0.00949975.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 123.826. Improved by 0.00746779.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 123.83. Improved by 0.00414592.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 123.835. Improved by 0.00493402.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 123.836. Improved by 0.000572895.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 123.837. Improved by 0.00107582.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 123.839. Improved by 0.00219839.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 123.84. 
Improved by 0.000507895.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 123.841. Improved by 0.00153871.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 123.842. Improved by 0.000513638.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 123.842. Improved by 0.000147151.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 123.842. Improved by 0.000274432.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 123.842. Improved by 0.000105308.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 123.842. Improved by 0.000105348.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 123.842. Improved by 8.63243e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 123.842. Improved by 5.25735e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 123.842. Improved by 2.12369e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 123.842. Improved by 9.84594e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 123.842. Improved by 7.66574e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 123.842. Improved by 1.93305e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 123.842. 
Improved by 6.82331e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 123.842. Improved by 2.44574e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 123.842. Improved by 3.12753e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 123.842. Improved by 5.82608e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 123.842. Improved by 4.6484e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 123.842. Improved by 1.3307e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 123.843. Improved by 2.23967e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 123.843. Improved by 4.8155e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 123.843. Improved by 3.33246e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 123.843. Improved by 2.56905e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 123.843. Improved by 2.44229e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 123.843. Improved by 4.22397e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 123.843. Improved by 9.91746e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 123.843. 
Improved by 1.89293e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 123.843. Improved by 7.36958e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 123.843. Improved by 1.30557e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 123.843. Improved by 2.02889e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 123.843. Improved by 8.04966e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 123.843. Improved by 8.67718e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 123.843. Improved by 1.47952e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 123.843. Improved by 3.63641e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 123.843. Improved by 2.15615e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 123.843. Improved by 1.3613e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 123.843. Improved by 2.43754e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 123.843. Improved by 3.49743e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 67. Log joint probability = 123.843. Improved by 6.23249e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 68. Log joint probability = 123.843. 
Improved by 1.42323e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 69. Log joint probability = 123.843. Improved by 2.71484e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 70. Log joint probability = 123.843. Improved by 1.82188e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 71. Log joint probability = 123.843. Improved by 2.51761e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 72. Log joint probability = 123.843. Improved by 1.31146e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 73. Log joint probability = 123.843. Improved by 1.40753e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=799, ip=172.31.136.199)\u001B[0m Iteration 74. Log joint probability = 123.843. Improved by 2.03943e-09.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Initial log joint probability = -21.7758\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 41.5159. Improved by 63.2917.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 68.4175. Improved by 26.9016.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 88.1348. Improved by 19.7173.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 88.147. Improved by 0.0121786.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 88.1524. Improved by 0.00537125.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 6. 
Log joint probability = 88.1633. Improved by 0.0109589.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 88.1753. Improved by 0.0119717.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 88.1783. Improved by 0.00301597.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 88.2164. Improved by 0.0380849.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 88.2239. Improved by 0.00749222.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 88.3633. Improved by 0.139416.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 12. Log joint probability = 88.4154. Improved by 0.0520892.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 88.4651. Improved by 0.0496986.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 89.8472. Improved by 1.38208.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 15. Log joint probability = 89.8657. Improved by 0.0185247.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 89.8732. Improved by 0.00753048.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 17. Log joint probability = 89.9318. Improved by 0.0585562.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 89.9447. Improved by 0.0129053.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 89.965. 
Improved by 0.0202932.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 90.0397. Improved by 0.0747472.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 90.0875. Improved by 0.0477876.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 90.105. Improved by 0.0175359.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 90.4892. Improved by 0.384151.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 90.556. Improved by 0.0668293.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 90.6581. Improved by 0.102125.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 90.742. Improved by 0.0838101.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 90.7738. Improved by 0.031868.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 90.7856. Improved by 0.011803.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 90.8302. Improved by 0.0445906.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 90.8852. Improved by 0.0549923.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 90.9034. Improved by 0.0181786.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 90.9276. 
Improved by 0.0241721.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 90.9412. Improved by 0.0136337.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 90.9542. Improved by 0.0130142.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 90.962. Improved by 0.00775981.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 90.9638. Improved by 0.00186611.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 90.9718. Improved by 0.00797594.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 90.976. Improved by 0.0042081.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 90.9777. Improved by 0.00165647.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 90.9814. Improved by 0.00370259.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 90.9839. Improved by 0.00256843.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 90.9851. Improved by 0.0011523.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 90.9868. Improved by 0.00170077.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 90.9874. Improved by 0.000631959.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 90.9885. 
Improved by 0.00111174.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 90.9887. Improved by 0.000172812.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 90.9897. Improved by 0.000951722.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 90.9904. Improved by 0.000744776.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 90.9907. Improved by 0.000334385.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 90.9911. Improved by 0.000323131.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 90.9913. Improved by 0.000195932.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 90.9913. Improved by 7.26249e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 90.9914. Improved by 9.38402e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 90.9915. Improved by 0.000104485.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 90.9915. Improved by 3.9586e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 90.9916. Improved by 7.77437e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 90.9916. Improved by 2.79958e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 90.9917. 
Improved by 5.30653e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 90.9918. Improved by 5.32272e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 90.9918. Improved by 2.72417e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 90.9919. Improved by 9.20075e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 90.9919. Improved by 1.97313e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 90.9919. Improved by 3.52389e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 90.992. Improved by 4.48494e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 65. Log joint probability = 90.992. Improved by 3.68675e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 90.992. Improved by 2.02192e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 67. Log joint probability = 90.9921. Improved by 2.05867e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 68. Log joint probability = 90.9921. Improved by 1.60531e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 69. Log joint probability = 90.9921. Improved by 1.09975e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 70. Log joint probability = 90.9921. Improved by 5.48589e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 71. Log joint probability = 90.9921. 
Improved by 5.17867e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 72. Log joint probability = 90.9921. Improved by 6.19947e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 73. Log joint probability = 90.9921. Improved by 1.90771e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 74. Log joint probability = 90.9921. Improved by 1.96755e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 75. Log joint probability = 90.9921. Improved by 3.14253e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 76. Log joint probability = 90.9922. Improved by 2.00154e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 77. Log joint probability = 90.9922. Improved by 7.38871e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 78. Log joint probability = 90.9922. Improved by 5.2899e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 79. Log joint probability = 90.9922. Improved by 3.05609e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 80. Log joint probability = 90.9922. Improved by 4.27669e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 81. Log joint probability = 90.9922. Improved by 2.5749e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 82. Log joint probability = 90.9922. Improved by 4.80204e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 83. Log joint probability = 90.9922. Improved by 2.77249e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 84. Log joint probability = 90.9922. 
Improved by 6.44e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 85. Log joint probability = 90.9922. Improved by 5.69327e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 86. Log joint probability = 90.9922. Improved by 6.80163e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 87. Log joint probability = 90.9922. Improved by 1.10273e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 88. Log joint probability = 90.9922. Improved by 3.1814e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 89. Log joint probability = 90.9922. Improved by 1.15471e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 90. Log joint probability = 90.9922. Improved by 2.80645e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 91. Log joint probability = 90.9922. Improved by 1.97469e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 92. Log joint probability = 90.9922. Improved by 3.01754e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 93. Log joint probability = 90.9922. Improved by 5.89157e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 94. Log joint probability = 90.9922. Improved by 4.37725e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 95. Log joint probability = 90.9922. Improved by 2.67717e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 96. Log joint probability = 90.9922. Improved by 3.00174e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 97. Log joint probability = 90.9922. 
Improved by 4.5588e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 98. Log joint probability = 90.9922. Improved by 1.30664e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 99. Log joint probability = 90.9922. Improved by 2.56521e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 100. Log joint probability = 90.9922. Improved by 1.77492e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 101. Log joint probability = 90.9922. Improved by 1.62366e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 102. Log joint probability = 90.9922. Improved by 1.84507e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 103. Log joint probability = 90.9922. Improved by 9.9194e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 104. Log joint probability = 90.9922. Improved by 6.85e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 105. Log joint probability = 90.9922. Improved by 2.19949e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 106. Log joint probability = 90.9922. Improved by 3.50271e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 107. Log joint probability = 90.9922. Improved by 7.81865e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 108. Log joint probability = 90.9922. Improved by 6.23645e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 109. Log joint probability = 90.9922. Improved by 6.12578e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 110. Log joint probability = 90.9922. 
Improved by 5.88466e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 111. Log joint probability = 90.9922. Improved by 1.63983e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 112. Log joint probability = 90.9922. Improved by 1.58961e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 113. Log joint probability = 90.9922. Improved by 4.68893e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 114. Log joint probability = 90.9922. Improved by 2.36556e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 115. Log joint probability = 90.9922. Improved by 4.54818e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 116. Log joint probability = 90.9922. Improved by 2.94216e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 117. Log joint probability = 90.9922. Improved by 1.2584e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 118. Log joint probability = 90.9922. Improved by 2.77487e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 119. Log joint probability = 90.9922. Improved by 2.76151e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 120. Log joint probability = 90.9922. Improved by 1.37145e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 121. Log joint probability = 90.9922. Improved by 4.27885e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=867, ip=172.31.136.199)\u001b[0m Iteration 122. Log joint probability = 90.9922. 
Improved by 7.76434e-09.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Initial log joint probability = -21.7758\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 20.1836. Improved by 41.9594.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 59.1549. Improved by 38.9713.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 79.9487. Improved by 20.7939.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 90.4604. Improved by 10.5117.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 90.7685. Improved by 0.308148.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 6. Log joint probability = 90.8866. Improved by 0.118032.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 90.9086. Improved by 0.0220841.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 90.9484. Improved by 0.0397311.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 90.9681. Improved by 0.0197759.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 90.9738. Improved by 0.00567126.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 90.9772. Improved by 0.00338425.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 12. Log joint probability = 90.979. 
Improved by 0.00180031.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 90.9909. Improved by 0.0118985.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 90.9977. Improved by 0.00677184.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 15. Log joint probability = 90.9994. Improved by 0.00176338.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 90.9998. Improved by 0.000346058.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 17. Log joint probability = 91.0026. Improved by 0.00283502.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 91.0067. Improved by 0.00404095.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 91.009. Improved by 0.00230573.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 91.0097. Improved by 0.000728684.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 91.0105. Improved by 0.000842848.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 91.0137. Improved by 0.00315459.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 91.0144. Improved by 0.000675261.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 91.015. Improved by 0.000668053.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 91.0153. 
Improved by 0.00022664.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 91.0158. Improved by 0.000553923.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 91.0169. Improved by 0.00108114.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 91.0173. Improved by 0.000446418.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 91.0179. Improved by 0.000535655.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 91.0188. Improved by 0.000894825.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 91.0192. Improved by 0.000463639.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 91.0193. Improved by 5.37241e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 91.0194. Improved by 0.00012323.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 91.0196. Improved by 0.000156284.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 91.0197. Improved by 8.54979e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 91.02. Improved by 0.000353443.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 91.0201. Improved by 9.12108e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 91.0201. 
Improved by 3.2033e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 91.0202. Improved by 5.68514e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 91.0203. Improved by 7.33769e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 91.0203. Improved by 6.37981e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 91.0203. Improved by 1.38012e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 91.0204. Improved by 2.29702e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 91.0204. Improved by 6.54176e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 91.0204. Improved by 1.93438e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 91.0204. Improved by 3.1678e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 91.0204. Improved by 5.27803e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 91.0204. Improved by 1.66328e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 91.0204. Improved by 1.35778e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 91.0205. Improved by 1.29478e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 91.0205. 
Improved by 7.81213e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 91.0205. Improved by 1.64481e-05.\n" + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Initial log joint probability = -21.7758\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 41.5159. Improved by 63.2917.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 68.4175. Improved by 26.9016.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 88.1348. Improved by 19.7173.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 88.147. Improved by 0.0121786.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 88.1524. Improved by 0.00537125.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 88.1633. Improved by 0.0109589.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 88.1753. Improved by 0.0119717.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 88.1783. Improved by 0.00301597.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 88.2164. Improved by 0.0380849.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 88.2239. Improved by 0.00749222.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 88.3633. 
Improved by 0.139416.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 88.4154. Improved by 0.0520892.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 88.4651. Improved by 0.0496986.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 89.8472. Improved by 1.38208.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 89.8657. Improved by 0.0185247.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 16. Log joint probability = 89.8732. Improved by 0.00753048.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 89.9318. Improved by 0.0585562.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 89.9447. Improved by 0.0129053.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 89.965. Improved by 0.0202932.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 90.0397. Improved by 0.0747472.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 90.0875. Improved by 0.0477876.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 90.105. Improved by 0.0175359.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 90.4892. Improved by 0.384151.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 90.556. 
Improved by 0.0668293.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 90.6581. Improved by 0.102125.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 90.742. Improved by 0.0838101.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 90.7738. Improved by 0.031868.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 90.7856. Improved by 0.011803.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 90.8302. Improved by 0.0445906.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 90.8852. Improved by 0.0549923.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 90.9034. Improved by 0.0181786.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 90.9276. Improved by 0.0241721.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 90.9412. Improved by 0.0136337.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 90.9542. Improved by 0.0130142.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 90.962. Improved by 0.00775981.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 90.9638. Improved by 0.00186611.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 90.9718. 
Improved by 0.00797594.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 90.976. Improved by 0.0042081.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 90.9777. Improved by 0.00165647.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 90.9814. Improved by 0.00370259.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 90.9839. Improved by 0.00256843.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 90.9851. Improved by 0.0011523.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 90.9868. Improved by 0.00170077.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 90.9874. Improved by 0.000631959.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 90.9885. Improved by 0.00111174.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 90.9887. Improved by 0.000172812.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 90.9897. Improved by 0.000951722.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 90.9904. Improved by 0.000744776.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 90.9907. Improved by 0.000334385.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 90.9911. 
Improved by 0.000323131.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 90.9913. Improved by 0.000195932.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 90.9913. Improved by 7.26249e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 90.9914. Improved by 9.38402e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 90.9915. Improved by 0.000104485.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 90.9915. Improved by 3.9586e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 90.9916. Improved by 7.77437e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 90.9916. Improved by 2.79958e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 90.9917. Improved by 5.30653e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 90.9918. Improved by 5.32272e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 90.9918. Improved by 2.72417e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 90.9919. Improved by 9.20075e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 90.9919. Improved by 1.97313e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 90.9919. 
Improved by 3.52389e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 90.992. Improved by 4.48494e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 90.992. Improved by 3.68675e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 90.992. Improved by 2.02192e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 67. Log joint probability = 90.9921. Improved by 2.05867e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 68. Log joint probability = 90.9921. Improved by 1.60531e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 69. Log joint probability = 90.9921. Improved by 1.09975e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 70. Log joint probability = 90.9921. Improved by 5.48589e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 71. Log joint probability = 90.9921. Improved by 5.17867e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 72. Log joint probability = 90.9921. Improved by 6.19947e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 73. Log joint probability = 90.9921. Improved by 1.90771e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 74. Log joint probability = 90.9921. Improved by 1.96755e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 75. Log joint probability = 90.9921. Improved by 3.14253e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 76. Log joint probability = 90.9922. 
Improved by 2.00154e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 77. Log joint probability = 90.9922. Improved by 7.38871e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 78. Log joint probability = 90.9922. Improved by 5.2899e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 79. Log joint probability = 90.9922. Improved by 3.05609e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 80. Log joint probability = 90.9922. Improved by 4.27669e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 81. Log joint probability = 90.9922. Improved by 2.5749e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 82. Log joint probability = 90.9922. Improved by 4.80204e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 83. Log joint probability = 90.9922. Improved by 2.77249e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 84. Log joint probability = 90.9922. Improved by 6.44e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 85. Log joint probability = 90.9922. Improved by 5.69327e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 86. Log joint probability = 90.9922. Improved by 6.80163e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 87. Log joint probability = 90.9922. Improved by 1.10273e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 88. Log joint probability = 90.9922. Improved by 3.1814e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 89. Log joint probability = 90.9922. 
Improved by 1.15471e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 90. Log joint probability = 90.9922. Improved by 2.80645e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 91. Log joint probability = 90.9922. Improved by 1.97469e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 92. Log joint probability = 90.9922. Improved by 3.01754e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 93. Log joint probability = 90.9922. Improved by 5.89157e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 94. Log joint probability = 90.9922. Improved by 4.37725e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 95. Log joint probability = 90.9922. Improved by 2.67717e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 96. Log joint probability = 90.9922. Improved by 3.00174e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 97. Log joint probability = 90.9922. Improved by 4.5588e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 98. Log joint probability = 90.9922. Improved by 1.30664e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 99. Log joint probability = 90.9922. Improved by 2.56521e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 100. Log joint probability = 90.9922. Improved by 1.77492e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 101. Log joint probability = 90.9922. Improved by 1.62366e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 102. Log joint probability = 90.9922. 
Improved by 1.84507e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 103. Log joint probability = 90.9922. Improved by 9.9194e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 104. Log joint probability = 90.9922. Improved by 6.85e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 105. Log joint probability = 90.9922. Improved by 2.19949e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 106. Log joint probability = 90.9922. Improved by 3.50271e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 107. Log joint probability = 90.9922. Improved by 7.81865e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 108. Log joint probability = 90.9922. Improved by 6.23645e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 109. Log joint probability = 90.9922. Improved by 6.12578e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 110. Log joint probability = 90.9922. Improved by 5.88466e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 111. Log joint probability = 90.9922. Improved by 1.63983e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 112. Log joint probability = 90.9922. Improved by 1.58961e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 113. Log joint probability = 90.9922. Improved by 4.68893e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 114. Log joint probability = 90.9922. Improved by 2.36556e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 115. Log joint probability = 90.9922. 
Improved by 4.54818e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 116. Log joint probability = 90.9922. Improved by 2.94216e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 117. Log joint probability = 90.9922. Improved by 1.2584e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 118. Log joint probability = 90.9922. Improved by 2.77487e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 119. Log joint probability = 90.9922. Improved by 2.76151e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 120. Log joint probability = 90.9922. Improved by 1.37145e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 121. Log joint probability = 90.9922. Improved by 4.27885e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=867, ip=172.31.136.199)\u001B[0m Iteration 122. Log joint probability = 90.9922. Improved by 7.76434e-09.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Initial log joint probability = -21.7758\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 20.1836. Improved by 41.9594.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 59.1549. Improved by 38.9713.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 79.9487. Improved by 20.7939.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 90.4604. Improved by 10.5117.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 90.7685. 
Improved by 0.308148.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 90.8866. Improved by 0.118032.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 90.9086. Improved by 0.0220841.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 90.9484. Improved by 0.0397311.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 90.9681. Improved by 0.0197759.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 90.9738. Improved by 0.00567126.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 90.9772. Improved by 0.00338425.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 90.979. Improved by 0.00180031.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 90.9909. Improved by 0.0118985.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 90.9977. Improved by 0.00677184.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 90.9994. Improved by 0.00176338.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 16. Log joint probability = 90.9998. Improved by 0.000346058.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 91.0026. Improved by 0.00283502.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 91.0067. 
Improved by 0.00404095.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 91.009. Improved by 0.00230573.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 91.0097. Improved by 0.000728684.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 91.0105. Improved by 0.000842848.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 91.0137. Improved by 0.00315459.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 91.0144. Improved by 0.000675261.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 91.015. Improved by 0.000668053.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 91.0153. Improved by 0.00022664.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 91.0158. Improved by 0.000553923.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 91.0169. Improved by 0.00108114.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 91.0173. Improved by 0.000446418.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 91.0179. Improved by 0.000535655.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 91.0188. Improved by 0.000894825.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 91.0192. 
Improved by 0.000463639.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 91.0193. Improved by 5.37241e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 91.0194. Improved by 0.00012323.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 91.0196. Improved by 0.000156284.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 91.0197. Improved by 8.54979e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 91.02. Improved by 0.000353443.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 91.0201. Improved by 9.12108e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 91.0201. Improved by 3.2033e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 91.0202. Improved by 5.68514e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 91.0203. Improved by 7.33769e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 91.0203. Improved by 6.37981e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 91.0203. Improved by 1.38012e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 91.0204. Improved by 2.29702e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 91.0204. 
Improved by 6.54176e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 91.0204. Improved by 1.93438e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 91.0204. Improved by 3.1678e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 91.0204. Improved by 5.27803e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 91.0204. Improved by 1.66328e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 91.0204. Improved by 1.35778e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 91.0205. Improved by 1.29478e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 91.0205. Improved by 7.81213e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 91.0205. Improved by 1.64481e-05.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n" + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling daily seasonality. 
Run prophet with daily_seasonality=True to override this.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 91.0205. Improved by 5.89368e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 91.0205. Improved by 2.73371e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 91.0205. Improved by 3.59134e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 91.0205. Improved by 7.21082e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 91.0205. Improved by 1.16206e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 91.0205. Improved by 2.44705e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 91.0205. Improved by 1.59075e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 91.0205. Improved by 2.89546e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 91.0205. Improved by 1.19933e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 91.0205. Improved by 2.3315e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 91.0205. Improved by 3.0172e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 91.0205. Improved by 1.1254e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 65. 
Log joint probability = 91.0205. Improved by 1.43073e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 91.0205. Improved by 1.06503e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 67. Log joint probability = 91.0205. Improved by 1.94521e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 68. Log joint probability = 91.0205. Improved by 1.91264e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 69. Log joint probability = 91.0205. Improved by 1.14165e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 70. Log joint probability = 91.0205. Improved by 6.19488e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 71. Log joint probability = 91.0205. Improved by 1.3134e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 72. Log joint probability = 91.0205. Improved by 7.83336e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 73. Log joint probability = 91.0205. Improved by 6.66751e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 74. Log joint probability = 91.0205. Improved by 2.12689e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 75. Log joint probability = 91.0205. Improved by 1.21127e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 76. Log joint probability = 91.0205. Improved by 6.65688e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 77. Log joint probability = 91.0205. Improved by 2.69727e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 78. Log joint probability = 91.0205. 
Improved by 3.26115e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 79. Log joint probability = 91.0205. Improved by 6.01741e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 80. Log joint probability = 91.0205. Improved by 9.90215e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 81. Log joint probability = 91.0205. Improved by 1.34709e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 82. Log joint probability = 91.0205. Improved by 1.86905e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 83. Log joint probability = 91.0205. Improved by 1.13228e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 84. Log joint probability = 91.0205. Improved by 1.84163e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 85. Log joint probability = 91.0205. Improved by 9.80857e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 86. Log joint probability = 91.0205. Improved by 3.26897e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 87. Log joint probability = 91.0205. Improved by 2.67554e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 88. Log joint probability = 91.0205. Improved by 3.02441e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=868, ip=172.31.136.199)\u001b[0m Iteration 89. Log joint probability = 91.0205. Improved by 6.99644e-09.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Initial log joint probability = -24.7798\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 56.6567. 
Improved by 81.4365.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 97.3654. Improved by 40.7088.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 118.678. Improved by 21.3124.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 129.821. Improved by 11.1432.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 132.527. Improved by 2.70548.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 6. Log joint probability = 132.562. Improved by 0.0357063.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 132.959. Improved by 0.396572.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 132.964. Improved by 0.00492318.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 132.968. Improved by 0.00386232.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 133.011. Improved by 0.0434838.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 133.125. Improved by 0.113608.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m \n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Initial log joint probability = -24.7798\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 1. Log joint probability = 58.4966. Improved by 83.2764.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 2. Log joint probability = 98.0201. 
Improved by 39.5235.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 3. Log joint probability = 124.762. Improved by 26.7417.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 4. Log joint probability = 128.406. Improved by 3.64467.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 5. Log joint probability = 131.459. Improved by 3.05241.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 6. Log joint probability = 131.536. Improved by 0.0771233.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 7. Log joint probability = 131.585. Improved by 0.0491424.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 8. Log joint probability = 131.622. Improved by 0.0372929.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 9. Log joint probability = 131.746. Improved by 0.123634.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 10. Log joint probability = 131.84. Improved by 0.0940927.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 11. Log joint probability = 131.915. Improved by 0.0752941.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 12. Log joint probability = 131.944. Improved by 0.0284656.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 132.136. Improved by 0.192139.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 132.154. Improved by 0.0182919.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 12. Log joint probability = 133.156. Improved by 0.0315004.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 15. 
Log joint probability = 132.205. Improved by 0.0502591.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 13. Log joint probability = 133.165. Improved by 0.00863589.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 132.283. Improved by 0.0788813.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 14. Log joint probability = 133.205. Improved by 0.0399492.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 17. Log joint probability = 132.295. Improved by 0.0111451.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 15. Log joint probability = 133.263. Improved by 0.0582913.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 132.508. Improved by 0.213728.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 16. Log joint probability = 133.312. Improved by 0.0488556.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 17. Log joint probability = 133.379. Improved by 0.0673858.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 132.535. Improved by 0.0269674.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 18. Log joint probability = 133.399. Improved by 0.0201265.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 132.608. Improved by 0.0723374.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 19. Log joint probability = 133.484. Improved by 0.0845203.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 20. Log joint probability = 133.489. 
Improved by 0.00529988.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 133.564. Improved by 0.074616.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 133.65. Improved by 0.0863769.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 133.704. Improved by 0.0536392.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 133.726. Improved by 0.0224161.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 133.734. Improved by 0.00765676.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 133.771. Improved by 0.0367052.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 133.782. Improved by 0.0110577.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 133.782. Improved by 0.000409333.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 133.786. Improved by 0.00424821.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 133.793. Improved by 0.00702624.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 133.793. Improved by 0.000120618.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 133.796. Improved by 0.00259901.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 133.8. 
Improved by 0.00347541.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 133.8. Improved by 4.34525e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 133.8. Improved by 0.000442336.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 133.801. Improved by 0.000935713.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 133.803. Improved by 0.00171089.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 133.803. Improved by 0.000512353.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 133.803. Improved by 4.16449e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 133.804. Improved by 0.000354666.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 133.804. Improved by 5.7549e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 133.804. Improved by 0.000324601.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 133.805. Improved by 0.00101344.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 133.805. Improved by 0.000491843.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 133.806. Improved by 8.67991e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 133.806. 
Improved by 0.000128382.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 133.806. Improved by 3.70175e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 133.806. Improved by 4.50979e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 21. Log joint probability = 132.66. Improved by 0.0521015.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 22. Log joint probability = 132.673. Improved by 0.0129431.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 23. Log joint probability = 132.883. Improved by 0.210274.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 24. Log joint probability = 133.261. Improved by 0.378255.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 25. Log joint probability = 133.449. Improved by 0.187961.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 26. Log joint probability = 133.654. Improved by 0.204868.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 27. Log joint probability = 133.762. Improved by 0.10752.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 28. Log joint probability = 133.793. Improved by 0.0309585.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 29. Log joint probability = 133.847. Improved by 0.0542512.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 30. Log joint probability = 133.898. Improved by 0.0509466.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 31. Log joint probability = 134.179. 
Improved by 0.2808.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 32. Log joint probability = 134.209. Improved by 0.0301489.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 33. Log joint probability = 134.253. Improved by 0.0447352.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 34. Log joint probability = 134.339. Improved by 0.0856853.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 133.806. Improved by 2.93527e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 133.806. Improved by 4.40796e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 35. Log joint probability = 134.341. Improved by 0.00205512.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 133.806. Improved by 0.000118919.\n" + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 91.0205. Improved by 5.89368e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 91.0205. Improved by 2.73371e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 91.0205. Improved by 3.59134e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 91.0205. Improved by 7.21082e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 91.0205. Improved by 1.16206e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 91.0205. 
Improved by 2.44705e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 91.0205. Improved by 1.59075e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 91.0205. Improved by 2.89546e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 91.0205. Improved by 1.19933e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 91.0205. Improved by 2.3315e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 91.0205. Improved by 3.0172e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 91.0205. Improved by 1.1254e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 91.0205. Improved by 1.43073e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 91.0205. Improved by 1.06503e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 67. Log joint probability = 91.0205. Improved by 1.94521e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 68. Log joint probability = 91.0205. Improved by 1.91264e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 69. Log joint probability = 91.0205. Improved by 1.14165e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 70. Log joint probability = 91.0205. Improved by 6.19488e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 71. Log joint probability = 91.0205. 
Improved by 1.3134e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 72. Log joint probability = 91.0205. Improved by 7.83336e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 73. Log joint probability = 91.0205. Improved by 6.66751e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 74. Log joint probability = 91.0205. Improved by 2.12689e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 75. Log joint probability = 91.0205. Improved by 1.21127e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 76. Log joint probability = 91.0205. Improved by 6.65688e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 77. Log joint probability = 91.0205. Improved by 2.69727e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 78. Log joint probability = 91.0205. Improved by 3.26115e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 79. Log joint probability = 91.0205. Improved by 6.01741e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 80. Log joint probability = 91.0205. Improved by 9.90215e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 81. Log joint probability = 91.0205. Improved by 1.34709e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 82. Log joint probability = 91.0205. Improved by 1.86905e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 83. Log joint probability = 91.0205. Improved by 1.13228e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 84. Log joint probability = 91.0205. 
Improved by 1.84163e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 85. Log joint probability = 91.0205. Improved by 9.80857e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 86. Log joint probability = 91.0205. Improved by 3.26897e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 87. Log joint probability = 91.0205. Improved by 2.67554e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 88. Log joint probability = 91.0205. Improved by 3.02441e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=868, ip=172.31.136.199)\u001B[0m Iteration 89. Log joint probability = 91.0205. Improved by 6.99644e-09.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Initial log joint probability = -24.7798\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 56.6567. Improved by 81.4365.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 97.3654. Improved by 40.7088.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 118.678. Improved by 21.3124.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 129.821. Improved by 11.1432.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 132.527. Improved by 2.70548.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 132.562. Improved by 0.0357063.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 132.959. 
Improved by 0.396572.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 132.964. Improved by 0.00492318.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 132.968. Improved by 0.00386232.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 133.011. Improved by 0.0434838.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 133.125. Improved by 0.113608.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m \n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Initial log joint probability = -24.7798\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 1. Log joint probability = 58.4966. Improved by 83.2764.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 2. Log joint probability = 98.0201. Improved by 39.5235.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 3. Log joint probability = 124.762. Improved by 26.7417.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 4. Log joint probability = 128.406. Improved by 3.64467.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 5. Log joint probability = 131.459. Improved by 3.05241.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 6. Log joint probability = 131.536. Improved by 0.0771233.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 7. Log joint probability = 131.585. Improved by 0.0491424.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 8. Log joint probability = 131.622. 
Improved by 0.0372929.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 9. Log joint probability = 131.746. Improved by 0.123634.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 10. Log joint probability = 131.84. Improved by 0.0940927.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 11. Log joint probability = 131.915. Improved by 0.0752941.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 131.944. Improved by 0.0284656.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 132.136. Improved by 0.192139.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 132.154. Improved by 0.0182919.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 12. Log joint probability = 133.156. Improved by 0.0315004.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 132.205. Improved by 0.0502591.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 13. Log joint probability = 133.165. Improved by 0.00863589.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 16. Log joint probability = 132.283. Improved by 0.0788813.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 14. Log joint probability = 133.205. Improved by 0.0399492.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 132.295. Improved by 0.0111451.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 15. Log joint probability = 133.263. 
Improved by 0.0582913.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 132.508. Improved by 0.213728.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 16. Log joint probability = 133.312. Improved by 0.0488556.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 17. Log joint probability = 133.379. Improved by 0.0673858.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 132.535. Improved by 0.0269674.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 18. Log joint probability = 133.399. Improved by 0.0201265.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 132.608. Improved by 0.0723374.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 19. Log joint probability = 133.484. Improved by 0.0845203.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 20. Log joint probability = 133.489. Improved by 0.00529988.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 133.564. Improved by 0.074616.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 133.65. Improved by 0.0863769.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 133.704. Improved by 0.0536392.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 133.726. Improved by 0.0224161.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 133.734. 
Improved by 0.00765676.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 133.771. Improved by 0.0367052.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 133.782. Improved by 0.0110577.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 133.782. Improved by 0.000409333.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 133.786. Improved by 0.00424821.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 133.793. Improved by 0.00702624.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 133.793. Improved by 0.000120618.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 133.796. Improved by 0.00259901.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 133.8. Improved by 0.00347541.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 133.8. Improved by 4.34525e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 133.8. Improved by 0.000442336.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 133.801. Improved by 0.000935713.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 133.803. Improved by 0.00171089.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 133.803. 
Improved by 0.000512353.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 133.803. Improved by 4.16449e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 133.804. Improved by 0.000354666.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 133.804. Improved by 5.7549e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 133.804. Improved by 0.000324601.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 133.805. Improved by 0.00101344.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 133.805. Improved by 0.000491843.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 133.806. Improved by 8.67991e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 133.806. Improved by 0.000128382.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 133.806. Improved by 3.70175e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 133.806. Improved by 4.50979e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 21. Log joint probability = 132.66. Improved by 0.0521015.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 22. Log joint probability = 132.673. Improved by 0.0129431.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 23. Log joint probability = 132.883. 
Improved by 0.210274.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 24. Log joint probability = 133.261. Improved by 0.378255.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 25. Log joint probability = 133.449. Improved by 0.187961.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 26. Log joint probability = 133.654. Improved by 0.204868.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 27. Log joint probability = 133.762. Improved by 0.10752.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 28. Log joint probability = 133.793. Improved by 0.0309585.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 29. Log joint probability = 133.847. Improved by 0.0542512.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 30. Log joint probability = 133.898. Improved by 0.0509466.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 31. Log joint probability = 134.179. Improved by 0.2808.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 32. Log joint probability = 134.209. Improved by 0.0301489.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 33. Log joint probability = 134.253. Improved by 0.0447352.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 34. Log joint probability = 134.339. Improved by 0.0856853.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 133.806. Improved by 2.93527e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 133.806. 
Improved by 4.40796e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 35. Log joint probability = 134.341. Improved by 0.00205512.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 133.806. Improved by 0.000118919.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n" + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 133.806. 
Improved by 1.19684e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 133.806. Improved by 5.11185e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 133.806. Improved by 4.74767e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 133.806. Improved by 1.2416e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 133.806. Improved by 2.02582e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 133.806. Improved by 1.71245e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 133.806. Improved by 8.42186e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 133.806. Improved by 5.25634e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 133.806. Improved by 1.02038e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 133.806. Improved by 8.6083e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 133.806. Improved by 1.95771e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 133.806. Improved by 2.81929e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 133.806. Improved by 9.62887e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 65. Log joint probability = 133.806. 
Improved by 1.02108e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 133.806. Improved by 8.08545e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 67. Log joint probability = 133.806. Improved by 1.06262e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 68. Log joint probability = 133.806. Improved by 1.44616e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 69. Log joint probability = 133.806. Improved by 2.11851e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 70. Log joint probability = 133.806. Improved by 2.4721e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 71. Log joint probability = 133.806. Improved by 3.84309e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 72. Log joint probability = 133.806. Improved by 8.01389e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 73. Log joint probability = 133.806. Improved by 6.42814e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 74. Log joint probability = 133.806. Improved by 3.08296e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 75. Log joint probability = 133.806. Improved by 7.11785e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 76. Log joint probability = 133.806. Improved by 6.76762e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 77. Log joint probability = 133.806. Improved by 2.88068e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 78. Log joint probability = 133.806. 
Improved by 6.82979e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 36. Log joint probability = 134.393. Improved by 0.0516495.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 37. Log joint probability = 134.406. Improved by 0.0128166.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 38. Log joint probability = 134.53. Improved by 0.124634.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 39. Log joint probability = 134.593. Improved by 0.0626.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 40. Log joint probability = 134.626. Improved by 0.03309.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 41. Log joint probability = 134.631. Improved by 0.00515215.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 42. Log joint probability = 134.664. Improved by 0.0326243.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 43. Log joint probability = 134.675. Improved by 0.0115272.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 44. Log joint probability = 134.678. Improved by 0.00297174.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 45. Log joint probability = 134.687. Improved by 0.00902203.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 46. Log joint probability = 134.695. Improved by 0.00741251.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 47. Log joint probability = 134.698. Improved by 0.00291338.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 48. Log joint probability = 134.698. 
Improved by 0.000831812.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 49. Log joint probability = 134.699. Improved by 0.000221433.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 50. Log joint probability = 134.7. Improved by 0.00103722.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 79. Log joint probability = 133.806. Improved by 4.89768e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 80. Log joint probability = 133.806. Improved by 5.13849e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 51. Log joint probability = 134.7. Improved by 0.00033267.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 81. Log joint probability = 133.806. Improved by 1.1728e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=865, ip=172.31.136.199)\u001b[0m Iteration 82. Log joint probability = 133.806. Improved by 5.41323e-09.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 52. Log joint probability = 134.7. Improved by 0.000370356.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 53. Log joint probability = 134.701. Improved by 0.000590457.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 54. Log joint probability = 134.701. Improved by 0.000308186.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 55. Log joint probability = 134.701. Improved by 1.19587e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 56. Log joint probability = 134.703. Improved by 0.0017289.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 57. Log joint probability = 134.705. 
Improved by 0.00162144.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 58. Log joint probability = 134.706. Improved by 0.000936565.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 59. Log joint probability = 134.706. Improved by 0.000489671.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 60. Log joint probability = 134.706. Improved by 2.13758e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 61. Log joint probability = 134.706. Improved by 7.25762e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 62. Log joint probability = 134.706. Improved by 0.000109131.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 63. Log joint probability = 134.706. Improved by 5.9817e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 64. Log joint probability = 134.706. Improved by 0.000246335.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 65. Log joint probability = 134.707. Improved by 2.75556e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 66. Log joint probability = 134.707. Improved by 6.77305e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 67. Log joint probability = 134.707. Improved by 0.000101361.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 68. Log joint probability = 134.707. Improved by 2.67652e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 69. Log joint probability = 134.707. Improved by 4.08686e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 70. Log joint probability = 134.707. 
Improved by 5.56634e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 71. Log joint probability = 134.707. Improved by 8.41062e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 72. Log joint probability = 134.707. Improved by 3.58515e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 73. Log joint probability = 134.707. Improved by 1.01022e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 74. Log joint probability = 134.707. Improved by 2.71279e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 75. Log joint probability = 134.707. Improved by 1.57461e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 76. Log joint probability = 134.707. Improved by 2.20976e-05.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 77. Log joint probability = 134.707. Improved by 4.12488e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 78. Log joint probability = 134.707. Improved by 4.15849e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 79. Log joint probability = 134.707. Improved by 4.0241e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 80. Log joint probability = 134.707. Improved by 5.34552e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 81. Log joint probability = 134.707. Improved by 2.28619e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 82. Log joint probability = 134.707. Improved by 1.55421e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 83. Log joint probability = 134.707. 
Improved by 4.21746e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 84. Log joint probability = 134.707. Improved by 1.7876e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 85. Log joint probability = 134.707. Improved by 4.65521e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 86. Log joint probability = 134.707. Improved by 6.75201e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 87. Log joint probability = 134.707. Improved by 1.22495e-06.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 88. Log joint probability = 134.707. Improved by 6.8387e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 89. Log joint probability = 134.707. Improved by 1.51393e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 90. Log joint probability = 134.707. Improved by 3.06142e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 91. Log joint probability = 134.707. Improved by 2.65367e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 92. Log joint probability = 134.707. Improved by 3.27718e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 93. Log joint probability = 134.707. Improved by 1.4017e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 94. Log joint probability = 134.707. Improved by 1.27841e-07.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 95. Log joint probability = 134.707. Improved by 7.60193e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 96. Log joint probability = 134.707. 
Improved by 2.21328e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 97. Log joint probability = 134.707. Improved by 1.95887e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 98. Log joint probability = 134.707. Improved by 7.67787e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 99. Log joint probability = 134.707. Improved by 1.98719e-08.\n", - "\u001b[2m\u001b[36m(train_model pid=864, ip=172.31.136.199)\u001b[0m Iteration 100. Log joint probability = 134.707. Improved by 6.91463e-09.\n" + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 133.806. Improved by 1.19684e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 133.806. Improved by 5.11185e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 133.806. Improved by 4.74767e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 133.806. Improved by 1.2416e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 133.806. Improved by 2.02582e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 133.806. Improved by 1.71245e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 133.806. Improved by 8.42186e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 133.806. Improved by 5.25634e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 133.806. 
Improved by 1.02038e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 133.806. Improved by 8.6083e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 133.806. Improved by 1.95771e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 133.806. Improved by 2.81929e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 133.806. Improved by 9.62887e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 133.806. Improved by 1.02108e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 133.806. Improved by 8.08545e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 67. Log joint probability = 133.806. Improved by 1.06262e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 68. Log joint probability = 133.806. Improved by 1.44616e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 69. Log joint probability = 133.806. Improved by 2.11851e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 70. Log joint probability = 133.806. Improved by 2.4721e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 71. Log joint probability = 133.806. Improved by 3.84309e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 72. Log joint probability = 133.806. Improved by 8.01389e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 73. Log joint probability = 133.806. 
Improved by 6.42814e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 74. Log joint probability = 133.806. Improved by 3.08296e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 75. Log joint probability = 133.806. Improved by 7.11785e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 76. Log joint probability = 133.806. Improved by 6.76762e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 77. Log joint probability = 133.806. Improved by 2.88068e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 78. Log joint probability = 133.806. Improved by 6.82979e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 36. Log joint probability = 134.393. Improved by 0.0516495.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 37. Log joint probability = 134.406. Improved by 0.0128166.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 38. Log joint probability = 134.53. Improved by 0.124634.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 39. Log joint probability = 134.593. Improved by 0.0626.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 40. Log joint probability = 134.626. Improved by 0.03309.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 41. Log joint probability = 134.631. Improved by 0.00515215.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 42. Log joint probability = 134.664. Improved by 0.0326243.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 43. Log joint probability = 134.675. 
Improved by 0.0115272.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 44. Log joint probability = 134.678. Improved by 0.00297174.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 45. Log joint probability = 134.687. Improved by 0.00902203.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 46. Log joint probability = 134.695. Improved by 0.00741251.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 47. Log joint probability = 134.698. Improved by 0.00291338.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 48. Log joint probability = 134.698. Improved by 0.000831812.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 49. Log joint probability = 134.699. Improved by 0.000221433.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 50. Log joint probability = 134.7. Improved by 0.00103722.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 79. Log joint probability = 133.806. Improved by 4.89768e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 80. Log joint probability = 133.806. Improved by 5.13849e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 51. Log joint probability = 134.7. Improved by 0.00033267.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 81. Log joint probability = 133.806. Improved by 1.1728e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=865, ip=172.31.136.199)\u001B[0m Iteration 82. Log joint probability = 133.806. Improved by 5.41323e-09.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 52. Log joint probability = 134.7. 
Improved by 0.000370356.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 53. Log joint probability = 134.701. Improved by 0.000590457.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 54. Log joint probability = 134.701. Improved by 0.000308186.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 55. Log joint probability = 134.701. Improved by 1.19587e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 56. Log joint probability = 134.703. Improved by 0.0017289.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 57. Log joint probability = 134.705. Improved by 0.00162144.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 58. Log joint probability = 134.706. Improved by 0.000936565.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 59. Log joint probability = 134.706. Improved by 0.000489671.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 60. Log joint probability = 134.706. Improved by 2.13758e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 61. Log joint probability = 134.706. Improved by 7.25762e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 62. Log joint probability = 134.706. Improved by 0.000109131.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 63. Log joint probability = 134.706. Improved by 5.9817e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 64. Log joint probability = 134.706. Improved by 0.000246335.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 65. Log joint probability = 134.707. 
Improved by 2.75556e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 66. Log joint probability = 134.707. Improved by 6.77305e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 67. Log joint probability = 134.707. Improved by 0.000101361.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 68. Log joint probability = 134.707. Improved by 2.67652e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 69. Log joint probability = 134.707. Improved by 4.08686e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 70. Log joint probability = 134.707. Improved by 5.56634e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 71. Log joint probability = 134.707. Improved by 8.41062e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 72. Log joint probability = 134.707. Improved by 3.58515e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 73. Log joint probability = 134.707. Improved by 1.01022e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 74. Log joint probability = 134.707. Improved by 2.71279e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 75. Log joint probability = 134.707. Improved by 1.57461e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 76. Log joint probability = 134.707. Improved by 2.20976e-05.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 77. Log joint probability = 134.707. Improved by 4.12488e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 78. Log joint probability = 134.707. 
Improved by 4.15849e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 79. Log joint probability = 134.707. Improved by 4.0241e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 80. Log joint probability = 134.707. Improved by 5.34552e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 81. Log joint probability = 134.707. Improved by 2.28619e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 82. Log joint probability = 134.707. Improved by 1.55421e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 83. Log joint probability = 134.707. Improved by 4.21746e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 84. Log joint probability = 134.707. Improved by 1.7876e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 85. Log joint probability = 134.707. Improved by 4.65521e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 86. Log joint probability = 134.707. Improved by 6.75201e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 87. Log joint probability = 134.707. Improved by 1.22495e-06.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 88. Log joint probability = 134.707. Improved by 6.8387e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 89. Log joint probability = 134.707. Improved by 1.51393e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 90. Log joint probability = 134.707. Improved by 3.06142e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 91. Log joint probability = 134.707. 
Improved by 2.65367e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 92. Log joint probability = 134.707. Improved by 3.27718e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 93. Log joint probability = 134.707. Improved by 1.4017e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 94. Log joint probability = 134.707. Improved by 1.27841e-07.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 95. Log joint probability = 134.707. Improved by 7.60193e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 96. Log joint probability = 134.707. Improved by 2.21328e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 97. Log joint probability = 134.707. Improved by 1.95887e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 98. Log joint probability = 134.707. Improved by 7.67787e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 99. Log joint probability = 134.707. Improved by 1.98719e-08.\n", + "\u001B[2m\u001B[36m(train_model pid=864, ip=172.31.136.199)\u001B[0m Iteration 100. Log joint probability = 134.707. Improved by 6.91463e-09.\n" ] }, { diff --git a/doc/source/ray-air/examples/batch_tuning.ipynb b/doc/source/ray-air/examples/batch_tuning.ipynb index 9194adc8529e..7e3c4bc872ea 100644 --- a/doc/source/ray-air/examples/batch_tuning.ipynb +++ b/doc/source/ray-air/examples/batch_tuning.ipynb @@ -55,10 +55,29 @@ "```{tip}\n", "Prerequisite for this notebook: Read the [Key Concepts](tune-60-seconds) page for Ray Tune.\n", "```\n", - "\n", - "Let us start by importing a few required libraries, including open-source Ray itself!" + "First, let's make sure we have all Python packages we need installed." 
] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install -q \"ray[air]\" scikit-learn" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Next, let's import a few required libraries, including open-source Ray itself!" + ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", "execution_count": 1, diff --git a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb index 0919cb625690..1e8e6734b67c 100644 --- a/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb +++ b/doc/source/ray-air/examples/convert_existing_pytorch_code_to_ray_air.ipynb @@ -559,6 +559,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "abe8e708", "metadata": {}, @@ -576,62 +577,63 @@ "\n", "1. We import Ray Train and Ray AIR Session:\n", "\n", - "```python\n", - "import ray.train as train\n", - "from ray.air import session\n", - "```\n", + " ```python\n", + " import ray.train as train\n", + " from ray.air import session\n", + " ```\n", "\n", "\n", "2. We use a `config` dict to configure some hyperparameters (this is not strictly needed but good practice, especially if you want to o hyperparameter tuning later):\n", "\n", - "```python\n", - "def train_func(config: dict):\n", - " batch_size = config[\"batch_size\"]\n", - " lr = config[\"lr\"]\n", - " epochs = config[\"epochs\"]\n", - "```\n", + " ```python\n", + " def train_func(config: dict):\n", + " batch_size = config[\"batch_size\"]\n", + " lr = config[\"lr\"]\n", + " epochs = config[\"epochs\"]\n", + " ```\n", "\n", "3. 
We dynamically adjust the worker batch size according to the number of workers:\n", "\n", - "```python\n", - " batch_size_per_worker = batch_size // session.get_world_size()\n", - "```\n", + " ```python\n", + " batch_size_per_worker = batch_size // session.get_world_size()\n", + " ```\n", "\n", "4. We prepare the data loader for distributed data sharding:\n", "\n", - "```python\n", - " train_dataloader = train.torch.prepare_data_loader(train_dataloader)\n", - " test_dataloader = train.torch.prepare_data_loader(test_dataloader)\n", - "```\n", + " ```python\n", + " train_dataloader = train.torch.prepare_data_loader(train_dataloader)\n", + " test_dataloader = train.torch.prepare_data_loader(test_dataloader)\n", + " ```\n", "\n", "5. We prepare the model for distributed gradient updates:\n", "\n", - "```python\n", - " model = train.torch.prepare_model(model)\n", - "```\n", - "\n", - "Note that `train.torch.prepare_model()` also automatically takes care of setting up devices (e.g. GPU training) - so we can get rid of those lines in our current code!\n", - "\n", + " ```python\n", + " model = train.torch.prepare_model(model)\n", + " ```\n", + " :::{note}\n", + " Note that `train.torch.prepare_model()` also automatically takes care of setting up devices (e.g. GPU training) - so we can get rid of those lines in our current code!\n", + " :::\n", "\n", "6. We capture the validation loss and report it to Ray train:\n", "\n", - "```python\n", - " test_loss = test(test_dataloader, model, loss_fn)\n", - " session.report(dict(loss=test_loss))\n", - "```\n", + " ```python\n", + " test_loss = test(test_dataloader, model, loss_fn)\n", + " session.report(dict(loss=test_loss))\n", + " ```\n", "\n", "7. 
In the `train_epoch()` and `test_epoch()` functions we divide the `size` by the world size:\n", "\n", - "```python\n", - " size = len(dataloader.dataset) // session.get_world_size() # Divide by word size\n", - "```\n", + " ```python\n", + " # Divide by word size\n", + " size = len(dataloader.dataset) // session.get_world_size()\n", + " ```\n", "\n", "8. In the `train_epoch()` function we can get rid of the device mapping. Ray Train does this for us:\n", "\n", - "```python\n", - " # We don't need this anymore! Ray Train does this automatically:\n", - " # X, y = X.to(device), y.to(device) \n", - "```\n", + " ```python\n", + " # We don't need this anymore! Ray Train does this automatically:\n", + " # X, y = X.to(device), y.to(device) \n", + " ```\n", "\n", "That's it - you need less than 10 lines of Ray Train-specific code and can otherwise continue to use your original code.\n", "\n", @@ -1079,7 +1081,7 @@ "id": "ad556eeb", "metadata": {}, "source": [ - "Batch predictors work with Ray Datastreams. Here we convert our test dataset into a Ray Datastream - note that this is not very efficient, and you can look at our {ref}`other tutorials ` to see more efficient ways to generate a Ray Datastream." + "Batch predictors work with Ray Data. Here we convert our test dataset into a Dataset - note that this is not very efficient, and you can look at our {ref}`other tutorials ` to see more efficient ways to generate a Dataset." ] }, { @@ -1125,7 +1127,7 @@ "id": "41094a55", "metadata": {}, "source": [ - "`results` is another Ray Datastream. We can use `results.show()` to see our prediction results:" + "`results` is another Dataset. 
We can use `results.show()` to see our prediction results:" ] }, { @@ -1229,7 +1231,7 @@ ], "source": [ "predicted_classes = results.map_batches(\n", - " lambda batch: [classes[pred.argmax(0)] for pred in batch[\"predictions\"]], \n", + " lambda batch: {\"pred\": [classes[pred.argmax(0)] for pred in batch[\"predictions\"]]}, \n", " batch_size=32,\n", " batch_format=\"pandas\")" ] @@ -1277,7 +1279,7 @@ ], "source": [ "real_classes = [classes[y] for x, y in test_data]\n", - "for predicted, real in zip(predicted_classes.take(), real_classes):\n", + "for predicted, real in zip(predicted_classes.take_batch()[\"pred\"], real_classes):\n", " print((predicted, real))" ] }, @@ -1295,7 +1297,7 @@ "- save and retrieve model checkpoints via Ray AIR\n", "- load a model for batch prediction\n", "\n", - "In our {ref}`other examples ` you can learn how to do more things with the Ray AIR API, such as **serving your model with Ray Serve** or **tune your hyperparameters with Ray Tune.** You can also learn how to **construct Ray Datasets** to leverage Ray AIR's **preprocessing** API.\n", + "In our {ref}`other examples ` you can learn how to do more things with the Ray AIR API, such as **serving your model with Ray Serve** or **tune your hyperparameters with Ray Tune.** You can also learn how to **construct Ray Data** to leverage Ray AIR's **preprocessing** API.\n", "\n", "We hope this tutorial gave you a good starting point to leverage Ray AIR. If you have any questions, suggestions, or run into any problems pelase reach out on [Discuss](https://discuss.ray.io/) or [GitHub](https://github.com/ray-project/ray)!" 
] diff --git a/doc/source/ray-air/examples/convert_existing_tf_code_to_ray_air.ipynb b/doc/source/ray-air/examples/convert_existing_tf_code_to_ray_air.ipynb index cf31b16e20ff..d0ba3609819d 100644 --- a/doc/source/ray-air/examples/convert_existing_tf_code_to_ray_air.ipynb +++ b/doc/source/ray-air/examples/convert_existing_tf_code_to_ray_air.ipynb @@ -705,7 +705,7 @@ "id": "fd72830b", "metadata": {}, "source": [ - "Batch predictors work with [Ray Datasets](datasets). Here, we create a {class}`Dataset ` of images from our test set." + "Batch predictors work with [Ray Data](data). Here, we create a {class}`Dataset ` of images from our test set." ] }, { @@ -723,7 +723,7 @@ "id": "6ab1b08a", "metadata": {}, "source": [ - "Let's run {meth}`BatchPredictor.predict ` on our Ray Dataset. This will distribute the prediction across a specified number of workers!" + "Let's run {meth}`BatchPredictor.predict ` on our Dataset. This will distribute the prediction across a specified number of workers!" ] }, { @@ -741,7 +741,7 @@ "id": "9ccadf89", "metadata": {}, "source": [ - "`predict_results` is also a Ray Dataset, and we can take a look at the predictions inside:" + "`predict_results` is also a Dataset, and we can take a look at the predictions inside:" ] }, { @@ -769,7 +769,7 @@ "...\n", "```\n", "\n", - "Our model outputs logits, but we want the actual predicted labels. We can convert the logits to labels by taking the `argmax` of each model output in `predict_results` using {meth}`map_batches `. Then, we can compute the accuracy by comparing to the test set labels!" + "Our model outputs logits, but we want the actual predicted labels. We can convert the logits to labels by taking the `argmax` of each model output in `predict_results` using {meth}`map_batches `. Then, we can compute the accuracy by comparing to the test set labels!" 
] }, { @@ -803,10 +803,10 @@ ], "source": [ "predicted_classes = predict_results.map_batches(\n", - " lambda batch: [pred.argmax(0) for pred in batch[\"predictions\"]], \n", + " lambda batch: {\"pred\": [pred.argmax(0) for pred in batch[\"predictions\"]]}, \n", " batch_format=\"pandas\"\n", ")\n", - "predicted_classes_np = predicted_classes.take_all()\n", + "predicted_classes_np = predicted_classes.take_batch(float(\"inf\"))[\"pred\"]\n", "\n", "pred_accuracy = (predicted_classes_np == y_test).astype(int).sum() / len(predicted_classes_np)\n", "print(\"Prediction Accuracy =\", pred_accuracy)" @@ -826,7 +826,7 @@ "- save and retrieve model checkpoints via Ray AIR\n", "- load a model for batch prediction\n", "\n", - "In our [other examples](air-examples-ref) you can learn how to do more things with the Ray AIR API, such as **serving your model with Ray Serve** or **tune your hyperparameters with Ray Tune**. You can also learn how to **construct Ray Datasets** to leverage Ray AIR’s **preprocessing** API.\n", + "In our [other examples](air-examples-ref) you can learn how to do more things with the Ray AIR API, such as **serving your model with Ray Serve** or **tune your hyperparameters with Ray Tune**. 
You can also learn how to **construct Ray Data** to leverage Ray AIR’s **preprocessing** API.\n", "\n", "See [this table](train-framework-catalog) for a full catalog of frameworks that AIR supports out of the box.\n", "\n", diff --git a/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb new file mode 100644 index 000000000000..f4c0953a92c5 --- /dev/null +++ b/doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb @@ -0,0 +1,1043 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(dolly_lightning_fsdp_finetuning)=\n", + "\n", + "# Fine-tune `dolly-v2-7b` with Ray AIR LightningTrainer and FSDP" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we demonstrate how to use Ray AIR to fine-tune a [`dolly-v2-7b`](https://huggingface.co/databricks/dolly-v2-7b) model. `dolly-v2-12b` is a 12 billion parameter causal language model created by Databricks, derived from EleutherAI’s [Pythia-12b](https://huggingface.co/EleutherAI/pythia-12b), and fine-tuned on a [~15K record instruction corpus](https://github.com/databrickslabs/dolly/tree/master/data).\n", + "\n", + "We load the pre-trained model from the HuggingFace model hub into a LightningModule and launch an FSDP fine-tuning job across 16 T4 GPUs with the help of {class}`Ray LightningTrainer `. It is also straightforward to fine-tune other similar large language models in a similar manner as shown in this example.\n", + "\n", + "Before starting this example, we highly recommend reading [Ray AIR Key Concepts](air-key-concepts) and [Ray Data Key Concepts](data_key_concepts)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up ray cluster \n", + "In this example, we are using a ray cluster with 16 g4dn.4xlarge instances. Each instance has one Tesla T4 GPU (16GiB Memory). 
\n", + "\n", + "We define a `runtime_env` to install the necessary Python libraries on each node. You can skip this step if you have already installed all the required packages in your workers' base image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " \"transformers>=4.26.0\",\n", + " \"torch>=1.12.0\",\n", + " \"pytorch_lightning>=2.0\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "MODEL_NAME = \"databricks/dolly-v2-7b\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare your data \n", + "We are using tiny_shakespeare for fine-tuning, which contains 40,000 lines of Shakespeare from a variety of Shakespeare's plays. Featured in Andrej Karpathy's blog post ['The Unreasonable Effectiveness of Recurrent Neural Networks'](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). \n", + "\n", + "Dataset samples:\n", + "```\n", + "BAPTISTA:\n", + "I know him well: you are welcome for his sake.\n", + "\n", + "GREMIO:\n", + "Saving your tale, Petruchio, I pray,\n", + "Let us, that are poor petitioners, speak too:\n", + "Baccare! you are marvellous forward.\n", + "\n", + "PETRUCHIO:\n", + "O, pardon me, Signior Gremio; I would fain be doing.\n", + "```\n", + "\n", + "Here, we have adopted similar pre-processing logic from another demo: {ref}`GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed `." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import ray\n", + "import pandas as pd\n", + "from datasets import load_dataset\n", + "from ray.data.preprocessors import BatchMapper, Chain\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM\n", + "\n", + "def split_text(batch: pd.DataFrame) -> pd.DataFrame:\n", + " text = list(batch[\"text\"])\n", + " flat_text = \"\".join(text)\n", + " split_text = [\n", + " x.strip()\n", + " for x in flat_text.split(\"\\n\")\n", + " if x.strip() and not x.strip()[-1] == \":\"\n", + " ]\n", + " return pd.DataFrame(split_text, columns=[\"text\"])\n", + "\n", + "\n", + "def tokenize(batch: pd.DataFrame) -> dict:\n", + " tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side=\"left\")\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"text\"]),\n", + " truncation=True,\n", + " max_length=256,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "splitter = BatchMapper(split_text, batch_format=\"pandas\")\n", + "tokenizer = BatchMapper(tokenize, batch_format=\"pandas\")\n", + "preprocessor = Chain(splitter, tokenizer)\n", + "\n", + "hf_dataset = load_dataset(\"tiny_shakespeare\")\n", + "ray_datasets = ray.data.from_huggingface(hf_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first split the original paragraphs into multiple sentences, then tokenize them. Here are some samples:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'text': 'Before we proceed any further, hear me speak.'},\n", + " {'text': 'Speak, speak.'},\n", + " {'text': 'You are all resolved rather to die than to famish?'},\n", + " {'text': 'Resolved. 
resolved.'},\n", + " {'text': 'First, you know Caius Marcius is chief enemy to the people.'},\n", + " {'text': \"We know't, we know't.\"},\n", + " {'text': \"Let us kill him, and we'll have corn at our own price.\"},\n", + " {'text': \"Is't a verdict?\"},\n", + " {'text': \"No more talking on't; let it be done: away, away!\"},\n", + " {'text': 'One word, good citizens.'}]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = ray_datasets[\"train\"]\n", + "splitter.fit_transform(ds).take(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define your lightning model\n", + "\n", + "In this example, we use the [dolly-v2-7b](https://huggingface.co/databricks/dolly-v2-7b) model for finetuning. It is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use. We load the model weights from Huggingface Model Hub and encapsulate it into a `pl.LightningModule`.\n", + "\n", + ":::{note}\n", + "Make sure you pass the FSDP wrapped model parameters `self.trainer.model.parameters()` into the optimizer, instead of `self.model.parameters()`. 
\n", + ":::\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "import pytorch_lightning as pl\n", + "\n", + "class DollyV2Model(pl.LightningModule):\n", + " def __init__(self, lr=2e-5, eps=1e-8):\n", + " super().__init__()\n", + " self.lr = lr\n", + " self.eps = eps\n", + " self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n", + " self.predictions = []\n", + " self.references = []\n", + "\n", + " def forward(self, batch):\n", + " outputs = self.model(\n", + " batch[\"input_ids\"], \n", + " attention_mask=batch[\"attention_mask\"], \n", + " labels=batch[\"labels\"]\n", + " )\n", + " return outputs.loss\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self.forward(batch)\n", + " self.log(\"train_loss\", loss, prog_bar=True, on_step=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " if self.global_rank == 0:\n", + " print(self.trainer.model)\n", + " return torch.optim.AdamW(self.trainer.model.parameters(), lr=self.lr, eps=self.eps)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure your FSDP strategy\n", + "As Dolly-v2-3b is a relatively large model, it cannot be properly fit into a single commercial GPU. In this example, we use the FSDP strategy to shard model parameters across multiple workers. This allows us to avoid GPU out-of-memory issues and support a larger global batch size.\n", + "\n", + "![](https://user-images.githubusercontent.com/26745457/236892936-d4b91751-4689-421e-ac5f-edfd2eeeb635.png)\n", + "Image source: [Fully Sharded Data Parallel: faster AI training with fewer GPUs](https://engineering.fb.com/2021/07/15/open-source/fsdp/)\n", + "\n", + ":::{note}\n", + "FSDP is a type of data parallelism that shards model parameters, optimizer states and gradients across DDP ranks. This was inspired by Xu et al. 
as well as the ZeRO Stage 3 from DeepSpeed. You may refer to these blogs for more information:\n", + "\n", + "- [Fully Sharded Data Parallel: faster AI training with fewer GPUs](https://engineering.fb.com/2021/07/15/open-source/fsdp/)\n", + "- [Getting Started with Fully Sharded Data Parallel(FSDP)](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html#:~:text=FSDP%20is%20a%20type%20of,sizes%20for%20our%20training%20job.)\n", + "- [PyTorch FSDP Tutorial](https://www.youtube.com/watch?v=8_k76AHu__s&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT)\n", + ":::\n", + "\n", + "To start trainig with Lightning's [FSDPStrategy](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.FSDPStrategy.html#lightning.pytorch.strategies.FSDPStrategy), you only need to provide the initialization arguments in `LightningConfigBuilder.strategy()`. Behind the scenes, LightningTrainer handles the cluster environment settings and job launching.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import functools\n", + "from ray.train.lightning import LightningTrainer, LightningConfigBuilder\n", + "from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig\n", + "from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy\n", + "from torch.distributed.fsdp import ShardingStrategy, BackwardPrefetch\n", + "from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer\n", + "\n", + "# Define the model sharding policy:\n", + "# Wrap every GPTNeoXLayer as its own FSDP instance\n", + "auto_wrap_policy = functools.partial(\n", + " transformer_auto_wrap_policy,\n", + " transformer_layer_cls = {GPTNeoXLayer}\n", + ")\n", + "\n", + "# Aggregate all arguments for LightningTrainer\n", + "lightning_config = (\n", + " LightningConfigBuilder()\n", + " .module(cls=DollyV2Model, lr=2e-5, eps=1e-8)\n", + " .trainer(\n", + " max_epochs=1, \n", + " accelerator=\"gpu\", \n", + " 
precision=\"16-mixed\",\n", + " )\n", + " .strategy(\n", + " name=\"fsdp\",\n", + " sharding_strategy=ShardingStrategy.FULL_SHARD,\n", + " backward_prefetch=BackwardPrefetch.BACKWARD_PRE,\n", + " forward_prefetch=True,\n", + " auto_wrap_policy=auto_wrap_policy,\n", + " limit_all_gathers=True,\n", + " activation_checkpointing=[GPTNeoXLayer],\n", + " )\n", + " .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + ":::{tip}\n", + "\n", + "Some tips for FSDP configutarion:\n", + "- `sharding_strategy`:\n", + " - `ShardingStrategy.NO_SHARD`: Parameters, gradients, and optimizer states are not sharded. Similar to DDP.\n", + " - `ShardingStrategy.SHARD_GRAD_OP`: Gradients and optimizer states are sharded during computation, and additionally, parameters are sharded outside computation. Similar to ZeRO stage-2.\n", + " - `ShardingStrategy.FULL_SHARD`: Parameters, gradients, and optimizer states are sharded. It has minimal GRAM usage among the 3 options. Similar to ZeRO stage-3.\n", + "- `auto_wrap_policy`:\n", + " - Model layers are often wrapped with FSDP in a layered fashion. This means that only the layers in a single FSDP instance are required to aggregate all parameters to a single device during forwarding or backward calculations.\n", + " - Use `transformer_auto_wrap_policy` to automatically wrap each Transformer Block into a single FSDP instance. \n", + "- `backward_prefetch` and `forward_prefetch`:\n", + " - Overlap the upcoming all-gather while executing the current forward/backward pass. It can improve throughput but may slightly increase peak memory usage.\n", + ":::" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tune with LightningTrainer\n", + "\n", + "```{note}\n", + "Here we save the checkpoints to the local file system. 
You can also upload the checkpoints to cloud storage by setting S3 bucket URI to {class}`air.RunConfig(storage_path=S3_BUCKET_URI) `. See {ref}`train-run-config` for an example.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_workers = 16\n", + "batch_size_per_worker = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-05-05 00:17:19,384\tINFO streaming_executor.py:91 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "2023-05-05 00:17:19,384\tINFO streaming_executor.py:92 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-05-05 00:17:19,385\tINFO streaming_executor.py:94 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "Running: 0.0/272.0 CPU, 0.0/16.0 GPU, 1.98 MiB/73.21 GiB object_store_memory: 0%| | 0/1 [00:00.\n", + " \r" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pytorch_lightning.callbacks import TQDMProgressBar\n", + "\n", + "# Create a customized progress bar for LightningTrainer\n", + "class DollyV2ProgressBar(TQDMProgressBar):\n", + " def __init__(self, num_iters_per_epoch, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.num_iters_per_epoch = num_iters_per_epoch\n", + " \n", + " def on_train_epoch_start(self, trainer, *_):\n", + " super().on_train_epoch_start(trainer, *_)\n", + " self.train_progress_bar.reset(self.num_iters_per_epoch)\n", + "\n", + "total_batches = 
splitter.fit_transform(ray_datasets[\"train\"]).count()\n", + "num_iters_per_epoch = total_batches // (num_workers * batch_size_per_worker)\n", + "lightning_config.trainer(callbacks=[DollyV2ProgressBar(num_iters_per_epoch)])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "
    \n", + "
    \n", + "

    Tune Status

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Current time:2023-05-05 01:03:12
    Running for: 00:45:50.28
    Memory: 35.4/124.4 GiB
    \n", + "
    \n", + "
    \n", + "
    \n", + "

    System Info

    \n", + " Using FIFO scheduling algorithm.
    Logical resource usage: 0/272 CPUs, 0/16 GPUs (0.0/16.0 accelerator_type:T4)\n", + "
    \n", + " \n", + "
    \n", + "
    \n", + "
    \n", + "

    Trial Status

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name status loc iter total time (s) train_loss epoch step
    LightningTrainer_e0990_00000TERMINATED10.0.102.147:41219 1 2699.78 0.166992 0 135
    \n", + "
    \n", + "
    \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-05-05 00:17:21,842\tWARNING trial_runner.py:1607 -- The maximum number of pending trials has been automatically set to the number of available cluster CPUs, which is high (299 CPUs/pending trials). If you're running an experiment with a large number of trials, this could lead to scheduling overhead. In this case, consider setting the `TUNE_MAX_PENDING_TRIALS_PG` environment variable to the desired maximum number of concurrent trials.\n", + "(LightningTrainer pid=41219) 2023-05-05 00:17:28,673\tINFO backend_executor.py:128 -- Starting distributed worker processes: ['41376 (10.0.102.147)', '8301 (10.0.67.96)', '8263 (10.0.103.36)', '27794 (10.0.105.149)', '8088 (10.0.110.210)', '8238 (10.0.106.19)', '8225 (10.0.81.63)', '8200 (10.0.106.22)', '8231 (10.0.90.160)', '8345 (10.0.98.168)', '28207 (10.0.76.146)', '8213 (10.0.115.72)', '8272 (10.0.92.209)', '8247 (10.0.74.31)', '27629 (10.0.68.102)', '8224 (10.0.88.86)']\n", + "(RayTrainWorker pid=41376) 2023-05-05 00:17:30,953\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=16]\n", + "\n", + "(pid=41219) Running: 0.0/272.0 CPU, 0.0/16.0 GPU, 0.0 MiB/73.21 GiB object_store_memory: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper->BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\n", + "(pid=41219) Running: 0.0/272.0 CPU, 0.0/16.0 GPU, 0.0 MiB/73.21 GiB object_store_memory: 0%| | 0/1 [00:00\n", + "

    Trial Progress

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name _report_on date done epoch experiment_taghostname iterations_since_restorenode_ip pidshould_checkpoint step time_since_restore time_this_iter_s time_total_s timestamp train_loss training_iterationtrial_id
    LightningTrainer_e0990_00000train_epoch_end2023-05-05_01-02-26True 0 0ip-10-0-102-147 110.0.102.14741219True 135 2699.78 2699.78 2699.78 1683273746 0.166992 1e0990_00000
    \n", + "
    \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=41376) `Trainer.fit` stopped: `max_epochs=1` reached.\n", + "(RayTrainWorker pid=41376) RayFSDPStrategy: tearing down strategy...\n" + ] + } + ], + "source": [ + "from ray.tune.syncer import SyncConfig\n", + "# Save AIR checkpoints according to the performance on validation set\n", + "run_config = RunConfig(\n", + " name=\"finetune_dolly-v2-7b\",\n", + " checkpoint_config=CheckpointConfig(),\n", + " sync_config=SyncConfig(sync_artifacts=False)\n", + ")\n", + "\n", + "# Scale the DDP training workload across 16 GPUs\n", + "# You can change this config based on your compute resources.\n", + "scaling_config = ScalingConfig(\n", + " num_workers=num_workers, use_gpu=True, resources_per_worker={\"CPU\": 12, \"GPU\": 1}\n", + ")\n", + "\n", + "trainer = LightningTrainer(\n", + " lightning_config=lightning_config.build(),\n", + " run_config=run_config,\n", + " scaling_config=scaling_config,\n", + " datasets={\"train\": ray_datasets[\"train\"]},\n", + " datasets_iter_config={\"batch_size\": batch_size_per_worker},\n", + " preprocessor=preprocessor,\n", + ")\n", + "result = trainer.fit()\n", + "\n", + "result\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finished training in 2361s. The price for an on-demand g4dn.4xlarge instance is `$1.204/hour`, while a g4dn.4xlarge instance costs `$2.176/hour`. The total cost would be `($1.204 * 15 + $2.176) * 2699 / 3600 = $15.17`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text-generation with HuggingFace Pipeline\n", + "\n", + "We can use the [HuggingFace Pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) to generate predictions from our fine-tuned model. 
Let's input some prompts and see if our tuned Dolly can speak like Shakespeare:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from transformers import pipeline\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side=\"right\")\n", + "\n", + "dolly = result.checkpoint.get_model(model_class=DollyV2Model, map_location=torch.device(\"cpu\"))\n", + "\n", + "nlp_pipeline = pipeline(\n", + " task=\"text-generation\", \n", + " model=dolly.model, \n", + " tokenizer=tokenizer, \n", + " device_map=\"auto\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'generated_text': 'This is the very place, my lord, where I was born.'}]\n", + "[{'generated_text': 'I am a man of a thousand lives, and I will live.'}]\n", + "[{'generated_text': 'Once more, my lord, I beseech you, hear me speak.'}]\n" + ] + } + ], + "source": [ + "for prompt in [\"This is\", \"I am\", \"Once more\"]:\n", + " print(nlp_pipeline(prompt, max_new_tokens=20, do_sample=True, pad_token_id=tokenizer.eos_token_id))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "References:\n", + "- [PyTorch FSDP Tutorial](https://www.youtube.com/watch?v=8_k76AHu__s&list=PL_lsbAsL_o2BT6aerEKgIoufVD_fodnuT)\n", + "- [Getting Started with Fully Sharded Data Parallel(FSDP)](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html#:~:text=FSDP%20is%20a%20type%20of,sizes%20for%20our%20training%20job.)\n", + "- [Fully Sharded Data Parallel: faster AI training with fewer GPUs](https://engineering.fb.com/2021/07/15/open-source/fsdp/)\n", + "- [Hugging Face: dolly-v2-7b Model Card](https://huggingface.co/databricks/dolly-v2-7b)\n", + "- [Hugging Face: Handling big models for 
inference](https://huggingface.co/docs/accelerate/usage_guides/big_modeling)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/source/ray-air/examples/feast_example.ipynb b/doc/source/ray-air/examples/feast_example.ipynb index 25462ab455ea..3631138f01ea 100644 --- a/doc/source/ray-air/examples/feast_example.ipynb +++ b/doc/source/ray-air/examples/feast_example.ipynb @@ -1062,7 +1062,7 @@ "source": [ "## Define Preprocessors\n", "\n", - "[Preprocessor](https://docs.ray.io/en/latest/ray-air/getting-started.html#preprocessors) does last mile processing on Ray Datastreams before feeding into training model." + "[Preprocessor](https://docs.ray.io/en/latest/ray-air/getting-started.html#preprocessors) does last mile processing on Ray Data before feeding into training model." ] }, { diff --git a/doc/source/ray-air/examples/gptj_batch_prediction.ipynb b/doc/source/ray-air/examples/gptj_batch_prediction.ipynb index 64b3b27fd888..148ff843ec55 100644 --- a/doc/source/ray-air/examples/gptj_batch_prediction.ipynb +++ b/doc/source/ray-air/examples/gptj_batch_prediction.ipynb @@ -95,7 +95,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since we will be using a pretrained model from Hugging Face hub, the simplest way is to use {meth}`map_batches ` with a [callable class UDF](transform_datasets_callable_classes). This will allow us to save time by initializing a model just once and then feed it multiple batches of data." 
+ "Since we will be using a pretrained model from Hugging Face hub, the simplest way is to use {meth}`map_batches ` with a [callable class UDF](transforming_data_actors). This will allow us to save time by initializing a model just once and then feed it multiple batches of data." ] }, { @@ -167,7 +167,8 @@ " PredictCallable,\n", " batch_size=4,\n", " fn_constructor_kwargs=dict(model_id=model_id, revision=revision),\n", - " compute=\"actors\",\n", + " batch_format=\"pandas\",\n", + " compute=ray.data.ActorPoolStrategy(),\n", " num_gpus=1,\n", " )\n", ")" diff --git a/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb index 3a7f92378434..27e223dbbc99 100644 --- a/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb +++ b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb @@ -1,21 +1,24 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ + "(gptj_deepspeed_finetune)=\n", + "\n", "# GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed\n", "\n", "In this example, we will showcase how to use the Ray AIR for **GPT-J fine-tuning**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", "\n", "We will use Ray AIR (with the 🤗 Transformers integration) and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", "\n", - "This example focuses more on the performance and distributed computing aspects of Ray AIR. If you are looking for a more beginner friendly introduction to Ray AIR 🤗 Transformers integration, see {doc}`this example `.\n", + "This example focuses more on the performance and distributed computing aspects of Ray AIR. 
If you are looking for a more beginner-friendly introduction to Ray AIR 🤗 Transformers integration, see {doc}`this example `.\n", "\n", "It is highly recommended to read [Ray AIR Key Concepts](air-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", "\n", "```{note}\n", - "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The amount of memory needed will depend on the model. This notebook is being tested with 16 g4dn.4xlarge instances.\n", + "To run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The required amount of memory depends on the model. This notebook is tested with 16 g4dn.4xlarge instances (including the head node). If you wish to use a CPU head node, turn on [cloud checkpointing](tune-cloud-checkpointing) to avoid OOM errors that may happen due to the default behavior of syncing the checkpoint files to the head node.\n", "```\n", "\n", "In this notebook, we will:\n", @@ -308,7 +311,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will use [Ray Data](datasets) for distributed preprocessing and data ingestion. We can easily convert the dataset obtained from Hugging Face Hub to Ray Data by using {meth}`ray.data.from_huggingface`." + "We will use [Ray Data](data) for distributed preprocessing and data ingestion. We can easily convert the dataset obtained from Hugging Face Hub to Ray Data by using {meth}`ray.data.from_huggingface`." ] }, { @@ -401,16 +404,17 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Fine-tuning the model with Ray AIR \n", "\n", - "We can now configure Ray AIR's {class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer` to perform distributed fine-tuning of the model. 
In order to do that, we specify a `trainer_init_per_worker` function, which creates a 🤗 Transformers `Trainer` that will be distributed by Ray using Distributed Data Parallelism (using PyTorch Distributed backend internally). This means that each worker will have its own copy of the model, but operate on different data, At the end of each step, all the workers will sync gradients.\n", + "We can now configure Ray AIR's {class}`~ray.train.hf_transformers.TransformersTrainer` to perform distributed fine-tuning of the model. In order to do that, we specify a `trainer_init_per_worker` function, which creates a 🤗 Transformers `Trainer` that will be distributed by Ray using Distributed Data Parallelism (using PyTorch Distributed backend internally). This means that each worker will have its own copy of the model, but operate on different data, At the end of each step, all the workers will sync gradients.\n", "\n", "Because GPT-J is a relatively large model, it may not be possible to fit it on smaller GPU types (<=16 GB GRAM). To deal with that issue, we can use [DeepSpeed](https://github.com/microsoft/DeepSpeed), a library to optimize the training process and allow us to (among other things) offload and partition optimizer and parameter states, reducing GRAM usage. Furthermore, DeepSpeed ZeRO Stage 3 allows us to load large models without running out of memory.\n", "\n", - "🤗 Transformers and Ray AIR's integration ({class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer`) allow you to easily configure and use DDP and DeepSpeed. All you need to do is specify the DeepSpeed configuration in the [`TrainingArguments`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments) object.\n", + "🤗 Transformers and Ray AIR's integration ({class}`~ray.train.hf_transformers.TransformersTrainer`) allow you to easily configure and use DDP and DeepSpeed. 
All you need to do is specify the DeepSpeed configuration in the [`TrainingArguments`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments) object.\n", "\n", "```{tip}\n", "There are many DeepSpeed settings that allow you to trade-off speed for memory usage. The settings used below are tailored to the cluster setup used (16 g4dn.4xlarge nodes) and per device batch size of 16. Some things to keep in mind:\n", @@ -559,15 +563,16 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "With our `trainer_init_per_worker` complete, we can now instantiate the {class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", + "With our `trainer_init_per_worker` complete, we can now instantiate the {class}`~ray.train.hf_transformers.TransformersTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", "\n", "We pass the preprocessors we have defined earlier as an argument, wrapped in a {class}`~ray.data.preprocessors.chain.Chain`. The preprocessor will be included with the returned {class}`~ray.air.checkpoint.Checkpoint`, meaning it will also be applied during inference.\n", "\n", "```{note}\n", - "If you want to upload checkpoints to cloud storage (eg. S3), use {class}`~ray.tune.syncer.SyncConfig` - see {ref}`train-config-sync` for an example. Using cloud storage is highly recommended, especially for production.\n", + "If you want to upload checkpoints to cloud storage (eg. S3), set {class}`air.RunConfig(storage_path) `. See {ref}`train-run-config` for an example. 
Using cloud storage is highly recommended, especially for production.\n", "```" ] }, @@ -577,12 +582,12 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.train.huggingface import HuggingFaceTrainer\n", + "from ray.train.hf_transformers import TransformersTrainer\n", "from ray.air.config import ScalingConfig\n", "from ray.data.preprocessors import Chain\n", "\n", "\n", - "trainer = HuggingFaceTrainer(\n", + "trainer = TransformersTrainer(\n", " trainer_init_per_worker=trainer_init_per_worker,\n", " trainer_init_config={\n", " \"batch_size\": 16, # per device\n", @@ -599,10 +604,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we call the {meth}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer.fit` method to start training with Ray AIR. We will save the {class}`~ray.air.Result` object to a variable so we can access metrics and checkpoints." + "Finally, we call the {meth}`~ray.train.hf_transformers.TransformersTrainer.fit` method to start training with Ray AIR. We will save the {class}`~ray.air.Result` object to a variable so we can access metrics and checkpoints." ] }, { @@ -640,7 +646,7 @@ "Trial name status loc iter total time (s) loss learning_rate epoch\n", "\n", "\n", - "HuggingFaceTrainer_f623d_00000TERMINATED10.0.30.196:30861 85 2579.30.0715 4.70588e-07 1\n", + "TransformersTrainer_f623d_00000TERMINATED10.0.30.196:30861 85 2579.30.0715 4.70588e-07 1\n", "\n", "\n", "
    \n", @@ -977,7 +983,7 @@ { "data": { "text/plain": [ - "HuggingFaceCheckpoint(local_path=/home/ray/ray_results/HuggingFaceTrainer_2023-03-06_16-35-29/HuggingFaceTrainer_f623d_00000_0_2023-03-06_16-35-30/checkpoint_000000)" + "TransformersCheckpoint(local_path=/home/ray/ray_results/TransformersTrainer_2023-03-06_16-35-29/TransformersTrainer_f623d_00000_0_2023-03-06_16-35-30/checkpoint_000000)" ] }, "execution_count": 18, @@ -996,13 +1002,13 @@ "source": [ "### Generate text from prompt\n", "\n", - "We can use the {class}`~ray.train.huggingface.huggingface_predictor.HuggingFacePredictor` to generate predictions from our fine-tuned model.\n", + "We can use the {class}`~ray.train.hf_transformers.huggingface_predictor.TransformersPredictor` to generate predictions from our fine-tuned model.\n", "\n", "```{tip}\n", "For large scale batch inference, consider configuring cloud checkpointing and then pass the cloud-backed {class}`~ray.air.checkpoint.Checkpoint` to {class}`~ray.train.batch_predictor.BatchPredictor`. More information [here](air-predictors).\n", "```\n", "\n", - "Because the {class}`~ray.train.huggingface.huggingface_predictor.HuggingFacePredictor` uses a 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) under the hood, we disable the tokenizer AIR Preprocessor we have used for training and let the `pipeline` to tokenize the data itself." + "Because the {class}`~ray.train.hf_transformers.huggingface_predictor.TransformersPredictor` uses a 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) under the hood, we disable the tokenizer AIR Preprocessor we have used for training and let the `pipeline` to tokenize the data itself." 
] }, { @@ -1028,13 +1034,13 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.train.huggingface import HuggingFacePredictor\n", + "from ray.train.hf_transformers import TransformersPredictor\n", "import pandas as pd\n", "\n", "prompts = pd.DataFrame([\"Romeo and Juliet\", \"Romeo\", \"Juliet\"], columns=[\"text\"])\n", "\n", "# Predict on the head node.\n", - "predictor = HuggingFacePredictor.from_checkpoint(\n", + "predictor = TransformersPredictor.from_checkpoint(\n", " checkpoint=checkpoint,\n", " task=\"text-generation\",\n", " torch_dtype=torch.float16 if use_gpu else None,\n", diff --git a/doc/source/ray-air/examples/huggingface_text_classification.ipynb b/doc/source/ray-air/examples/huggingface_text_classification.ipynb index 59d25e4a5164..6823ea3d2279 100644 --- a/doc/source/ray-air/examples/huggingface_text_classification.ipynb +++ b/doc/source/ray-air/examples/huggingface_text_classification.ipynb @@ -433,7 +433,7 @@ "id": "256fOuzjhYbY" }, "source": [ - "For Ray AIR, instead of using 🤗 Dataset objects directly, we will convert them to [Ray Datastreams](https://docs.ray.io/en/latest/data/dataset.html). Both are backed by Arrow tables, so the conversion is straightforward. We will use the built-in `ray.data.from_huggingface` function." + "For Ray AIR, instead of using 🤗 Dataset objects directly, we will convert them to [Ray Data](data). Both are backed by Arrow tables, so the conversion is straightforward. We will use the built-in `ray.data.from_huggingface` function." 
] }, { @@ -444,9 +444,9 @@ { "data": { "text/plain": [ - "{'train': Datastream(num_blocks=1, num_rows=8551, schema={sentence: string, label: int64, idx: int32}),\n", - " 'validation': Datastream(num_blocks=1, num_rows=1043, schema={sentence: string, label: int64, idx: int32}),\n", - " 'test': Datastream(num_blocks=1, num_rows=1063, schema={sentence: string, label: int64, idx: int32})}" + "{'train': Dataset(num_blocks=1, num_rows=8551, schema={sentence: string, label: int64, idx: int32}),\n", + " 'validation': Dataset(num_blocks=1, num_rows=1043, schema={sentence: string, label: int64, idx: int32}),\n", + " 'test': Dataset(num_blocks=1, num_rows=1063, schema={sentence: string, label: int64, idx: int32})}" ] }, "execution_count": 11, @@ -522,13 +522,13 @@ "\n", "We will not go into details about each specific component of the training (see the [original notebook](https://github.com/huggingface/notebooks/blob/6ca682955173cc9d36ffa431ddda505a048cbe80/examples/text_classification.ipynb) for that). The tokenizer is the same as we have used to encoded the dataset before.\n", "\n", - "The main difference when using the Ray AIR is that we need to create our 🤗 Transformers `Trainer` inside a function (`trainer_init_per_worker`) and return it. That function will be passed to the `HuggingFaceTrainer` and will run on every Ray worker. The training will then proceed by the means of PyTorch DDP.\n", + "The main difference when using the Ray AIR is that we need to create our 🤗 Transformers `Trainer` inside a function (`trainer_init_per_worker`) and return it. That function will be passed to the `TransformersTrainer` and will run on every Ray worker. The training will then proceed by the means of PyTorch DDP.\n", "\n", "Make sure that you initialize the model, metric, and tokenizer inside that function. Otherwise, you may run into serialization errors.\n", "\n", "Furthermore, `push_to_hub=True` is not yet supported. 
Ray will, however, checkpoint the model at every epoch, allowing you to push it to hub manually. We will do that after the training.\n", "\n", - "If you wish to use thrid party logging libraries, such as MLflow or Weights&Biases, do not set them in `TrainingArguments` (they will be automatically disabled) - instead, you should pass Ray AIR callbacks to `HuggingFaceTrainer`'s `run_config`. In this example, we will use MLflow." + "If you wish to use thrid party logging libraries, such as MLflow or Weights&Biases, do not set them in `TrainingArguments` (they will be automatically disabled) - instead, you should pass Ray AIR callbacks to `TransformersTrainer`'s `run_config`. In this example, we will use MLflow." ] }, { @@ -596,7 +596,7 @@ "id": "CdzABDVcIrJg" }, "source": [ - "With our `trainer_init_per_worker` complete, we can now instantiate the `HuggingFaceTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", + "With our `trainer_init_per_worker` complete, we can now instantiate the `TransformersTrainer`. Aside from the function, we set the `scaling_config`, controlling the amount of workers and resources used, and the `datasets` we will use for training and evaluation.\n", "\n", "We specify the `MLflowLoggerCallback` inside the `run_config`, and pass the preprocessor we have defined earlier as an argument. The preprocessor will be included with the returned `Checkpoint`, meaning it will also be applied during inference." 
] @@ -609,11 +609,11 @@ }, "outputs": [], "source": [ - "from ray.train.huggingface import HuggingFaceTrainer\n", + "from ray.train.hf_transformers import TransformersTrainer\n", "from ray.air.config import RunConfig, ScalingConfig, CheckpointConfig\n", "from ray.air.integrations.mlflow import MLflowLoggerCallback\n", "\n", - "trainer = HuggingFaceTrainer(\n", + "trainer = TransformersTrainer(\n", " trainer_init_per_worker=trainer_init_per_worker,\n", " scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),\n", " datasets={\n", @@ -656,12 +656,12 @@ { "data": { "text/html": [ - "== Status ==
    Current time: 2022-08-25 10:14:09 (running for 00:04:06.45)
    Memory usage on this node: 4.3/62.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
    Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-10-02
    Number of trials: 1/1 (1 TERMINATED)
    \n", + "== Status ==
    Current time: 2022-08-25 10:14:09 (running for 00:04:06.45)
    Memory usage on this node: 4.3/62.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
    Result logdir: /home/ray/ray_results/TransformersTrainer_2022-08-25_10-10-02
    Number of trials: 1/1 (1 TERMINATED)
    \n", "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "
    Trial name status loc iter total time (s) loss learning_rate epoch
    HuggingFaceTrainer_c1ff5_00000TERMINATED172.31.90.137:947 2 200.2170.3886 0 2
    TransformersTrainer_c1ff5_00000TERMINATED172.31.90.137:947 2 200.2170.3886 0 2


    " ], @@ -823,7 +823,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_c1ff5_00000:\n", + "Result for TransformersTrainer_c1ff5_00000:\n", " _time_this_iter_s: 90.87123560905457\n", " _timestamp: 1661447540\n", " _training_iteration: 1\n", @@ -923,7 +923,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_c1ff5_00000:\n", + "Result for TransformersTrainer_c1ff5_00000:\n", " _time_this_iter_s: 96.96447467803955\n", " _timestamp: 1661447637\n", " _training_iteration: 2\n", @@ -957,7 +957,7 @@ " trial_id: c1ff5_00000\n", " warmup_time: 0.003858327865600586\n", " \n", - "Result for HuggingFaceTrainer_c1ff5_00000:\n", + "Result for TransformersTrainer_c1ff5_00000:\n", " _time_this_iter_s: 96.96447467803955\n", " _timestamp: 1661447637\n", " _training_iteration: 2\n", @@ -1029,7 +1029,7 @@ { "data": { "text/plain": [ - "Result(metrics={'loss': 0.3886, 'learning_rate': 0.0, 'epoch': 2.0, 'step': 1070, 'eval_loss': 0.6215357184410095, 'eval_matthews_correlation': 0.42957017514952434, 'eval_runtime': 0.9956, 'eval_samples_per_second': 273.204, 'eval_steps_per_second': 5.022, 'train_runtime': 174.4696, 'train_samples_per_second': 98.023, 'train_steps_per_second': 6.133, 'train_loss': 0.4661755713346963, '_timestamp': 1661447637, '_time_this_iter_s': 96.96447467803955, '_training_iteration': 2, 'should_checkpoint': True, 'done': True, 'trial_id': 'c1ff5_00000', 'experiment_tag': '0'}, error=None, log_dir=PosixPath('/home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-10-02/HuggingFaceTrainer_c1ff5_00000_0_2022-08-25_10-10-04'))" + "Result(metrics={'loss': 0.3886, 'learning_rate': 0.0, 'epoch': 2.0, 'step': 1070, 'eval_loss': 0.6215357184410095, 'eval_matthews_correlation': 0.42957017514952434, 'eval_runtime': 0.9956, 'eval_samples_per_second': 273.204, 'eval_steps_per_second': 5.022, 'train_runtime': 174.4696, 'train_samples_per_second': 98.023, 'train_steps_per_second': 6.133, 'train_loss': 
0.4661755713346963, '_timestamp': 1661447637, '_time_this_iter_s': 96.96447467803955, '_training_iteration': 2, 'should_checkpoint': True, 'done': True, 'trial_id': 'c1ff5_00000', 'experiment_tag': '0'}, error=None, log_dir=PosixPath('/home/ray/ray_results/TransformersTrainer_2022-08-25_10-10-02/TransformersTrainer_c1ff5_00000_0_2022-08-25_10-10-04'))" ] }, "execution_count": 16, @@ -1052,7 +1052,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If we would like to tune any hyperparameters of the model, we can do so by simply passing our `HuggingFaceTrainer` into a `Tuner` and defining the search space.\n", + "If we would like to tune any hyperparameters of the model, we can do so by simply passing our `TransformersTrainer` into a `Tuner` and defining the search space.\n", "\n", "We can also take advantage of the advanced search algorithms and schedulers provided by Ray Tune. In this example, we will use an `ASHAScheduler` to aggresively terminate underperforming trials." ] @@ -1099,15 +1099,15 @@ "data": { "text/html": [ "== Status ==
    Current time: 2022-08-25 10:20:13 (running for 00:06:01.75)
    Memory usage on this node: 4.4/62.0 GiB
    Using AsyncHyperBand: num_stopped=4\n", - "Bracket: Iter 4.000: -0.8064090609550476 | Iter 1.000: -0.6378736793994904
    Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
    Current best trial: 5654d_00001 with eval_loss=0.6492420434951782 and parameters={'trainer_init_config': {'learning_rate': 0.0002, 'epochs': 4}}
    Result logdir: /home/ray/ray_results/HuggingFaceTrainer_2022-08-25_10-14-11
    Number of trials: 4/4 (4 TERMINATED)
    \n", + "Bracket: Iter 4.000: -0.8064090609550476 | Iter 1.000: -0.6378736793994904
    Resources requested: 0/208 CPUs, 0/16 GPUs, 0.0/574.34 GiB heap, 0.0/241.51 GiB objects (0.0/4.0 accelerator_type:T4)
    Current best trial: 5654d_00001 with eval_loss=0.6492420434951782 and parameters={'trainer_init_config': {'learning_rate': 0.0002, 'epochs': 4}}
    Result logdir: /home/ray/ray_results/TransformersTrainer_2022-08-25_10-14-11
    Number of trials: 4/4 (4 TERMINATED)
    \n", "\n", "\n", "\n", "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", "\n", "
    Trial name status loc trainer_init_conf... iter total time (s) loss learning_rate epoch
    HuggingFaceTrainer_5654d_00000TERMINATED172.31.90.137:1729 2e-05 4 347.171 0.1958 0 4
    HuggingFaceTrainer_5654d_00001TERMINATED172.31.76.237:1805 0.0002 1 95.24920.6225 0.00015 1
    HuggingFaceTrainer_5654d_00002TERMINATED172.31.85.32:1322 0.002 1 93.76130.6463 0.0015 1
    HuggingFaceTrainer_5654d_00003TERMINATED172.31.85.193:1060 0.02 1 99.36770.926 0.015 1
    TransformersTrainer_5654d_00000TERMINATED172.31.90.137:1729 2e-05 4 347.171 0.1958 0 4
    TransformersTrainer_5654d_00001TERMINATED172.31.76.237:1805 0.0002 1 95.24920.6225 0.00015 1
    TransformersTrainer_5654d_00002TERMINATED172.31.85.32:1322 0.002 1 93.76130.6463 0.0015 1
    TransformersTrainer_5654d_00003TERMINATED172.31.85.193:1060 0.02 1 99.36770.926 0.015 1


    " ], @@ -1354,7 +1354,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00000:\n", + "Result for TransformersTrainer_5654d_00000:\n", " _time_this_iter_s: 85.01727724075317\n", " _timestamp: 1661447753\n", " _training_iteration: 1\n", @@ -1419,7 +1419,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00001:\n", + "Result for TransformersTrainer_5654d_00001:\n", " _time_this_iter_s: 84.79700112342834\n", " _timestamp: 1661447759\n", " _training_iteration: 1\n", @@ -1484,7 +1484,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00002:\n", + "Result for TransformersTrainer_5654d_00002:\n", " _time_this_iter_s: 84.01720070838928\n", " _timestamp: 1661447764\n", " _training_iteration: 1\n", @@ -1549,7 +1549,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00003:\n", + "Result for TransformersTrainer_5654d_00003:\n", " _time_this_iter_s: 89.4301290512085\n", " _timestamp: 1661447782\n", " _training_iteration: 1\n", @@ -1614,7 +1614,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00000:\n", + "Result for TransformersTrainer_5654d_00000:\n", " _time_this_iter_s: 76.82565689086914\n", " _timestamp: 1661447830\n", " _training_iteration: 2\n", @@ -1679,7 +1679,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Result for HuggingFaceTrainer_5654d_00000:\n", + "Result for TransformersTrainer_5654d_00000:\n", " _time_this_iter_s: 76.47252488136292\n", " _timestamp: 1661447906\n", " _training_iteration: 3\n", @@ -1767,7 +1767,7 @@ "output_type": "stream", "text": [ "(RayTrainWorker pid=1789, ip=172.31.90.137) {'train_runtime': 329.1948, 'train_samples_per_second': 103.902, 'train_steps_per_second': 6.501, 'train_loss': 0.34860724689804506, 'epoch': 4.0}\n", - "Result for HuggingFaceTrainer_5654d_00000:\n", + "Result for 
TransformersTrainer_5654d_00000:\n", " _time_this_iter_s: 98.92064905166626\n", " _timestamp: 1661448005\n", " _training_iteration: 4\n", @@ -1894,7 +1894,7 @@ " 0.003661\n", " 4\n", " 0.00020\n", - " /home/ray/ray_results/HuggingFaceTrainer_2022-...\n", + " /home/ray/ray_results/TransformersTrainer_2022-...\n", " \n", " \n", " 3\n", @@ -1918,7 +1918,7 @@ " 0.004133\n", " 4\n", " 0.02000\n", - " /home/ray/ray_results/HuggingFaceTrainer_2022-...\n", + " /home/ray/ray_results/TransformersTrainer_2022-...\n", " \n", " \n", " 2\n", @@ -1942,7 +1942,7 @@ " 0.004533\n", " 4\n", " 0.00200\n", - " /home/ray/ray_results/HuggingFaceTrainer_2022-...\n", + " /home/ray/ray_results/TransformersTrainer_2022-...\n", " \n", " \n", " 0\n", @@ -1966,7 +1966,7 @@ " 0.003702\n", " 4\n", " 0.00002\n", - " /home/ray/ray_results/HuggingFaceTrainer_2022-...\n", + " /home/ray/ray_results/TransformersTrainer_2022-...\n", " \n", " \n", "\n", @@ -2005,10 +2005,10 @@ "0 4 0.00002 \n", "\n", " logdir \n", - "1 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", - "3 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", - "2 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", - "0 /home/ray/ray_results/HuggingFaceTrainer_2022-... \n", + "1 /home/ray/ray_results/TransformersTrainer_2022-... \n", + "3 /home/ray/ray_results/TransformersTrainer_2022-... \n", + "2 /home/ray/ray_results/TransformersTrainer_2022-... \n", + "0 /home/ray/ray_results/TransformersTrainer_2022-... \n", "\n", "[4 rows x 33 columns]" ] @@ -2044,7 +2044,7 @@ "id": "Tfoyu1q7hYbb" }, "source": [ - "You can now use the checkpoint to run prediction with `HuggingFacePredictor`, which wraps around [🤗 Pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines). In order to distribute prediction, we use `BatchPredictor`. While this is not necessary for the very small example we are using (you could use `HuggingFacePredictor` directly), it will scale well to a large dataset." 
+ "You can now use the checkpoint to run prediction with `TransformersPredictor`, which wraps around [🤗 Pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines). In order to distribute prediction, we use `BatchPredictor`. While this is not necessary for the very small example we are using (you could use `TransformersPredictor` directly), it will scale well to a large dataset." ] }, { @@ -2096,17 +2096,17 @@ } ], "source": [ - "from ray.train.huggingface import HuggingFacePredictor\n", + "from ray.train.hf_transformers import TransformersPredictor\n", "from ray.train.batch_predictor import BatchPredictor\n", "import pandas as pd\n", "\n", "predictor = BatchPredictor.from_checkpoint(\n", " checkpoint=best_result.checkpoint,\n", - " predictor_cls=HuggingFacePredictor,\n", + " predictor_cls=TransformersPredictor,\n", " task=\"text-classification\",\n", " device=0 if use_gpu else -1, # -1 is CPU, otherwise device index\n", ")\n", - "prediction = predictor.predict(ray_datasets[\"test\"].map_batches(lambda x: x[[\"sentence\"]]), num_gpus_per_worker=int(use_gpu))\n", + "prediction = predictor.predict(ray_datasets[\"test\"].map_batches(lambda x: x[[\"sentence\"]], batch_format=\"pandas\"), num_gpus_per_worker=int(use_gpu))\n", "prediction.show()" ] }, @@ -2189,9 +2189,9 @@ }, "outputs": [], "source": [ - "from ray.train.huggingface import HuggingFaceCheckpoint\n", + "from ray.train.hf_transformers import TransformersCheckpoint\n", "\n", - "checkpoint = HuggingFaceCheckpoint.from_checkpoint(result.checkpoint)\n", + "checkpoint = TransformersCheckpoint.from_checkpoint(result.checkpoint)\n", "hf_trainer = checkpoint.get_model(model=AutoModelForSequenceClassification)" ] }, diff --git a/doc/source/ray-air/examples/index.rst b/doc/source/ray-air/examples/index.rst index 252364456b63..8c29a9645cde 100644 --- a/doc/source/ray-air/examples/index.rst +++ b/doc/source/ray-air/examples/index.rst @@ -29,6 +29,8 @@ Text/NLP - 
:doc:`/ray-air/examples/gptj_batch_prediction`: How to use Ray AIR to do batch prediction with the Hugging Face Transformers GPT-J model. - :doc:`/ray-air/examples/gptj_serving`: How to use Ray AIR to do online serving with the Hugging Face Transformers GPT-J model. - :doc:`/ray-air/examples/dreambooth_finetuning`: How to fine-tune a DreamBooth text-to-image model with your own images. +- :doc:`/ray-air/examples/opt_deepspeed_batch_inference`: How to run batch inference on a dataset of texts with a 30B OPT model. +- :doc:`/ray-air/examples/dolly_lightning_fsdp_finetuning`: How to fine-tune a dolly-v2-7b model with Ray AIR LightningTrainer and FSDP. Image/CV -------- diff --git a/doc/source/ray-air/examples/lightgbm_example.ipynb b/doc/source/ray-air/examples/lightgbm_example.ipynb index 13ef9da63285..420a1c895b58 100644 --- a/doc/source/ray-air/examples/lightgbm_example.ipynb +++ b/doc/source/ray-air/examples/lightgbm_example.ipynb @@ -51,7 +51,7 @@ "from ray.data.preprocessors.encoder import Categorizer\n", "from ray.train.lightgbm import LightGBMTrainer\n", "from ray.air.config import ScalingConfig\n", - "from ray.data import Datastream\n", + "from ray.data import Dataset\n", "from ray.air.result import Result\n", "from ray.data.preprocessors import StandardScaler" ] @@ -71,7 +71,7 @@ "metadata": {}, "outputs": [], "source": [ - "def prepare_data() -> Tuple[Datastream, Datastream, Datastream]:\n", + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", " test_dataset = valid_dataset.drop_columns(cols=[\"target\"])\n", @@ -210,8 +210,8 @@ "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)\n", "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: 
FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", "\u001b[2m\u001b[36m(pid=1491578)\u001b[0m from pandas import MultiIndex, Int64Index\n", - "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Datastream 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Datastream 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", "\u001b[2m\u001b[36m(LightGBMTrainer pid=1491578)\u001b[0m UserWarning: cpus_per_actor is set to less than 2. Distributed LightGBM needs at least 2 CPUs per actor to train efficiently. This may lead to a degradation of performance during training.\n", "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", "\u001b[2m\u001b[36m(pid=1491651)\u001b[0m from pandas import MultiIndex, Int64Index\n", diff --git a/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb new file mode 100644 index 000000000000..e5945910fe0c --- /dev/null +++ b/doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1,911 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "dfdf1047", + "metadata": {}, + "source": [ + "# Batch Inference with OPT 30B and Ray Data\n", + "\n", + "This notebook was tested on a single p3.16xlarge instance with 8 V100 GPUs.\n", + "\n", + "## Set Up\n", + "Initialize Ray and a runtime environment to ensure that all dependent packages are available." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "36bb842b-b6b6-4cbc-a4f9-a3a65ec069ce", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-04-22 11:12:15,071\tINFO worker.py:1314 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS\n", + "fatal: not a git repository (or any parent up to mount point /home/ray)\n", + "Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\n", + "2023-04-22 11:12:15,676\tINFO worker.py:1432 -- Connecting to existing Ray cluster at address: 172.31.244.129:9031...\n", + "2023-04-22 11:12:15,724\tINFO worker.py:1607 -- Connected to Ray cluster. 
View the dashboard at https://console.anyscale.com/api/v2/sessions/ses_jgkdnu2723aleytwqqhebr12vs/services?redirect_to=dashboard \n", + "2023-04-22 11:12:15,732\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_7ad665e3661cefc8f8037daeb0b5ba6e.zip' (0.03MiB) to Ray cluster...\n", + "2023-04-22 11:12:15,733\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_7ad665e3661cefc8f8037daeb0b5ba6e.zip'.\n" + ] + }, + { + "data": { + "text/html": [ + "
    \n", + "
    \n", + "

    Ray

    \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
    Python version:3.9.15
    Ray version: 3.0.0.dev0
    Dashboard:http://console.anyscale.com/api/v2/sessions/ses_jgkdnu2723aleytwqqhebr12vs/services?redirect_to=dashboard
    \n", + "
    \n", + "
    \n" + ], + "text/plain": [ + "RayContext(dashboard_url='console.anyscale.com/api/v2/sessions/ses_jgkdnu2723aleytwqqhebr12vs/services?redirect_to=dashboard', python_version='3.9.15', ray_version='3.0.0.dev0', ray_commit='17df2ef17983406bb178c251044c9dc654b378c0', address_info={'node_ip_address': '172.31.244.129', 'raylet_ip_address': '172.31.244.129', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-04-22_11-09-11_790337_150/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-04-22_11-09-11_790337_150/sockets/raylet', 'webui_url': 'console.anyscale.com/api/v2/sessions/ses_jgkdnu2723aleytwqqhebr12vs/services?redirect_to=dashboard', 'session_dir': '/tmp/ray/session_2023-04-22_11-09-11_790337_150', 'metrics_export_port': 61073, 'gcs_address': '172.31.244.129:9031', 'address': '172.31.244.129:9031', 'dashboard_agent_listen_port': 52365, 'node_id': 'e6e9dfeda4469dd816c080bec2cf1cd12abdd978ae74b87e869164eb'})" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"numpy==1.23\",\n", + " \"protobuf==3.20.0\",\n", + " \"transformers==4.27.2\",\n", + " \"accelerate==0.17.1\",\n", + " \"deepspeed==0.8.3\",\n", + " ],\n", + " \"env_vars\": {\n", + " \"HF_HUB_DISABLE_PROGRESS_BARS\": \"1\",\n", + " }\n", + " }\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b619a878", + "metadata": {}, + "source": [ + "## Define Hyperparameters\n", + "\n", + "Define a list of hyperparameters as a global dataclass.\n", + "\n", + "Refer to https://deepspeed.readthedocs.io/en/stable/inference-init.html#deepspeed.inference.config.DeepSpeedInferenceConfig for more details about the configurations of a DeepSpeed inference job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "613df744", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from typing import Optional\n", + "\n", + "\n", + "@dataclass\n", + "class Config:\n", + " model_name: str = \"facebook/opt-30b\"\n", + " # Path to HuggingFace cache directory. Default is ~/.cache/huggingface/.\n", + " cache_dir: Optional[str] = None\n", + " # Path to the directory that actually holds model files.\n", + " # e.g., ~/.cache/huggingface/models--facebook--opt-30b/snapshots/xxx/\n", + " # If this path is not None, we skip download models from HuggingFace.\n", + " repo_root: Optional[str] = None\n", + " # This is how many DeepSpeed-inference replicas to run for\n", + " # this batch inference job.\n", + " num_worker_groups: int = 1\n", + " # Number of DeepSpeed workers per group.\n", + " num_workers_per_group: int = 8\n", + "\n", + " batch_size: int = 1\n", + " dtype: str = \"float16\"\n", + " # Maximum number of tokens DeepSpeed inference-engine can work with,\n", + " # including the input and output tokens.\n", + " max_tokens: int = 1024\n", + " # Use meta tensors to initialize model.\n", + " use_meta_tensor: bool = True\n", + " # Use cache for generation.\n", + " use_cache: bool = True\n", + " # The path for which we want to save the loaded model with a checkpoint.\n", + " save_mp_checkpoint_path: Optional[str] = None\n", + "\n", + "\n", + "config = Config()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "28df05bf", + "metadata": {}, + "source": [ + "## Download and Cache Model\n", + "\n", + "Next, we will download and cache model files on all instances of the cluster before we run the job.\n", + "\n", + "Notice that when we download model snapshots from HuggingFace, we skip files that end with safetensors, msgpack, and h5 extensions. These are Tensorflow and JAX weight files. 
We only need PyTorch weights for this example.\n", + "\n", + "We execute the ``download_model()`` function on every node of the cluster by using a ``NodeAffinitySchedulingStrategy`` from Ray Core." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "63b8a84d-57a6-4430-8fe8-9811760b8b7c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Caching model locally ...\n", + "Done. Model saved in /home/ray/.cache/huggingface/hub/models--facebook--opt-30b/snapshots/ceea0a90ac0f6fae7c2c34bcb40477438c152546\n" + ] + } + ], + "source": [ + "\n", + "from huggingface_hub import snapshot_download\n", + "import ray\n", + "from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy\n", + "\n", + "\n", + "@ray.remote\n", + "def download_model(config: Config):\n", + " # This function downloads the specified HF model into a local directory.\n", + " # This can also download models from cloud storages like S3.\n", + " return snapshot_download(\n", + " repo_id=config.model_name,\n", + " cache_dir=config.cache_dir,\n", + " allow_patterns=[\"*\"],\n", + " # Skip downloading TF and FLAX weight files.\n", + " ignore_patterns=[\"*.safetensors\", \"*.msgpack\", \"*.h5\"],\n", + " revision=None,\n", + " )\n", + "\n", + "if config.repo_root is None:\n", + " # Download model files to all GPU nodes, and set correct repo_root.\n", + " refs = []\n", + " for node in ray.nodes():\n", + " if node[\"Alive\"] and node[\"Resources\"].get(\"GPU\", None):\n", + " node_id = node[\"NodeID\"]\n", + " scheduling_strategy = NodeAffinitySchedulingStrategy(\n", + " node_id=node_id, soft=False\n", + " )\n", + " options = {\"scheduling_strategy\": scheduling_strategy}\n", + " refs.append(\n", + " download_model.options(scheduling_strategy=scheduling_strategy).remote(config)\n", + " )\n", + "\n", + " print(\"Caching model locally ...\")\n", + "\n", + " # Wait for models to finish downloading.\n", + " config.repo_root = 
ray.get(refs)[0]\n", + "\n", + " print(f\"Done. Model saved in {config.repo_root}\")\n", + "else:\n", + " print(f\"Using existing model saved in {config.repo_root}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6b14b7d9", + "metadata": {}, + "source": [ + "## Define DeepSpeed Utility Classes\n", + "\n", + "Next, we define a few utility classes and functions that are useful for setting up and running the DeepSpeed inference job.\n", + "\n", + "Note that the Pipeline is modeled after https://github.com/microsoft/DeepSpeedExamples/tree/efacebb3ddbea86bb20c3af30fd060be0fa41ac8/inference/huggingface/text-generation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f9aad2a9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", + " from pandas import MultiIndex, Int64Index\n" + ] + } + ], + "source": [ + "import gc\n", + "import io\n", + "import json\n", + "import math\n", + "import os\n", + "from pathlib import Path\n", + "from typing import List\n", + "\n", + "import deepspeed\n", + "import torch\n", + "from deepspeed.runtime.utils import see_memory_usage\n", + "from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "\n", + "class DSPipeline:\n", + " \"\"\"\n", + " Example helper class for comprehending DeepSpeed Meta Tensors, meant to mimic HF pipelines.\n", + " The DSPipeline can run with and without meta tensors.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_name,\n", + " dtype=torch.float16,\n", + " is_meta=True,\n", + " device=-1,\n", + " repo_root=None,\n", + " ):\n", + " self.model_name = model_name\n", + " self.dtype = dtype\n", + "\n", + " if isinstance(device, torch.device):\n", + " self.device = device\n", + " elif isinstance(device, str):\n", + " self.device = torch.device(device)\n", + " elif device < 0:\n", + " self.device = torch.device(\"cpu\")\n", + " else:\n", + " self.device = torch.device(f\"cuda:{device}\")\n", + "\n", + " self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side=\"right\")\n", + " self.tokenizer.pad_token = self.tokenizer.eos_token\n", + "\n", + " if is_meta:\n", + " # When meta tensors enabled, use checkpoints\n", + " self.config = AutoConfig.from_pretrained(self.model_name)\n", + " self.checkpoints_json = self._generate_json(repo_root)\n", + "\n", + " with deepspeed.OnDevice(dtype=dtype, device=\"meta\"):\n", + " self.model = AutoModelForCausalLM.from_config(self.config)\n", + " else:\n", + " self.model = AutoModelForCausalLM.from_pretrained(self.model_name)\n", + "\n", + " self.model.eval()\n", + "\n", + " def __call__(self, inputs, **kwargs):\n", + " input_list = [inputs] if isinstance(inputs, str) else inputs\n", + 
" outputs = self.generate_outputs(input_list, **kwargs)\n", + " return outputs\n", + "\n", + " def _generate_json(self, repo_root):\n", + " if os.path.exists(os.path.join(repo_root, \"ds_inference_config.json\")):\n", + " # Simply use the available inference config.\n", + " return os.path.join(repo_root, \"ds_inference_config.json\")\n", + "\n", + " # Write a checkpoints config file in local directory.\n", + " checkpoints_json = \"checkpoints.json\"\n", + "\n", + " with io.open(checkpoints_json, \"w\", encoding=\"utf-8\") as f:\n", + " file_list = [\n", + " str(entry).split(\"/\")[-1]\n", + " for entry in Path(repo_root).rglob(\"*.[bp][it][n]\")\n", + " if entry.is_file()\n", + " ]\n", + " data = {\n", + " # Hardcode bloom for now.\n", + " # Possible choices are \"bloom\", \"ds_model\", \"Megatron\".\n", + " \"type\": \"bloom\",\n", + " \"checkpoints\": file_list,\n", + " \"version\": 1.0\n", + " }\n", + " json.dump(data, f)\n", + "\n", + " return checkpoints_json\n", + "\n", + " def generate_outputs(self, inputs, **generate_kwargs):\n", + " input_tokens = self.tokenizer.batch_encode_plus(\n", + " inputs, return_tensors=\"pt\", padding=True\n", + " )\n", + " for t in input_tokens:\n", + " if torch.is_tensor(input_tokens[t]):\n", + " input_tokens[t] = input_tokens[t].to(self.device)\n", + "\n", + " self.model.cuda().to(self.device)\n", + "\n", + " outputs = self.model.generate(**input_tokens, **generate_kwargs)\n", + " outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", + "\n", + " return outputs\n", + "\n", + "\n", + "def _memory_usage(gpu_id: int, msg: str):\n", + " \"\"\"Print memory usage.\"\"\"\n", + " if gpu_id != 0:\n", + " return\n", + " see_memory_usage(msg, True)\n", + "\n", + "\n", + "def init_model(config: Config, world_size: int, gpu_id: int) -> DSPipeline:\n", + " \"\"\"Initialize the deepspeed model.\"\"\"\n", + " data_type = getattr(torch, config.dtype)\n", + "\n", + " _memory_usage(gpu_id, \"before init\")\n", + " pipe = 
DSPipeline(\n", + " model_name=config.model_name,\n", + " dtype=data_type,\n", + " is_meta=config.use_meta_tensor,\n", + " device=gpu_id,\n", + " repo_root=config.repo_root,\n", + " )\n", + " _memory_usage(gpu_id, \"after init\")\n", + "\n", + " if config.use_meta_tensor:\n", + " ds_kwargs = dict(\n", + " base_dir=config.repo_root, checkpoint=pipe.checkpoints_json\n", + " )\n", + " else:\n", + " ds_kwargs = dict()\n", + "\n", + " gc.collect()\n", + "\n", + " pipe.model = deepspeed.init_inference(\n", + " pipe.model,\n", + " dtype=data_type,\n", + " mp_size=world_size,\n", + " replace_with_kernel_inject=True,\n", + " replace_method=True,\n", + " max_tokens=config.max_tokens,\n", + " save_mp_checkpoint_path=config.save_mp_checkpoint_path,\n", + " **ds_kwargs,\n", + " )\n", + " _memory_usage(gpu_id, \"after init_inference\")\n", + "\n", + " return pipe\n", + "\n", + "\n", + "def generate(\n", + " input_sentences: List[str], pipe: DSPipeline, batch_size: int, **generate_kwargs\n", + ") -> List[str]:\n", + " \"\"\"Generate predictions using a DSPipeline.\"\"\"\n", + " if batch_size > len(input_sentences):\n", + " # Dynamically extend to support larger bs by repetition.\n", + " input_sentences *= math.ceil(batch_size / len(input_sentences))\n", + "\n", + " inputs = input_sentences[:batch_size]\n", + " outputs = pipe(inputs, **generate_kwargs)\n", + " return outputs" + ] + }, + { + "cell_type": "markdown", + "id": "bd20d4d9", + "metadata": {}, + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "62eee91d", + "metadata": {}, + "source": [ + "## Define a DeepSpeed Predictor\n", + "\n", + "Define an AIR Predictor to be instantiated by the Dataset pipeline below.\n", + "\n", + "Each DeepSpeedPredictor is a stateful Ray actor that understands how to process the input prompt using a group of DeepSpeed inference workers.\n", + "\n", + "More specifically, each DeepSpeedPredictor sets up a proper PyTorch DDP process group before spinning up multiple 
PredictionWorkers. Since the model is loaded using the DeepSpeed inference framework, each PredictionWorker handles a shard of the entire DeepSpeed inference model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "516a200d-14e4-4b52-a615-e09778ba4117", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import List\n", + "\n", + "import pandas as pd\n", + "import ray\n", + "import ray.util\n", + "from ray.air import Checkpoint, ScalingConfig\n", + "from ray.air.util.torch_dist import (\n", + " TorchDistributedWorker,\n", + " init_torch_dist_process_group,\n", + " shutdown_torch_dist_process_group,\n", + ")\n", + "from ray.train.predictor import Predictor\n", + "from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy\n", + "\n", + "\n", + "@ray.remote\n", + "class PredictionWorker(TorchDistributedWorker):\n", + " \"\"\"A PredictionWorker is a Ray remote actor that runs a single shard of a DeepSpeed job.\n", + " \n", + " Multiple PredictionWorkers of the same WorkerGroup form a PyTorch DDP process\n", + " group and work together under the orchestration of DeepSpeed.\n", + " \"\"\"\n", + " def __init__(self, config: Config, world_size: int):\n", + " self.config = config\n", + " self.world_size = world_size\n", + "\n", + " def init_model(self, local_rank: int):\n", + " \"\"\"Initialize model for inference.\"\"\"\n", + " # Note: We have to provide the local_rank that was used to initiate\n", + " # the DDP process group here. 
e.g., a PredictionWorker may be the\n", + " # rank 0 worker of a group, but occupies gpu 7.\n", + " self.generator = init_model(self.config, self.world_size, local_rank)\n", + "\n", + " def generate(self, data: pd.DataFrame, column: str, **kwargs) -> List[str]:\n", + " return generate(\n", + " list(data[column]), self.generator, self.config.batch_size, **kwargs\n", + " )\n", + "\n", + "\n", + "# TODO: This Predictor should be part of Ray AIR.\n", + "class DeepSpeedPredictor(Predictor):\n", + " def __init__(self, checkpoint: Checkpoint, scaling_config: ScalingConfig) -> None:\n", + " self.checkpoint = checkpoint\n", + " self.scaling_config = scaling_config\n", + " self.init_worker_group(scaling_config)\n", + "\n", + " def __del__(self):\n", + " shutdown_torch_dist_process_group(self.prediction_workers)\n", + "\n", + " def init_worker_group(self, scaling_config: ScalingConfig):\n", + " \"\"\"Create the worker group.\n", + "\n", + " Each worker in the group communicates with other workers through the\n", + " torch distributed backend. The worker group is inelastic (a failure of\n", + " one worker destroys the entire group). 
Each worker in the group\n", + " recieves the same input data and outputs the same generated text.\n", + " \"\"\"\n", + " config = self.checkpoint.to_dict()[\"config\"]\n", + "\n", + " # Start a placement group for the workers.\n", + " self.pg = scaling_config.as_placement_group_factory().to_placement_group()\n", + " prediction_worker_cls = PredictionWorker.options(\n", + " num_cpus=scaling_config.num_cpus_per_worker,\n", + " num_gpus=scaling_config.num_gpus_per_worker,\n", + " resources=scaling_config.additional_resources_per_worker,\n", + " scheduling_strategy=PlacementGroupSchedulingStrategy(\n", + " placement_group=self.pg, placement_group_capture_child_tasks=True\n", + " ),\n", + " )\n", + " # Create the prediction workers.\n", + " self.prediction_workers = [\n", + " prediction_worker_cls.remote(config, scaling_config.num_workers)\n", + " for i in range(scaling_config.num_workers)\n", + " ]\n", + "\n", + " # Initialize torch distributed process group for the workers.\n", + " local_ranks = init_torch_dist_process_group(self.prediction_workers, backend=\"nccl\")\n", + "\n", + " # Initialize the model on each worker.\n", + " ray.get([\n", + " worker.init_model.remote(local_rank)\n", + " for worker, local_rank in zip(self.prediction_workers, local_ranks)\n", + " ])\n", + "\n", + " def _predict_pandas(\n", + " self,\n", + " data: pd.DataFrame,\n", + " input_column: str = \"prompt\",\n", + " output_column: str = \"output\",\n", + " **kwargs\n", + " ) -> pd.DataFrame:\n", + " data_ref = ray.put(data)\n", + " prediction = ray.get(\n", + " [\n", + " worker.generate.remote(data_ref, column=input_column, **kwargs)\n", + " for worker in self.prediction_workers\n", + " ]\n", + " )[0]\n", + "\n", + " return pd.DataFrame(prediction, columns=[output_column])\n", + "\n", + " @classmethod\n", + " def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> \"Predictor\":\n", + " return cls(checkpoint=checkpoint, **kwargs)\n" + ] + }, + { + "attachments": {}, + "cell_type": 
"markdown", + "id": "ca57e150", + "metadata": {}, + "source": [ + "## Create a Dataset Pipeline\n", + "\n", + "Finally, we connect all these pieces together, and use a BatchPredictor to run multiple copies of the DeepSpeedPredictor actors.\n", + "\n", + "This step helps parallelize our batch inference job and utilize all available resources in the cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "48bf4a4f-0ac4-4e77-a05a-710d42e0dc4e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-04-22 11:14:12,074\tWARNING dataset.py:4124 -- Deprecation warning: use Dataset.materialize() instead of fully_executed().\n", + "2023-04-22 11:14:12,079\tINFO streaming_executor.py:87 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Repartition] -> AllToAllOperator[RandomShuffle]\n", + "2023-04-22 11:14:12,081\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-04-22 11:14:12,082\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- Repartition 1: 0%| | 0/16 [00:00 ActorPoolMapOperator[MapBatches(ScoringWrapper)]\n", + "2023-04-22 11:14:12,682\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-04-22 11:14:12,683\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1.\n", + 
"2023-04-22 11:14:12,785\tINFO actor_pool_map_operator.py:114 -- MapBatches(ScoringWrapper): Waiting for 1 pool actors to start...\n", + "(_MapWorker pid=7005) The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.\n", + "0it [00:00, ?it/s]05) \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10038) [2023-04-22 11:14:30,762] [INFO] [utils.py:829:see_memory_usage] before init\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:30,762] [INFO] [utils.py:830:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB \n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:30,762] [INFO] [utils.py:838:see_memory_usage] CPU Virtual Memory: used = 11.63 GB, percent = 2.4%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10040) --------------------------------------------------------------------------\n", + "(PredictionWorker pid=10040) Aim collects anonymous usage analytics. \n", + "(PredictionWorker pid=10040) Read how to opt-out here: \n", + "(PredictionWorker pid=10040) https://aimstack.readthedocs.io/en/latest/community/telemetry.html \n", + "(PredictionWorker pid=10040) --------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10045) [2023-04-22 11:14:33,061] [INFO] [logging.py:93:log_dist] [Rank -1] DeepSpeed info: version=0.8.3, git-hash=unknown, git-branch=unknown\n", + "(PredictionWorker pid=10045) [2023-04-22 11:14:33,062] [WARNING] [config_utils.py:75:_process_deprecated_field] Config parameter replace_method is deprecated. 
This parameter is no longer needed, please remove from your call to DeepSpeed-inference\n", + "(PredictionWorker pid=10045) [2023-04-22 11:14:33,062] [WARNING] [config_utils.py:75:_process_deprecated_field] Config parameter mp_size is deprecated use tensor_parallel.tp_size instead\n", + "(PredictionWorker pid=10045) [2023-04-22 11:14:33,062] [INFO] [logging.py:93:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,074] [INFO] [utils.py:829:see_memory_usage] after init\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,075] [INFO] [utils.py:830:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB \n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,075] [INFO] [utils.py:838:see_memory_usage] CPU Virtual Memory: used = 12.25 GB, percent = 2.6%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10040) Using /home/ray/.cache/torch_extensions/py39_cu116 as PyTorch extensions root...\n", + "(PredictionWorker pid=10038) Creating extension directory /home/ray/.cache/torch_extensions/py39_cu116/transformer_inference...\n", + "(PredictionWorker pid=10038) Detected CUDA files, patching ldflags\n", + "(PredictionWorker pid=10038) Emitting ninja build file /home/ray/.cache/torch_extensions/py39_cu116/transformer_inference/build.ninja...\n", + "(PredictionWorker pid=10038) Building extension module transformer_inference...\n", + "(PredictionWorker pid=10038) Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10038) [1/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/dequantize.cu -o dequantize.cuda.o \n", + "(PredictionWorker pid=10038) [2/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes 
-I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/relu.cu -o relu.cuda.o \n", + "(PredictionWorker pid=10038) [3/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ 
-D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu -o apply_rotary_pos_emb.cuda.o \n", + "(PredictionWorker pid=10038) [4/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu -o transform.cuda.o \n", + 
"(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu(56): warning #177-D: variable \"lane\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu(93): warning #177-D: variable \"half_dim\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu(110): warning #177-D: variable \"vals_half\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu(111): warning #177-D: variable \"output_half\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/transform.cu(128): warning #177-D: variable \"lane\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) [5/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem 
/home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu -o softmax.cuda.o \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu(272): warning #177-D: variable \"alibi_offset\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/softmax.cu(427): warning #177-D: variable \"warp_num\" was declared but never referenced\n", + "(PredictionWorker pid=10038) \n", + "(PredictionWorker pid=10038) [6/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem 
/home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/gelu.cu -o gelu.cuda.o \n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,250] [INFO] [logging.py:93:log_dist] [Rank -1] DeepSpeed info: version=0.8.3, git-hash=unknown, git-branch=unknown [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,251] [WARNING] [config_utils.py:75:_process_deprecated_field] Config parameter replace_method is deprecated. 
This parameter is no longer needed, please remove from your call to DeepSpeed-inference [repeated 7x across cluster]\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,251] [WARNING] [config_utils.py:75:_process_deprecated_field] Config parameter mp_size is deprecated use tensor_parallel.tp_size instead [repeated 7x across cluster]\n", + "(PredictionWorker pid=10038) [2023-04-22 11:14:33,251] [INFO] [logging.py:93:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1 [repeated 7x across cluster]\n", + "(PredictionWorker pid=10038) [7/9] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++14 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/layer_norm.cu -o layer_norm.cuda.o \n", + 
"(PredictionWorker pid=10038) [8/9] c++ -MMD -MF pt_binding.o.d -DTORCH_EXTENSION_NAME=transformer_inference -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\\"_gcc\\\" -DPYBIND11_STDLIB=\\\"_libstdcpp\\\" -DPYBIND11_BUILD_ABI=\\\"_cxxabi1011\\\" -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/includes -I/home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/includes -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/TH -isystem /home/ray/anaconda3/lib/python3.9/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ray/anaconda3/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -c /home/ray/anaconda3/lib/python3.9/site-packages/deepspeed/ops/csrc/transformer/inference/csrc/pt_binding.cpp -o pt_binding.o \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10038) Loading extension module transformer_inference...\n", + "(PredictionWorker pid=10041) -------------------------------------------------------------------------- [repeated 14x across cluster]\n", + "(PredictionWorker pid=10041) Aim collects anonymous usage analytics. [repeated 7x across cluster]\n", + "(PredictionWorker pid=10041) Read how to opt-out here: [repeated 7x across cluster]\n", + "(PredictionWorker pid=10041) https://aimstack.readthedocs.io/en/latest/community/telemetry.html [repeated 7x across cluster]\n", + "(PredictionWorker pid=10041) Using /home/ray/.cache/torch_extensions/py39_cu116 as PyTorch extensions root... 
[repeated 7x across cluster]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10038) [9/9] c++ pt_binding.o gelu.cuda.o relu.cuda.o layer_norm.cuda.o softmax.cuda.o dequantize.cuda.o apply_rotary_pos_emb.cuda.o transform.cuda.o -shared -lcurand -L/home/ray/anaconda3/lib/python3.9/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o transformer_inference.so\n", + "(PredictionWorker pid=10038) Time to load transformer_inference op: 46.834928035736084 seconds\n", + "(PredictionWorker pid=10038) [2023-04-22 11:15:21,799] [INFO] [logging.py:93:log_dist] [Rank 0] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 7168, 'intermediate_size': 28672, 'heads': 56, 'num_hidden_layers': -1, 'fp16': True, 'pre_layer_norm': True, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-12, 'mp_size': 8, 'q_int8': False, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': , 'specialized_mode': False, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'scale_attn_by_inverse_layer_idx': False, 'enable_qkv_quantization': False, 'use_mup': False, 'return_single_tuple': False}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(PredictionWorker pid=10040) No modifications detected for re-loaded extension module transformer_inference, skipping build step...\n", + "Loading 7 checkpoint shards: 0%| | 0/7 [00:00`! A {class}`BatchPredictor ` takes a checkpoint and a predictor class (e.g., {class}`~ray.train.torch.TorchPredictor`, {class}`~ray.train.tensorflow.TensorflowPredictor`) and provides an interface to run batch prediction on Ray {class}`~ray.data.Datastream`s. 
It will distribute the inference workload across multiple workers when calling `predict()` and run prediction on multiple shards of data in parallel. You can find more details in [Using Predictors for Inference](air-predictors).\n", + "Now that we have our dataset loaded and preprocessed with [Ray Data](data), we're ready to construct our {class}`BatchPredictor `! A {class}`BatchPredictor ` takes a checkpoint and a predictor class (e.g., {class}`~ray.train.torch.TorchPredictor`, {class}`~ray.train.tensorflow.TensorflowPredictor`) and provides an interface to run batch prediction on Ray {class}`~ray.data.Dataset`s. It will distribute the inference workload across multiple workers when calling `predict()` and run prediction on multiple shards of data in parallel. You can find more details in [Using Predictors for Inference](air-predictors).\n", "\n", "For the demo, we'll directly load a pretrained ResNet model from `torchvision.models` and construct a {class}`~ray.train.torch.TorchCheckpoint` which includes the preprocessor. You can also load your own Ray AIR checkpoint from your previous Train/Tune experiments. You can find more details about checkpoint loading at the [AIR `Checkpoint` API reference](air-checkpoint-ref)." ] @@ -391,7 +391,7 @@ "source": [ "## Evaluating Prediction Accuracy\n", "\n", - "`BatchPredictor.predict()` will return a Ray Datastream with a column of model output with key `\"predictions\"`, and all columns specified in `keep_columns`.\n", + "`BatchPredictor.predict()` will return a Dataset with a column of model output with key `\"predictions\"`, and all columns specified in `keep_columns`.\n", "\n", "In this example, the output of the ResNet model is a 1000-dimensional tensor containing the logits of each class. 
We'll measure accuracy with Top-1 and Top-5 accuracy.\n", "(Top-N accuracy: The percentage of predictions where the true label falls in the top N predicted classes.)" diff --git a/doc/source/ray-air/examples/pytorch_tabular_starter.py b/doc/source/ray-air/examples/pytorch_tabular_starter.py index 1e0c4e9d780c..72654ab7d593 100644 --- a/doc/source/ray-air/examples/pytorch_tabular_starter.py +++ b/doc/source/ray-air/examples/pytorch_tabular_starter.py @@ -54,7 +54,7 @@ def train_loop_per_worker(config): epochs = config["num_epochs"] num_features = config["num_features"] - # Get the Ray Datastream shard for this data parallel worker, + # Get the Dataset shard for this data parallel worker, # and convert it to a PyTorch Dataset. train_data = session.get_dataset_shard("train") # Create model. @@ -98,8 +98,8 @@ def train_loop_per_worker(config): preprocessor=preprocessor, ) # Execute training. -result = trainer.fit() -print(f"Last result: {result.metrics}") +best_result = trainer.fit() +print(f"Last result: {best_result.metrics}") # Last result: {'loss': 0.6559339960416158, ...} # __air_pytorch_train_end__ diff --git a/doc/source/ray-air/examples/sklearn_example.ipynb b/doc/source/ray-air/examples/sklearn_example.ipynb index 1b64d0c6f13d..7d47f4f1390c 100644 --- a/doc/source/ray-air/examples/sklearn_example.ipynb +++ b/doc/source/ray-air/examples/sklearn_example.ipynb @@ -50,7 +50,7 @@ "\n", "\n", "import ray\n", - "from ray.data import Datastream\n", + "from ray.data import Dataset\n", "from ray.train.batch_predictor import BatchPredictor\n", "from ray.train.sklearn import SklearnPredictor\n", "from ray.data.preprocessors import Chain, OrdinalEncoder, StandardScaler\n", @@ -81,7 +81,7 @@ "metadata": {}, "outputs": [], "source": [ - "def prepare_data() -> Tuple[Datastream, Datastream, Datastream]:\n", + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer_with_categorical.csv\")\n", " 
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", " test_dataset = valid_dataset.drop_columns([\"target\"])\n", diff --git a/doc/source/ray-air/examples/stablediffusion_batch_prediction.ipynb b/doc/source/ray-air/examples/stablediffusion_batch_prediction.ipynb index efa12042a183..90735426b2a7 100644 --- a/doc/source/ray-air/examples/stablediffusion_batch_prediction.ipynb +++ b/doc/source/ray-air/examples/stablediffusion_batch_prediction.ipynb @@ -89,7 +89,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since we will be using a pretrained model from Hugging Face hub, the simplest way is to use {meth}`map_batches ` with a [callable class UDF](transform_datasets_callable_classes). This will allow us to save time by initializing a model just once and then feed it multiple batches of data." + "Since we will be using a pretrained model from Hugging Face hub, the simplest way is to use {meth}`map_batches ` with a [callable class UDF](transforming_data_actors). This will allow us to save time by initializing a model just once and then feed it multiple batches of data." 
] }, { @@ -121,12 +121,14 @@ "\n", " def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:\n", " import torch\n", + " import numpy as np\n", "\n", " # Set a different seed for every image in batch\n", " self.pipe.generator = [\n", " torch.Generator(device=\"cuda\").manual_seed(i) for i in range(len(batch))\n", " ]\n", - " return self.pipe(list(batch[\"prompt\"])).images" + " images = self.pipe(list(batch[\"prompt\"])).images\n", + " return {\"images\": np.array(images, dtype=object)}" ] }, { @@ -160,10 +162,11 @@ " PredictCallable,\n", " batch_size=1,\n", " fn_constructor_kwargs=dict(model_id=model_id),\n", - " compute=\"actors\",\n", + " compute=ray.data.ActorPoolStrategy(),\n", + " batch_format=\"pandas\",\n", " num_gpus=1,\n", ")\n", - "images = preds.take_all()" + "results = preds.take_all()" ] }, { @@ -192,7 +195,7 @@ } ], "source": [ - "images[0]" + "results[0][\"images\"]" ] }, { @@ -213,7 +216,7 @@ } ], "source": [ - "images[1]" + "results[1][\"images\"]" ] }, { diff --git a/doc/source/ray-air/examples/tf_tabular_starter.py b/doc/source/ray-air/examples/tf_tabular_starter.py index ed339626b9b7..56e66f2f60b3 100644 --- a/doc/source/ray-air/examples/tf_tabular_starter.py +++ b/doc/source/ray-air/examples/tf_tabular_starter.py @@ -55,8 +55,8 @@ def train_loop_per_worker(config): epochs = config["num_epochs"] num_features = config["num_features"] - # Get the Ray Datastream shard for this data parallel worker, - # and convert it to a Tensorflow Datastream. + # Get the Dataset shard for this data parallel worker, + # and convert it to a Tensorflow Dataset. 
train_data = session.get_dataset_shard("train") strategy = tf.distribute.MultiWorkerMirroredStrategy() @@ -103,8 +103,8 @@ def train_loop_per_worker(config): preprocessor=preprocessor, ) -result = trainer.fit() -print(f"Last result: {result.metrics}") +best_result = trainer.fit() +print(f"Last result: {best_result.metrics}") # Last result: {'loss': 8.997025489807129, ...} # __air_tf_train_end__ diff --git a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb index 20ee25f6b8d4..fc03148db989 100644 --- a/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb +++ b/doc/source/ray-air/examples/tfx_tabular_train_to_serve.ipynb @@ -14,7 +14,7 @@ "In this example, we showcase how to achieve the same tasks as the Keras Tutorial using [Ray AIR](https://docs.ray.io/en/latest/ray-air/getting-started.html), covering\n", "every step from data ingestion to pushing a model to serving.\n", "\n", - "1. Read a CSV into [Ray Datastream](https://docs.ray.io/en/latest/data/dataset.html).\n", + "1. Read a CSV into [Dataset](dataset_concept).\n", "2. Process the dataset by chaining [Ray AIR preprocessors](https://docs.ray.io/en/latest/ray-air/getting-started.html#preprocessors).\n", "3. Train the model using the TensorflowTrainer from AIR.\n", "4. Serve the model using Ray Serve and the above preprocessors." @@ -439,13 +439,13 @@ "from typing import Tuple\n", "\n", "\n", - "def split_data(data: pd.DataFrame) -> Tuple[ray.data.Datastream, pd.DataFrame, np.array]:\n", + "def split_data(data: pd.DataFrame) -> Tuple[ray.data.Dataset, pd.DataFrame, np.array]:\n", " \"\"\"Split the data in a stratified way.\n", "\n", " Returns:\n", " A tuple containing train dataset, test data and test label.\n", " \"\"\"\n", - " # There is a native offering in Ray Datastream for split as well.\n", + " # There is a native offering in Dataset for split as well.\n", " # However, supporting stratification is a TODO there. 
So use\n", " # scikit-learn equivalent here.\n", " train_data, test_data = train_test_split(\n", diff --git a/doc/source/ray-air/examples/torch_detection.ipynb b/doc/source/ray-air/examples/torch_detection.ipynb index 31a9f4748184..1daaa8631657 100644 --- a/doc/source/ray-air/examples/torch_detection.ipynb +++ b/doc/source/ray-air/examples/torch_detection.ipynb @@ -10,7 +10,7 @@ "This tutorial explains how to fine-tune `fasterrcnn_resnet50_fpn` using the [Ray AI Runtime](air) for parallel data ingest and training.\n", "\n", "Here's what you'll do:\n", - "1. Load raw images and [VOC-style](http://host.robots.ox.ac.uk/pascal/VOC/) annotations into a Datastream\n", + "1. Load raw images and [VOC-style](http://host.robots.ox.ac.uk/pascal/VOC/) annotations into a Dataset\n", "2. Fine-tune `fasterrcnn_resnet50_fpn` (the backbone is pre-trained on ImageNet)\n", "3. Evaluate the model's accuracy\n", "\n", @@ -71,7 +71,7 @@ "id": "65bf13b8", "metadata": {}, "source": [ - "## Create a `Datastream`\n", + "## Create a `Dataset`\n", "\n", "You'll work with a subset of [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) that contains cats and dogs (the full dataset has 20 classes)." ] @@ -229,11 +229,11 @@ "\n", "```\n", "\n", - "[Ray Datasets](datasets) lets you read and preprocess data in parallel. Datasets doesn't\n", + "[Ray Data](data) lets you read and preprocess data in parallel. Ray Data doesn't\n", "have built-in support for VOC-style annotations, so you'll need to define a custom\n", "datasource.\n", "\n", - "A Datasource is an object that reads data of a particular type. For example, Datasets\n", + "A Datasource is an object that reads data of a particular type. For example, Ray Data\n", "implements a Datasource that reads CSV files. Your datasource will parse labels and\n", "bounding boxes from XML files. 
Later, you'll read the corresponding images.\n", "\n", @@ -326,7 +326,7 @@ "import ray\n", "\n", "\n", - "annotations: ray.data.Datastream = ray.data.read_datasource(\n", + "annotations: ray.data.Dataset = ray.data.read_datasource(\n", " VOCAnnotationDatasource(), paths=\"s3://anonymous@air-example-data/AnimalDetection/Annotations\"\n", ")" ] @@ -749,7 +749,7 @@ "Stage 0: 100%|██████████| 1/1 [00:03<00:00, 3.96s/it]2023-03-01 13:07:29,436\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", "(PipelineSplitExecutorCoordinator pid=191352) \n", "Stage 0: : 2it [00:08, 4.31s/it] 2023-03-01 13:07:33,990\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", - "(RayTrainWorker pid=175612) 2023-03-01 13:07:34,394\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "(RayTrainWorker pid=175612) 2023-03-01 13:07:34,394\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#data-and-tune\n", "(PipelineSplitExecutorCoordinator pid=191352) \n", "Stage 0: : 3it [00:13, 4.48s/it]2023-03-01 13:07:38,660\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", "(RayTrainWorker pid=175612) /tmp/ipykernel_160001/3839218723.py:23: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. 
This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:199.)\n", @@ -800,7 +800,7 @@ "(RayTrainWorker pid=175612) 2023-03-01 13:07:41,980\tINFO distributed.py:1027 -- Reducer buckets have been rebuilt in this iteration.\n", "(PipelineSplitExecutorCoordinator pid=191352) \n", "Stage 0: : 4it [01:11, 25.77s/it]2023-03-01 13:08:37,068\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", - "(RayTrainWorker pid=175614) 2023-03-01 13:08:37,464\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "(RayTrainWorker pid=175614) 2023-03-01 13:08:37,464\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. 
A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#data-and-tune\n", "2023-03-01 13:08:45,074\tINFO tune.py:825 -- Total run time: 125.51 seconds (125.36 seconds for the tuning loop).\n" ] } diff --git a/doc/source/ray-air/examples/torch_image_example.ipynb b/doc/source/ray-air/examples/torch_image_example.ipynb index d43dda4fcd7e..5718e54a1ca7 100644 --- a/doc/source/ray-air/examples/torch_image_example.ipynb +++ b/doc/source/ray-air/examples/torch_image_example.ipynb @@ -53,7 +53,7 @@ "\n", "We'll train our classifier on a popular image dataset called [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html).\n", "\n", - "First, let's load CIFAR-10 into a Ray Datastream." + "First, let's load CIFAR-10 into a Dataset." ] }, { @@ -100,8 +100,8 @@ "train_dataset = torchvision.datasets.CIFAR10(\"data\", download=True, train=True)\n", "test_dataset = torchvision.datasets.CIFAR10(\"data\", download=True, train=False)\n", "\n", - "train_dataset: ray.data.Datastream = ray.data.from_torch(train_dataset)\n", - "test_dataset: ray.data.Datastream = ray.data.from_torch(test_dataset)" + "train_dataset: ray.data.Dataset = ray.data.from_torch(train_dataset)\n", + "test_dataset: ray.data.Dataset = ray.data.from_torch(test_dataset)" ] }, { @@ -118,7 +118,7 @@ "version_minor": 0 }, "text/plain": [ - "VBox(children=(HTML(value='

    Datastream

    '), Tab(children=(HTML(value='
    Dataset'), Tab(children=(HTML(value='
    ` doesn't parallelize reads, so you shouldn't use it with larger datasets.\n", "\n", - "Next, let's represent our data using a dictionary of ndarrays instead of tuples. This lets us call {py:meth}`Datastream.iter_torch_batches ` later in the tutorial." + "Next, let's represent our data using a dictionary of ndarrays instead of tuples. This lets us call {py:meth}`Dataset.iter_torch_batches ` later in the tutorial." ] }, { @@ -189,9 +189,9 @@ "import torch\n", "\n", "\n", - "def convert_batch_to_numpy(batch: Tuple[Image, int]) -> Dict[str, np.ndarray]:\n", - " images = np.stack([np.array(image) for image, _ in batch])\n", - " labels = np.array([label for _, label in batch])\n", + "def convert_batch_to_numpy(batch) -> Dict[str, np.ndarray]:\n", + " images = np.stack([np.array(image) for image, _ in batch[\"item\"]])\n", + " labels = np.array([label for _, label in batch[\"item\"]])\n", " return {\"image\": images, \"label\": labels}\n", "\n", "\n", @@ -213,7 +213,7 @@ "version_minor": 0 }, "text/plain": [ - "VBox(children=(HTML(value='

    Datastream

    '), Tab(children=(HTML(value='
    Dataset'), Tab(children=(HTML(value='
    `.\n", - "* We call {py:func}`session.get_dataset_shard ` and {py:meth}`Datastream.iter_torch_batches ` to get a subset of our training data.\n", + "* We call {py:func}`session.get_dataset_shard ` and {py:meth}`Dataset.iter_torch_batches ` to get a subset of our training data.\n", "* We save model state using {py:func}`session.report `." ] }, @@ -559,7 +559,7 @@ " model=Net(),\n", ")\n", "\n", - "outputs: ray.data.Datastream = batch_predictor.predict(\n", + "outputs: ray.data.Dataset = batch_predictor.predict(\n", " data=test_dataset,\n", " dtype=torch.float,\n", " feature_columns=[\"image\"],\n", @@ -614,7 +614,7 @@ " return df[[\"prediction\", \"label\"]]\n", "\n", "\n", - "predictions = outputs.map_batches(convert_logits_to_classes)\n", + "predictions = outputs.map_batches(convert_logits_to_classes, batch_format=\"pandas\")\n", "\n", "predictions.show(1)" ] @@ -665,7 +665,7 @@ " return df\n", "\n", "\n", - "scores = predictions.map_batches(calculate_prediction_scores)\n", + "scores = predictions.map_batches(calculate_prediction_scores, batch_format=\"pandas\")\n", "\n", "scores.show(1)" ] diff --git a/doc/source/ray-air/examples/torch_incremental_learning.ipynb b/doc/source/ray-air/examples/torch_incremental_learning.ipynb index 038059889856..0f05e07cde6d 100644 --- a/doc/source/ray-air/examples/torch_incremental_learning.ipynb +++ b/doc/source/ray-air/examples/torch_incremental_learning.ipynb @@ -46,9 +46,9 @@ }, "source": [ "This example will cover the following:\n", - "1. Loading a PyTorch Dataset to Ray Datastreams\n", - "2. Create an `Iterator[ray.data.Datastreams]` abstraction to represent a stream of data to train on for incremental training.\n", - "3. Implement a custom Ray AIR preprocessor to preprocess the Datastream.\n", + "1. Loading a PyTorch Dataset to Ray Data\n", + "2. Create an `Iterator[ray.data.Dataset]` abstraction to represent a stream of data to train on for incremental training.\n", + "3. 
Implement a custom Ray AIR preprocessor to preprocess the dataset.\n", "4. Incrementally train a model using data parallel training.\n", "5. Use our trained model to perform batch prediction on test data.\n", "6. Incrementally deploying our trained model with Ray Serve and performing online prediction queries." @@ -242,19 +242,19 @@ "id": "3SVSrkqrDJuc" }, "source": [ - "## 3a: Load MNIST Dataset to a Ray Datastream\n", + "## 3a: Load MNIST Dataset to a Dataset\n", "\n", - "Let's first define a simple function that will return the original MNIST Dataset as a distributed Ray Datastream. Ray Datastreams are the standard way to load and exchange data in Ray libraries and applications, read more about them [here](https://docs.ray.io/en/latest/data/dataset.html)!\n", + "Let's first define a simple function that will return the original MNIST Dataset as a distributed Dataset. Ray Data is the standard way to load and exchange data in Ray libraries and applications, read more about the library [here](data)!\n", "\n", "The function in the below code snippet does the following:\n", "1. Downloads the MNIST Dataset from torchvision in-memory\n", - "2. Loads the in-memory Torch Dataset into a Ray Datastream\n", - "3. Converts the Ray Datastream into Numpy format. Instead of the Ray Datastream iterating over tuples, it will have 2 columns: \"image\" & \"label\". \n", - "This will allow us to apply built-in preprocessors to the Ray Datastream and allow Ray Datastreams to be used with Ray AIR Predictors.\n", + "2. Loads the in-memory Torch Dataset into a Dataset\n", + "3. Converts the Dataset into Numpy format. Instead of the Dataset iterating over tuples, it will have 2 columns: \"image\" & \"label\". 
\n", + "This will allow us to apply built-in preprocessors to the Dataset and allow Datasets to be used with Ray AIR Predictors.\n", "\n", "For this example, since we are just working with MNIST dataset, which is small, we use the {py:class}`~ray.data.datasource.from_torch` which just loads the full MNIST dataset into memory.\n", "\n", - "For loading larger datasets in a parallel fashion, you should use [Ray Datastream's additional read APIs](https://docs.ray.io/en/master/data/dataset.html#supported-input-formats) to load data from parquet, csv, image files, and more!" + "For loading larger datasets in a parallel fashion, you should use [Dataset's additional read APIs](input-output) to load data from parquet, csv, image files, and more!" ] }, { @@ -273,8 +273,8 @@ "import ray\n", "\n", "\n", - "def get_mnist_dataset(train: bool = True) -> ray.data.Datastream:\n", - " \"\"\"Returns MNIST Dataset as a ray.data.Datastream.\n", + "def get_mnist_dataset(train: bool = True) -> ray.data.Dataset:\n", + " \"\"\"Returns MNIST Dataset as a ray.data.Dataset.\n", " \n", " Args:\n", " train: Whether to return the train dataset or test dataset.\n", @@ -289,8 +289,8 @@ " mnist_dataset = ray.data.from_torch(mnist_dataset)\n", " \n", " def convert_batch_to_numpy(batch):\n", - " images = np.array([np.array(item[0]) for item in batch])\n", - " labels = np.array([item[1] for item in batch])\n", + " images = np.array([np.array(item[0]) for item in batch[\"item\"]])\n", + " labels = np.array([item[1] for item in batch[\"item\"]])\n", "\n", " return {\"image\": images, \"label\": labels}\n", "\n", @@ -308,10 +308,10 @@ "\n", "Now we can create our \"stream\" abstraction. This abstraction provides two\n", "methods (`generate_train_stream` and `generate_test_stream`) that each returns an Iterator\n", - "over Ray Datasets. Each item in this iterator contains a unique permutation of\n", + "over Ray Data. 
Each item in this iterator contains a unique permutation of\n", "MNIST, and is one task that we want to train on.\n", "\n", - "In this example, \"the stream of tasks\" is contrived since all the data for all tasks exist already in an offline setting. For true online continual learning, you would want to implement a custom dataset iterator that reads from some stream datasource to produce new tasks. The only abstraction that's needed is `Iterator[ray.data.Datastream]`.\n", + "In this example, \"the stream of tasks\" is contrived since all the data for all tasks exist already in an offline setting. For true online continual learning, you would want to implement a custom dataset iterator that reads from some stream datasource to produce new tasks. The only abstraction that's needed is `Iterator[ray.data.Dataset]`.\n", "\n", "Note that the test dataset stream has the same permutations that are used for the training dataset stream. In general for continual learning, it is expected that the data distribution of the test/prediction data follows what the model was trained on. If you notice that the distribution of new prediction queries is changing compared to the distribution of the training data, then you should probably trigger training of a new task." 
] @@ -357,7 +357,7 @@ " self.test_mnist_dataset = get_mnist_dataset(train=False)\n", "\n", " def random_permute_dataset(\n", - " self, dataset: ray.data.Datastream, permutation: np.ndarray\n", + " self, dataset: ray.data.Dataset, permutation: np.ndarray\n", " ):\n", " \"\"\"Randomly permutes the pixels for each image in the dataset.\"\"\"\n", "\n", @@ -368,14 +368,14 @@ "\n", " return dataset.map_batches(PixelsPermutation, compute=ActorPoolStrategy(), batch_format=\"pandas\")\n", "\n", - " def generate_train_stream(self) -> Iterator[ray.data.Datastream]:\n", + " def generate_train_stream(self) -> Iterator[ray.data.Dataset]:\n", " for permutation in self.permutations:\n", " permuted_mnist_dataset = self.random_permute_dataset(\n", " self.train_mnist_dataset, permutation\n", " )\n", " yield permuted_mnist_dataset\n", "\n", - " def generate_test_stream(self) -> Iterator[ray.data.Datastream]:\n", + " def generate_test_stream(self) -> Iterator[ray.data.Dataset]:\n", " for permutation in self.permutations:\n", " mnist_dataset = get_mnist_dataset(train=False)\n", " permuted_mnist_dataset = self.random_permute_dataset(\n", @@ -401,7 +401,7 @@ "source": [ "# Step 4: Define the logic for Training and Inference/Prediction\n", "\n", - "Now that we can get an Iterator over Ray Datasets, we can incrementally train our model in a data parallel fashion via Ray Train, while incrementally deploying our model via Ray Serve. Let's define some helper functions to allow us to do this!\n", + "Now that we can get an Iterator over Ray Data, we can incrementally train our model in a data parallel fashion via Ray Train, while incrementally deploying our model via Ray Serve. Let's define some helper functions to allow us to do this!\n", "\n", "If you are not familiar with data parallel training, it is a form of distributed training strategies, where we have multiple model replicas, and each replica trains on a different batch of data. 
After each batch, the gradients are synchronized across the replicas. This effecitively allows us to train on more data in a shorter amount of time." ] @@ -421,7 +421,7 @@ "\n", "This is just standard PyTorch training, with the difference being that we can leverage [Ray Train's utility functions](train-pytorch-integration) and [Ray AIR Sesssion](air-session-ref):\n", "- `ray.train.torch.prepare_model(...)`: This will prepare the model for distributed training by wrapping it in either PyTorch `DistributedDataParallel` or `FullyShardedDataParallel` and moving it to the correct accelerator device.\n", - "- `ray.air.session.get_dataset_shard(...)`: This will get the Ray Dataset shard for this particular Data Parallel worker.\n", + "- `ray.air.session.get_dataset_shard(...)`: This will get the Dataset shard for this particular Data Parallel worker.\n", "- `ray.air.session.report({}, checkpoint=...)`: This will tell Ray Train to persist the provided `Checkpoint` object.\n", "- `ray.air.session.get_checkpoint()`: Returns a checkpoint to resume from. This is useful for either fault tolerance purposes, or for our purposes, to continue training the same model on a new incoming dataset." 
] @@ -459,7 +459,7 @@ " optimizer = SGD(model.parameters(), lr=learning_rate, momentum=momentum)\n", " criterion = CrossEntropyLoss()\n", "\n", - " # Get the Ray Dataset shard for this data parallel worker, and convert it to a PyTorch Dataset.\n", + " # Get the Dataset shard for this data parallel worker, and convert it to a PyTorch Dataset.\n", " dataset_shard = session.get_dataset_shard(\"train\").iter_torch_batches(\n", " batch_size=batch_size,\n", " )\n", @@ -550,7 +550,7 @@ "from ray.train.batch_predictor import BatchPredictor\n", "from ray.train.torch import TorchPredictor\n", "\n", - "def batch_predict(checkpoint: ray.air.Checkpoint, test_dataset: ray.data.Datastream) -> float:\n", + "def batch_predict(checkpoint: ray.air.Checkpoint, test_dataset: ray.data.Dataset) -> float:\n", " \"\"\"Perform batch prediction on the provided test dataset, and return accuracy results.\"\"\"\n", "\n", " batch_predictor = BatchPredictor.from_checkpoint(checkpoint, predictor_cls=TorchPredictor, model=SimpleMLP(num_classes=10))\n", @@ -1393,7 +1393,7 @@ " # **************Batch Prediction**************************\n", "\n", " # We can do batch prediction on the test data for the tasks seen so far.\n", - " # TODO: Fix type signature in Ray Datasets\n", + " # TODO: Fix type signature in Ray Data\n", " # TODO: Fix dataset.union when used with empty list.\n", " if len(all_test_datasets_seen_so_far) > 0:\n", " full_test_dataset = test_dataset.union(*all_test_datasets_seen_so_far)\n", @@ -1471,7 +1471,7 @@ "\n", "We have now incrementally trained our simple multi-layer perceptron. Let's compare the incrementally trained model via fine tuning against a model that is trained on all the tasks up front.\n", "\n", - "Since we are using a naive fine-tuning strategy, we should expect that our incrementally trained model will perform worse than the the one that is fully trained! 
However, there's various other strategies that have been developed and are actively being researched to improve accuracy for incremental training. And overall, incremental/continual learning allows you to train in many real world settings where the entire dataset is not available up front, but new data is arriving at a relatively high rate." + "Since we are using a naive fine-tuning strategy, we should expect that our incrementally trained model will perform worse than the one that is fully trained! However, there's various other strategies that have been developed and are actively being researched to improve accuracy for incremental training. And overall, incremental/continual learning allows you to train in many real world settings where the entire dataset is not available up front, but new data is arriving at a relatively high rate." ] }, { @@ -1480,7 +1480,7 @@ "id": "RNHsEVBHc0p2" }, "source": [ - "Let's first combine all of our datasets for each task into a single, unified Dataset" + "Let's first combine all of our datasets for each task into a single, unified dataset" ] }, { @@ -1525,7 +1525,7 @@ "id": "tJ6Oqdgvc5dn" }, "source": [ - "Then, we train a new model on the unified Dataset using the same configurations as before." + "Then, we train a new model on the unified dataset using the same configurations as before." 
] }, { diff --git a/doc/source/ray-air/examples/upload_to_comet_ml.ipynb b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb index eaa4c8f39b4e..04c812c61a1d 100644 --- a/doc/source/ray-air/examples/upload_to_comet_ml.ipynb +++ b/doc/source/ray-air/examples/upload_to_comet_ml.ipynb @@ -26,7 +26,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -qU \"ray[tune]\" sklearn xgboost_ray comet_ml" + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray comet_ml" ] }, { @@ -57,7 +57,7 @@ "id": "29fcd93b", "metadata": {}, "source": [ - "We define a simple function that returns our training dataset as a Ray Datastream:" + "We define a simple function that returns our training dataset as a Dataset:" ] }, { @@ -67,7 +67,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_train_dataset() -> ray.data.Datastream:\n", + "def get_train_dataset() -> ray.data.Dataset:\n", " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", " return dataset" ] @@ -96,7 +96,7 @@ "metadata": {}, "outputs": [], "source": [ - "def train_model(train_dataset: ray.data.Datastream, comet_project: str) -> Result:\n", + "def train_model(train_dataset: ray.data.Dataset, comet_project: str) -> Result:\n", " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", " trainer = XGBoostTrainer(\n", " scaling_config=ScalingConfig(num_workers=2),\n", @@ -138,7 +138,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-19 15:19:17,237\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n" + "2022-05-19 15:19:17,237\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265\u001B[39m\u001B[22m\n" ] }, { @@ -165,23 +165,23 @@ "output_type": "stream", "text": [ "COMET WARNING: As you are running in a Jupyter environment, you will need to call `experiment.end()` when finished to ensure all metrics and code are logged before exiting.\n", - 
"\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:21,584\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:21,584\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=16 --runtime-env-hash=-2010331134\n", "COMET INFO: Experiment is live on comet.ml https://www.comet.ml/krfricke/ray-air-example/ecd3726ca127497ba7386003a249fad6\n", "\n", "COMET WARNING: Failed to add tag(s) None to the experiment\n", "\n", "COMET WARNING: Empty mapping given to log_params({}); ignoring\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m UserWarning: Datastream 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:24,628\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:25,961\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,830\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,918\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 
--object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[33m(raylet)\u001b[0m 2022-05-19 15:19:26,923\tINFO context.py:70 -- Exec'ing worker with command: exec 
/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:29,272\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=19876)\u001b[0m [15:19:29] task [xgboost.ray]:4505889744 got new rank 1\n", - "\u001b[2m\u001b[36m(_RemoteRayXGBoostActor pid=19875)\u001b[0m [15:19:29] task [xgboost.ray]:6941849424 got new rank 0\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. 
This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:24,628\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=17 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:25,961\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,830\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=18 --runtime-env-hash=-2010331069\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,918\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 
--object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=20 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=21 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,922\tINFO context.py:70 -- Exec'ing worker with command: exec /Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=22 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[33m(raylet)\u001B[0m 2022-05-19 15:19:26,923\tINFO context.py:70 -- Exec'ing worker with command: exec 
/Users/kai/.pyenv/versions/3.7.7/bin/python3.7 /Users/kai/coding/ray/python/ray/workers/default_worker.py --node-ip-address=127.0.0.1 --node-manager-port=61222 --object-store-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/plasma_store --raylet-name=/tmp/ray/session_2022-05-19_15-19-14_632568_19778/sockets/raylet --redis-address=None --storage=None --temp-dir=/tmp/ray --metrics-agent-port=62873 --logging-rotate-bytes=536870912 --logging-rotate-backup-count=5 --gcs-address=127.0.0.1:61938 --redis-password=5241590000000000 --startup-token=19 --runtime-env-hash=-2010331134\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:29,272\tINFO main.py:1025 -- [RayXGBoost] Starting XGBoost training.\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19876)\u001B[0m [15:19:29] task [xgboost.ray]:4505889744 got new rank 1\n", + "\u001B[2m\u001B[36m(_RemoteRayXGBoostActor pid=19875)\u001B[0m [15:19:29] task [xgboost.ray]:6941849424 got new rank 0\n", "COMET WARNING: The given value of the metric episodes_total was None; ignoring\n", "COMET WARNING: The given value of the metric timesteps_total was None; ignoring\n", "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 1.0.0 created\n" @@ -271,7 +271,7 @@ "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 10.0.0 created (previous was: 9.0.0)\n", "COMET INFO: Scheduling the upload of 3 assets for a size of 16.37 KB, this can take some time\n", "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:10.0.0' has started uploading asynchronously\n", - "\u001b[2m\u001b[36m(GBDTTrainable pid=19852)\u001b[0m 2022-05-19 15:19:33,890\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with total N=569 in 7.96 seconds (4.61 pure XGBoost training time).\n", + "\u001B[2m\u001B[36m(GBDTTrainable pid=19852)\u001B[0m 2022-05-19 15:19:33,890\tINFO main.py:1519 -- [RayXGBoost] Finished XGBoost training on training data with 
total N=569 in 7.96 seconds (4.61 pure XGBoost training time).\n", "COMET INFO: Artifact 'krfricke/checkpoint_XGBoostTrainer_ac544_00000:9.0.0' has been fully uploaded successfully\n", "COMET INFO: Artifact 'checkpoint_XGBoostTrainer_ac544_00000' version 11.0.0 created (previous was: 10.0.0)\n", "COMET INFO: Scheduling the upload of 3 assets for a size of 16.39 KB, this can take some time\n", diff --git a/doc/source/ray-air/examples/upload_to_wandb.ipynb b/doc/source/ray-air/examples/upload_to_wandb.ipynb index 8079de127b5c..5ba2e60d5630 100644 --- a/doc/source/ray-air/examples/upload_to_wandb.ipynb +++ b/doc/source/ray-air/examples/upload_to_wandb.ipynb @@ -33,7 +33,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -qU \"ray[tune]\" sklearn xgboost_ray wandb" + "!pip install -qU \"ray[tune]\" scikit-learn xgboost_ray wandb" ] }, { @@ -63,7 +63,7 @@ "id": "2efa1564", "metadata": {}, "source": [ - "We define a simple function that returns our training dataset as a Ray Datastream:\n" + "We define a simple function that returns our training dataset as a Dataset:\n" ] }, { @@ -73,7 +73,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_train_dataset() -> ray.data.Datastream:\n", + "def get_train_dataset() -> ray.data.Dataset:\n", " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", " return dataset" ] @@ -119,7 +119,7 @@ "from ray.train.xgboost import XGBoostTrainer\n", "\n", "\n", - "def train_model_xgboost(train_dataset: ray.data.Datastream, wandb_project: str) -> Result:\n", + "def train_model_xgboost(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", " trainer = XGBoostTrainer(\n", " scaling_config=ScalingConfig(num_workers=2),\n", @@ -160,7 +160,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-10-28 16:28:19,325\tINFO worker.py:1524 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "2022-10-28 16:28:19,325\tINFO worker.py:1524 -- Started a local Ray instance. View the dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8265 \u001B[39m\u001B[22m\n", "2022-10-28 16:28:22,993\tWARNING read_api.py:297 -- ⚠️ The number of blocks in this dataset (1) limits its parallelism to 1 concurrent tasks. This is much less than the number of available CPU slots in the cluster. Use `.repartition(n)` to increase the number of dataset blocks.\n", "2022-10-28 16:28:26,033\tINFO wandb.py:267 -- Already logged into W&B.\n" ] @@ -285,7 +285,7 @@ "from ray.train.torch import TorchTrainer\n", "\n", "\n", - "def train_model_torch(train_dataset: ray.data.Datastream, wandb_project: str) -> Result:\n", + "def train_model_torch(train_dataset: ray.data.Dataset, wandb_project: str) -> Result:\n", " \"\"\"Train a simple XGBoost model and return the result.\"\"\"\n", " trainer = TorchTrainer(\n", " train_loop_per_worker=train_loop,\n", diff --git a/doc/source/ray-air/examples/xgboost_example.ipynb b/doc/source/ray-air/examples/xgboost_example.ipynb index 0705bebddd1e..671c309bbec9 100644 --- a/doc/source/ray-air/examples/xgboost_example.ipynb +++ b/doc/source/ray-air/examples/xgboost_example.ipynb @@ -77,7 +77,7 @@ "from ray.train.xgboost import XGBoostPredictor\n", "from ray.train.xgboost import XGBoostTrainer\n", "from ray.air.config import ScalingConfig\n", - "from ray.data import Datastream\n", + "from ray.data import Dataset\n", "from ray.air.result import Result\n", "from ray.data.preprocessors import StandardScaler" ] @@ -101,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "def prepare_data() -> Tuple[Datastream, Datastream, Datastream]:\n", + "def prepare_data() -> Tuple[Dataset, Dataset, Dataset]:\n", " dataset = ray.data.read_csv(\"s3://anonymous@air-example-data/breast_cancer.csv\")\n", " train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)\n", " 
test_dataset = valid_dataset.drop_columns([\"target\"])\n", @@ -252,8 +252,8 @@ "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.Float64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", "\u001b[2m\u001b[36m(pid=1493910)\u001b[0m FutureWarning: pandas.UInt64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Datastream 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", - "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Datastream 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Dataset 'train' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", + "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m UserWarning: Dataset 'valid' has 1 blocks, which is less than the `num_workers` 2. This dataset will be automatically repartitioned to 2 blocks.\n", "\u001b[2m\u001b[36m(XGBoostTrainer pid=1493910)\u001b[0m 2022-06-22 17:29:04,073\tINFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.\n", "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m /home/ubuntu/ray/venv/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", "\u001b[2m\u001b[36m(pid=1494007)\u001b[0m from pandas import MultiIndex, Int64Index\n", diff --git a/doc/source/ray-air/examples/xgboost_starter.py b/doc/source/ray-air/examples/xgboost_starter.py index 7938072e5e2d..3925e1672b09 100644 --- a/doc/source/ray-air/examples/xgboost_starter.py +++ b/doc/source/ray-air/examples/xgboost_starter.py @@ -45,8 +45,8 @@ datasets={"train": train_dataset, "valid": valid_dataset}, preprocessor=preprocessor, ) -result = trainer.fit() -print(result.metrics) +best_result = trainer.fit() +print(best_result.metrics) # __air_xgb_train_end__ # __air_xgb_tuner_start__ diff --git a/doc/source/ray-air/experimental-features.rst b/doc/source/ray-air/experimental-features.rst new file mode 100644 index 000000000000..b8c338fee290 --- /dev/null +++ b/doc/source/ray-air/experimental-features.rst @@ -0,0 +1,133 @@ +.. _air-experimental-features: + +================================ +Experimental features in Ray AIR +================================ + +The Ray Team is testing a number of experimental features in Ray AIR. + +During development, the features +are disabled per default. You can opt-in by setting a +feature-specific environment variable. + +After some time, the Ray Team enables the feature by default to gather +more feedback from the community. In that case, you can still +disable the feature using the same environment variable to +fully revert to the old behavior. + +If you run into issues with experimental features, +`open an issue `_ +on GitHub. The Ray Team considers feedback before removing +the old implementation and making the new implementation the +default. + +.. note:: + + Experimental features can undergo frequent changes, + especially on the master branch and the nightly wheels. + +.. _air-experimental-new-output: + +Context-aware progress reporting +-------------------------------- + +.. note:: + + This feature is *disabled by default* in Ray 2.5. 
+ + To enable, set the environment variable ``RAY_AIR_NEW_OUTPUT=1``. + +A context-aware output engine is available for Ray Train and Ray Tune runs. + +This output engine affects how the training progress +is printed in the console. The output changes depending on the execution +context: Ray Tune runs will be displayed differently to Ray Train runs. + +The features include: + +- Ray Train runs report status relevant to the single training run. + It does not use the default Ray Tune table layout from previous versions. +- The table format has been updated. +- The format of reporting configurations and observed metrics is different from pervious versions. +- Significant reduction in the default metrics displayed in the console output for runs (e.g., RLlib runs). +- Decluttered the output to improve readability. + + +This output feature only works for the regular console. +It is automatically disabled when you use Jupyter Notebooks +or Ray client. + + +.. _air-experimental-rich: + +Rich layout (sticky status) +--------------------------- + +.. note:: + + This feature is *disabled by default*. + + To enable, set the environment variable ``RAY_AIR_RICH_LAYOUT=1``. + +The :ref:`context-aware output engine ` +exposes an advanced layout using the +`rich `_ library. + +The *rich* layout provides a sticky +status table: The regular console logs are still printed +as before, but the trial overview table (in Ray Tune) is stuck to the bottom of the +screen and periodically updated. + +This feature is still in development. You can opt-in to try +it out. + +To opt-in, set the ``RAY_AIR_RICH_LAYOUT=1`` environment variable +and install rich (``pip install rich``). + +.. figure:: images/rich-sticky-status.png + + +.. _air-experimental-execution: + +Event-based trial execution engine +---------------------------------- + +.. note:: + + This feature is *enabled by default* starting Ray 2.5. + + To disable, set the environment variable ``TUNE_NEW_EXECUTION=0``. 
+ + +Ray Tune has an updated trial execution engine. +Since Ray Tune is also the execution backend for +Ray Train, the updated engine affects both tuning and training runs. + +The update is a refactor of the :ref:`TrialRunner ` +which uses a generic Ray actor and future manager instead of +the previous ``RayTrialExecutor``. This manager exposes an +interface to react to scheduling and task execution events, which makes +it easier to maintain and develop. + +This is a drop-in replacement of an internal class, and you shouldn't see +any change to the previous behavior. + +However, if you notice any odd behavior, you can opt out of +the event-based execution engine and see if it resolves your problem. + +In that case, please `open an issue `_ +on GitHub, ideally with a reproducible script. + +Things to look out for: + +- Less trials are running in parallel than before +- It takes longer to start new trials (or goes much faster) +- The tuning run finishes, but the script does not exit +- The end-to-end runtime is much slower than before +- The CPU load on the head node is high, + even though the training jobs don't + require many resources or don't run on the head node +- Any exceptions are raised that indicate an error in starting or + stopping trials or the experiment + +Note that some edge cases may not be captured in the regression tests. Your feedback is welcome. diff --git a/doc/source/ray-air/getting-started.rst b/doc/source/ray-air/getting-started.rst index aa998f69dc71..e35f1e7c1192 100644 --- a/doc/source/ray-air/getting-started.rst +++ b/doc/source/ray-air/getting-started.rst @@ -84,78 +84,88 @@ First, let's start by loading a dataset from storage: Then, we define a ``Preprocessor`` pipeline for our task: -.. tabbed:: XGBoost +.. tabs:: - .. literalinclude:: examples/xgboost_starter.py - :language: python - :start-after: __air_xgb_preprocess_start__ - :end-before: __air_xgb_preprocess_end__ + .. group-tab:: XGBoost -.. tabbed:: Pytorch + .. 
literalinclude:: examples/xgboost_starter.py + :language: python + :start-after: __air_xgb_preprocess_start__ + :end-before: __air_xgb_preprocess_end__ - .. literalinclude:: examples/pytorch_tabular_starter.py - :language: python - :start-after: __air_pytorch_preprocess_start__ - :end-before: __air_pytorch_preprocess_end__ + .. group-tab:: Pytorch -.. tabbed:: Tensorflow + .. literalinclude:: examples/pytorch_tabular_starter.py + :language: python + :start-after: __air_pytorch_preprocess_start__ + :end-before: __air_pytorch_preprocess_end__ - .. literalinclude:: examples/tf_tabular_starter.py - :language: python - :start-after: __air_tf_preprocess_start__ - :end-before: __air_tf_preprocess_end__ + .. group-tab:: Tensorflow + + .. literalinclude:: examples/tf_tabular_starter.py + :language: python + :start-after: __air_tf_preprocess_start__ + :end-before: __air_tf_preprocess_end__ + +.. _air-getting-started-training: Training ~~~~~~~~ Train a model with a ``Trainer`` with common ML frameworks: -.. tabbed:: XGBoost +.. tabs:: + + .. group-tab:: XGBoost - .. literalinclude:: examples/xgboost_starter.py - :language: python - :start-after: __air_xgb_train_start__ - :end-before: __air_xgb_train_end__ + .. literalinclude:: examples/xgboost_starter.py + :language: python + :start-after: __air_xgb_train_start__ + :end-before: __air_xgb_train_end__ -.. tabbed:: Pytorch + .. group-tab:: Pytorch - .. literalinclude:: examples/pytorch_tabular_starter.py - :language: python - :start-after: __air_pytorch_train_start__ - :end-before: __air_pytorch_train_end__ + .. literalinclude:: examples/pytorch_tabular_starter.py + :language: python + :start-after: __air_pytorch_train_start__ + :end-before: __air_pytorch_train_end__ -.. tabbed:: Tensorflow + .. group-tab:: Tensorflow - .. literalinclude:: examples/tf_tabular_starter.py - :language: python - :start-after: __air_tf_train_start__ - :end-before: __air_tf_train_end__ + .. 
literalinclude:: examples/tf_tabular_starter.py + :language: python + :start-after: __air_tf_train_start__ + :end-before: __air_tf_train_end__ + +.. _air-getting-started-tuning: Hyperparameter Tuning ~~~~~~~~~~~~~~~~~~~~~ You can specify a hyperparameter space to search over for each trainer: -.. tabbed:: XGBoost +.. tabs:: + + .. group-tab:: XGBoost - .. literalinclude:: examples/xgboost_starter.py - :language: python - :start-after: __air_xgb_tuner_start__ - :end-before: __air_xgb_tuner_end__ + .. literalinclude:: examples/xgboost_starter.py + :language: python + :start-after: __air_xgb_tuner_start__ + :end-before: __air_xgb_tuner_end__ -.. tabbed:: Pytorch + .. group-tab:: Pytorch - .. literalinclude:: examples/pytorch_tabular_starter.py - :language: python - :start-after: __air_pytorch_tuner_start__ - :end-before: __air_pytorch_tuner_end__ + .. literalinclude:: examples/pytorch_tabular_starter.py + :language: python + :start-after: __air_pytorch_tuner_start__ + :end-before: __air_pytorch_tuner_end__ -.. tabbed:: Tensorflow + .. group-tab:: Tensorflow - .. literalinclude:: examples/tf_tabular_starter.py - :language: python - :start-after: __air_tf_tuner_start__ - :end-before: __air_tf_tuner_end__ + .. literalinclude:: examples/tf_tabular_starter.py + :language: python + :start-after: __air_tf_tuner_start__ + :end-before: __air_tf_tuner_end__ Then use the ``Tuner`` to run the search: @@ -167,29 +177,30 @@ Then use the ``Tuner`` to run the search: Batch Inference ~~~~~~~~~~~~~~~ -Use the trained model for scalable batch prediction with a ``BatchPredictor``. +After running the steps in :ref:`Training ` or :ref:`Tuning `, use the trained model for scalable batch prediction with a ``BatchPredictor``. -.. tabbed:: XGBoost +.. tabs:: - .. literalinclude:: examples/xgboost_starter.py - :language: python - :start-after: __air_xgb_batchpred_start__ - :end-before: __air_xgb_batchpred_end__ + .. group-tab:: XGBoost -.. tabbed:: Pytorch + .. 
literalinclude:: examples/xgboost_starter.py + :language: python + :start-after: __air_xgb_batchpred_start__ + :end-before: __air_xgb_batchpred_end__ - .. literalinclude:: examples/pytorch_tabular_starter.py - :language: python - :start-after: __air_pytorch_batchpred_start__ - :end-before: __air_pytorch_batchpred_end__ + .. group-tab:: Pytorch -.. tabbed:: Tensorflow + .. literalinclude:: examples/pytorch_tabular_starter.py + :language: python + :start-after: __air_pytorch_batchpred_start__ + :end-before: __air_pytorch_batchpred_end__ - .. literalinclude:: examples/tf_tabular_starter.py - :language: python - :start-after: __air_tf_batchpred_start__ - :end-before: __air_tf_batchpred_end__ + .. group-tab:: Tensorflow + .. literalinclude:: examples/tf_tabular_starter.py + :language: python + :start-after: __air_tf_batchpred_start__ + :end-before: __air_tf_batchpred_end__ Project Status -------------- diff --git a/doc/source/ray-air/images/rich-sticky-status.png b/doc/source/ray-air/images/rich-sticky-status.png new file mode 100644 index 000000000000..e054d2ceeb23 Binary files /dev/null and b/doc/source/ray-air/images/rich-sticky-status.png differ diff --git a/doc/source/ray-air/key-concepts.rst b/doc/source/ray-air/key-concepts.rst index 7058ca49455e..ea18ef28423b 100644 --- a/doc/source/ray-air/key-concepts.rst +++ b/doc/source/ray-air/key-concepts.rst @@ -10,15 +10,15 @@ Here, we cover the main concepts in AIR. Datasets --------- +----------- -:ref:`Ray Datasets ` are the standard way to load and exchange data in Ray AIR. In AIR, Datasets are used extensively for data loading, preprocessing, and batch inference. +:ref:`Ray Data ` is the standard way to load and exchange data in Ray AIR. It provides a `Dataset ` concept which is used extensively for data loading, preprocessing, and batch inference. Preprocessors ------------- -Preprocessors are primitives that can be used to transform input data into features. 
Preprocessors operate on :ref:`Datasets `, which makes them scalable and compatible with a variety of datasources and dataframe libraries. +Preprocessors are primitives that can be used to transform input data into features. Preprocessors operate on :ref:`Datasets `, which makes them scalable and compatible with a variety of datasources and dataframe libraries. A Preprocessor is fitted during Training, and applied at runtime in both Training and Serving on data batches in the same way. AIR comes with a collection of built-in preprocessors, and you can also define your own with simple templates. @@ -33,7 +33,7 @@ See the documentation on :ref:`Preprocessors `. Trainers -------- -Trainers are wrapper classes around third-party training frameworks such as XGBoost and Pytorch. They are built to help integrate with core Ray actors (for distribution), Ray Tune, and Ray Datasets. +Trainers are wrapper classes around third-party training frameworks such as XGBoost and Pytorch. They are built to help integrate with core Ray actors (for distribution), Ray Tune, and Ray Data. See the documentation on :ref:`Trainers `. diff --git a/doc/source/ray-air/predictors.rst b/doc/source/ray-air/predictors.rst index 994eee76287b..0656bcd4428a 100644 --- a/doc/source/ray-air/predictors.rst +++ b/doc/source/ray-air/predictors.rst @@ -146,34 +146,37 @@ Below, we provide examples of using common frameworks to do batch inference for Tabular ~~~~~~~ -.. tabbed:: XGBoost +.. tab-set:: - .. literalinclude:: examples/xgboost_batch_prediction.py - :language: python + .. tab-item:: XGBoost -.. tabbed:: Pytorch + .. literalinclude:: examples/xgboost_batch_prediction.py + :language: python - .. literalinclude:: examples/pytorch_tabular_batch_prediction.py - :language: python + .. tab-item:: Pytorch -.. tabbed:: Tensorflow + .. literalinclude:: examples/pytorch_tabular_batch_prediction.py + :language: python - .. literalinclude:: examples/tf_tabular_batch_prediction.py - :language: python + .. 
tab-item:: Tensorflow + .. literalinclude:: examples/tf_tabular_batch_prediction.py + :language: python Image ~~~~~ -.. tabbed:: Pytorch +.. tab-set:: - .. literalinclude:: examples/torch_image_batch_pretrained.py - :language: python + .. tab-item:: Pytorch + + .. literalinclude:: examples/torch_image_batch_pretrained.py + :language: python -.. tabbed:: Tensorflow + .. tab-item:: Tensorflow - Coming soon! + Coming soon! Text ~~~~ diff --git a/doc/source/ray-air/preprocessors.rst b/doc/source/ray-air/preprocessors.rst index 8b562e7a1525..a589d2dbdac2 100644 --- a/doc/source/ray-air/preprocessors.rst +++ b/doc/source/ray-air/preprocessors.rst @@ -15,7 +15,7 @@ Ray AIR provides several common preprocessors out of the box and interfaces to d Overview -------- -The most common way of using a preprocessor is by passing it as an argument to the constructor of a :ref:`Trainer ` in conjunction with a :ref:`Ray Dataset `. +The most common way of using a preprocessor is by passing it as an argument to the constructor of a :ref:`Trainer ` in conjunction with a :ref:`Ray Data `. For example, the following code trains a model with a preprocessor that normalizes the data. .. literalinclude:: doc_code/preprocessors.py diff --git a/doc/source/ray-air/trainers.rst b/doc/source/ray-air/trainers.rst index 3c5182b95977..3eaa89c212b3 100644 --- a/doc/source/ray-air/trainers.rst +++ b/doc/source/ray-air/trainers.rst @@ -67,22 +67,24 @@ Read more about :ref:`Ray Train's Deep Learning Trainers `. .. dropdown:: Code examples - .. tabbed:: Torch + .. tab-set:: - .. literalinclude:: doc_code/torch_trainer.py - :language: python + .. tab-item:: Torch - .. tabbed:: Tensorflow + .. literalinclude:: doc_code/torch_trainer.py + :language: python - .. literalinclude:: doc_code/tf_starter.py - :language: python - :start-after: __air_tf_train_start__ - :end-before: __air_tf_train_end__ + .. tab-item:: Tensorflow - .. tabbed:: Horovod + .. 
literalinclude:: doc_code/tf_starter.py + :language: python + :start-after: __air_tf_train_start__ + :end-before: __air_tf_train_end__ - .. literalinclude:: doc_code/hvd_trainer.py - :language: python + .. tab-item:: Horovod + + .. literalinclude:: doc_code/hvd_trainer.py + :language: python How to report metrics and checkpoints? @@ -143,17 +145,17 @@ Other Trainers Hugging Face ~~~~~~~~~~~~ -Transformers -************ +TransformersTrainer +******************* -:class:`HuggingFaceTrainer ` further extends :class:`TorchTrainer `, built +:class:`TransformersTrainer ` further extends :class:`TorchTrainer `, built for interoperability with the HuggingFace Transformers library. Users are required to provide a ``trainer_init_per_worker`` function which returns a ``transformers.Trainer`` object. The ``trainer_init_per_worker`` function will have access to preprocessed train and evaluation datasets. -Upon calling `HuggingFaceTrainer.fit()`, multiple workers (ray actors) will be spawned, +Upon calling `TransformersTrainer.fit()`, multiple workers (ray actors) will be spawned, and each worker will create its own copy of a ``transformers.Trainer``. Each worker will then invoke ``transformers.Trainer.train()``, which will perform distributed @@ -167,20 +169,20 @@ training via Pytorch DDP. :start-after: __hf_trainer_start__ :end-before: __hf_trainer_end__ -Accelerate -********** +AccelerateTrainer +***************** -If you prefer a more fine-grained Hugging Face API than what Transformers provides, you can use :class:`AccelerateTrainer ` -to run training functions making use of Hugging Face Accelerate. Similarly to :class:`HuggingFaceTrainer `, :class:`AccelerateTrainer ` +If you prefer a more fine-grained Hugging Face API than what Transformers provides, you can use :class:`AccelerateTrainer ` +to run training functions making use of Hugging Face Accelerate. Similarly to :class:`TransformersTrainer `, :class:`AccelerateTrainer ` is also an extension of :class:`TorchTrainer `. 
-:class:`AccelerateTrainer ` allows you to pass an Accelerate configuration file generated with ``accelerate config`` to be applied on all training workers. +:class:`AccelerateTrainer ` allows you to pass an Accelerate configuration file generated with ``accelerate config`` to be applied on all training workers. This ensures that the worker environments are set up correctly for Accelerate, allowing you to take advantage of Accelerate APIs and integrations such as DeepSpeed and FSDP just as you would if you were running Accelerate without Ray. .. note:: ``AccelerateTrainer`` will override some settings set with ``accelerate config``, mainly related to - the topology and networking. See the :class:`AccelerateTrainer ` + the topology and networking. See the :class:`AccelerateTrainer ` API reference for more details. Aside from Accelerate support, the usage is identical to :class:`TorchTrainer `, meaning you define your own training function diff --git a/doc/source/ray-air/tuner.rst b/doc/source/ray-air/tuner.rst index 03fb0f6deffd..43f14c470c0f 100644 --- a/doc/source/ray-air/tuner.rst +++ b/doc/source/ray-air/tuner.rst @@ -63,26 +63,28 @@ Depending on the model and dataset, you may want to tune: The following shows some example code on how to specify the ``param_space``. -.. tabbed:: XGBoost +.. tab-set:: - .. literalinclude:: doc_code/tuner.py - :language: python - :start-after: __xgboost_start__ - :end-before: __xgboost_end__ + .. tab-item:: XGBoost -.. tabbed:: Pytorch + .. literalinclude:: doc_code/tuner.py + :language: python + :start-after: __xgboost_start__ + :end-before: __xgboost_end__ - .. literalinclude:: doc_code/tuner.py - :language: python - :start-after: __torch_start__ - :end-before: __torch_end__ + .. tab-item:: Pytorch + + .. literalinclude:: doc_code/tuner.py + :language: python + :start-after: __torch_start__ + :end-before: __torch_end__ Read more about :ref:`Tune search spaces here `. 
You can use a Tuner to tune most arguments and configurations in Ray AIR, including but not limited to: -- Ray Datasets +- Ray Data - Preprocessors - Scaling configurations - and other hyperparameters. diff --git a/doc/source/ray-air/user-guides.rst b/doc/source/ray-air/user-guides.rst index 0650378739b8..ee92e2068367 100644 --- a/doc/source/ray-air/user-guides.rst +++ b/doc/source/ray-air/user-guides.rst @@ -9,80 +9,94 @@ User Guides AIR User Guides --------------- -.. panels:: - :container: text-center - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto fixed-height-img +.. grid:: 3 + :gutter: 2 + :class-container: container pb-4 - --- - :img-top: /ray-air/images/preprocessors.svg + .. grid-item-card:: + :img-top: /ray-air/images/preprocessors.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/drawings/d/1ZIbsXv5vvwTVIEr2aooKxuYJ_VL7-8VMNlRinAiPaTI/edit + +++ + .. button-ref:: /ray-air/preprocessors + :color: primary + :outline: + :expand: - .. link-button:: /ray-air/preprocessors - :type: ref - :text: Using Preprocessors - :classes: btn-link btn-block stretched-link + Using Preprocessors - --- - :img-top: /ray-air/images/train-icon.svg + .. grid-item-card:: + :img-top: /ray-air/images/train-icon.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/drawings/d/15SXGHbKPWdrzx3aTAIFcO2uh_s6Q7jLU03UMuwKSzzM/edit + +++ + .. button-ref:: trainers + :color: primary + :outline: + :expand: - .. link-button:: trainers - :type: ref - :text: Using Trainers - :classes: btn-link btn-block stretched-link + Using Trainers - --- - :img-top: /ray-air/images/ingest-icon.svg + .. grid-item-card:: + :img-top: /ray-air/images/ingest-icon.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/drawings/d/10GZE_6s6ss8PSxLYyzcbj6yEalWO4N7MS7ao8KO7ne0/edit + +++ + .. button-ref:: air-ingest + :color: primary + :outline: + :expand: - .. 
link-button:: air-ingest - :type: ref - :text: Configuring Training Datasets - :classes: btn-link btn-block stretched-link + Configuring Training Datasets - --- - :img-top: /ray-air/images/tuner.svg + .. grid-item-card:: + :img-top: /ray-air/images/tuner.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/drawings/d/1yMd12iMkyo6DGrFoET1TIlKfFnXX9dfh2u3GSdTz6W4/edit + +++ + .. button-ref:: /ray-air/tuner + :color: primary + :outline: + :expand: - .. link-button:: /ray-air/tuner - :type: ref - :text: Configuring Hyperparameter Tuning - :classes: btn-link btn-block stretched-link + Configuring Hyperparameter Tuning - --- - :img-top: /ray-air/images/predictors.png + .. grid-item-card:: + :img-top: /ray-air/images/predictors.png + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/presentation/d/1jfkQk0tGqgkLgl10vp4-xjcbYG9EEtlZV_Vnve_NenQ/edit#slide=id.g131c21f5e88_0_549 + +++ + .. button-ref:: predictors + :color: primary + :outline: + :expand: - .. link-button:: predictors - :type: ref - :text: Using Predictors for Inference - :classes: btn-link btn-block stretched-link + Using Predictors for Inference - --- - :img-top: /ray-air/images/serve-icon.svg + .. grid-item-card:: + :img-top: /ray-air/images/serve-icon.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. https://docs.google.com/drawings/d/1-rg77bV-vEMURXZw5_mIOUFM3FObIIYbFOiYzFJW_68/edit + +++ + .. button-ref:: /ray-air/examples/serving_guide + :color: primary + :outline: + :expand: - .. link-button:: /ray-air/examples/serving_guide - :type: ref - :text: Deploying Predictors with Serve - :classes: btn-link btn-block stretched-link + Deploying Predictors with Serve - --- - :img-top: /ray-air/images/air-deploy.svg + .. grid-item-card:: + :img-top: /ray-air/images/air-deploy.svg + :class-img-top: pt-5 w-75 d-block mx-auto fixed-height-img - .. 
https://docs.google.com/drawings/d/1ja1RfNCEFn50B9FHWSemUzwhtPAmVyoak1JqEJUmxs4/edit + +++ + .. button-ref:: air-deployment + :color: primary + :outline: + :expand: + + How to Deploy AIR - .. link-button:: air-deployment - :type: ref - :text: How to Deploy AIR - :classes: btn-link btn-block stretched-link .. _air-env-vars: @@ -96,6 +110,11 @@ Please also see the :ref:`Ray Tune environment variables `. - **RAY_AIR_FULL_TRACEBACKS**: If set to 1, will print full tracebacks for training functions, including internal code paths. Otherwise, abbreviated tracebacks that only show user code are printed. Defaults to 0 (disabled). +- **RAY_AIR_NEW_OUTPUT**: If set to 0, this disables + the :ref:`experimental new console output `. +- **RAY_AIR_RICH_LAYOUT**: If set to 1, this enables + the :ref:`stick table layout ` + (only available for Ray Tune). .. _air-multi-tenancy: @@ -111,3 +130,20 @@ If you still want to do this, refer to the :ref:`Ray Tune multi-tenancy docs ` for potential pitfalls. + +.. _air-experimental-overview: + +Experimental features in Ray 2.5+ +--------------------------------- +Starting in Ray 2.5, some experimental +features are enabled by default. + +Experimental features are enabled to allow for feedback +from users. Every experimental feature can be disabled +by setting an environment variable. Some features are +not ready for general testing and can only be *enabled* using an +environment variable. + +Please see the :ref:`experimental features ` +page for more details on the current features and how to enable +or disable them. diff --git a/doc/source/ray-contribute/debugging.rst b/doc/source/ray-contribute/debugging.rst index b156b35e6b61..66a602e891b0 100644 --- a/doc/source/ray-contribute/debugging.rst +++ b/doc/source/ray-contribute/debugging.rst @@ -1,5 +1,7 @@ -Debugging (internal) -==================== +Debugging for Ray Developers +============================ + +This debugging guide is for contributors to the Ray project. 
Starting processes in a debugger -------------------------------- diff --git a/doc/source/ray-contribute/development.rst b/doc/source/ray-contribute/development.rst index 8291c63d6ee7..e77d80c335cc 100644 --- a/doc/source/ray-contribute/development.rst +++ b/doc/source/ray-contribute/development.rst @@ -19,21 +19,23 @@ Clone the repository To build Ray locally you will need to have the Git repository, so first, fork it on GitHub. Then you can clone it to your machine: -.. tabbed:: Git SSH +.. tab-set:: - To clone the repository using Git with SSH (the default) run: + .. tab-item:: Git SSH - .. code-block:: shell + To clone the repository using Git with SSH (the default) run: - git clone git@github.com:[your username]/ray.git + .. code-block:: shell -.. tabbed:: Git HTTPS + git clone git@github.com:[your username]/ray.git - To clone the repository using Git with HTTPS run: + .. tab-item:: Git HTTPS - .. code-block:: shell + To clone the repository using Git with HTTPS run: - git clone https://github.com/[your username]/ray.git + .. code-block:: shell + + git clone https://github.com/[your username]/ray.git Then you can enter into the Ray git repository directory: @@ -43,21 +45,23 @@ Then you can enter into the Ray git repository directory: Next make sure you connect your repository to the upstream (main project) Ray repository. This will allow you to push your code to your repository when proposing changes (in pull requests) while also pulling updates from the main project. -.. tabbed:: Git SSH +.. tab-set:: + + .. tab-item:: Git SSH - To connect your repository using SSH (the default) run the command: + To connect your repository using SSH (the default) run the command: - .. code-block:: shell + .. code-block:: shell - git remote add upstream git@github.com:ray-project/ray.git + git remote add upstream git@github.com:ray-project/ray.git -.. tabbed:: Git HTTPS + .. 
tab-item:: Git HTTPS - To connect your repository using HTTPS run the command: + To connect your repository using HTTPS run the command: - .. code-block:: shell + .. code-block:: shell - git remote add upstream https://github.com/ray-project/ray.git + git remote add upstream https://github.com/ray-project/ray.git Every time you want to update your local version you can pull the changes from the main repository: @@ -73,46 +77,48 @@ Prepare the Python environment You probably want some type of Python virtual environment. For example, you can use Anaconda's ``conda``. -.. tabbed:: conda +.. tab-set:: + + .. tab-item:: conda + + Set up a ``conda`` environment named ``ray``: + + .. code-block:: shell - Set up a ``conda`` environment named ``ray``: + conda create -c conda-forge python=3.9 -n ray - .. code-block:: shell - conda create -c conda-forge python=3.9 -n ray + Activate your virtual environment to tell the shell/terminal to use this particular Python: + .. code-block:: shell - Activate your virtual environment to tell the shell/terminal to use this particular Python: + conda activate ray - .. code-block:: shell - - conda activate ray - - You need to activate the virtual environment every time you start a new shell/terminal to work on Ray. + You need to activate the virtual environment every time you start a new shell/terminal to work on Ray. -.. tabbed:: venv + .. tab-item:: venv - Use Python's integrated ``venv`` module to create a virtual environment called ``venv`` in the current directory: + Use Python's integrated ``venv`` module to create a virtual environment called ``venv`` in the current directory: - .. code-block:: shell + .. code-block:: shell - python -m venv venv + python -m venv venv - This contains a directory with all the packages used by the local Python of your project. You only need to do this step once. + This contains a directory with all the packages used by the local Python of your project. You only need to do this step once. 
- Activate your virtual environment to tell the shell/terminal to use this particular Python: + Activate your virtual environment to tell the shell/terminal to use this particular Python: - .. code-block:: shell + .. code-block:: shell - source venv/bin/activate + source venv/bin/activate - You need to activate the virtual environment every time you start a new shell/terminal to work on Ray. + You need to activate the virtual environment every time you start a new shell/terminal to work on Ray. - Creating a new virtual environment can come with older versions of ``pip`` and ``wheel``. To avoid problems when you install packages, use the module ``pip`` to install the latest version of ``pip`` (itself) and ``wheel``: + Creating a new virtual environment can come with older versions of ``pip`` and ``wheel``. To avoid problems when you install packages, use the module ``pip`` to install the latest version of ``pip`` (itself) and ``wheel``: - .. code-block:: shell + .. code-block:: shell - python -m pip install --upgrade pip wheel + python -m pip install --upgrade pip wheel .. 
_python-develop: diff --git a/doc/source/ray-contribute/doc_code/example_module.py b/doc/source/ray-contribute/doc_code/example_module.py new file mode 100644 index 000000000000..cba47448a945 --- /dev/null +++ b/doc/source/ray-contribute/doc_code/example_module.py @@ -0,0 +1,8 @@ +# example_module.py + +# fmt: off +# __is_even_begin__ +def is_even(x): + return (x % 2) == 0 +# __is_even_end__ +# fmt: on diff --git a/doc/source/ray-contribute/docs.ipynb b/doc/source/ray-contribute/docs.ipynb index 47fa7f49f730..7588db4ea287 100644 --- a/doc/source/ray-contribute/docs.ipynb +++ b/doc/source/ray-contribute/docs.ipynb @@ -28,28 +28,27 @@ "cd ray/doc\n", "```\n", "\n", - "**Note**: If you are on an Apple Silicon (M1) read the instructions below for installing the dependencies.\n", + "**Note**: If you are using Apple Silicon (M1), follow the instructions below before continuing.\n", "\n", - "Make sure you activate the Python environment you are using (e.g. venv, conda, etc.) and then to install the documentation dependencies, run the following command:\n", + "Activate the Python environment you are using (e.g., venv, conda, etc.). 
Install the documentation dependencies, with the following command:\n", "\n", "```shell\n", "pip install -r requirements-doc.txt\n", "```\n", "\n", - "Additionally, it's best if you install the dependencies for our linters with\n", + "Install the dependencies for our linters to ensure your changes comply with our style guide.\n", "\n", "```shell\n", "pip install -r ../python/requirements_linters.txt\n", "```\n", "\n", - "so that you can make sure your changes comply with our style guide.\n", - "Building the documentation is done by running the following command:\n", + "Build the documentation by running the following command:\n", "\n", "```shell\n", "make develop\n", "```\n", "\n", - "which will build the documentation into the `_build` directory.\n", + "Find the documentation build in the `_build` directory.\n", "After the build finishes, you can simply open the `_build/html/index.html` file in your browser.\n", "It's considered good practice to check the output of your build to make sure everything is working as expected.\n", "\n", @@ -157,38 +156,14 @@ "For example, in the above `autofunction` call, to change the API reference for `ray.tune.integration.docker.DockerSyncer`,\n", "you would have to [change the following source file](https://github.com/ray-project/ray/blob/7f1bacc7dc9caf6d0ec042e39499bbf1d9a7d065/python/ray/tune/integration/docker.py#L15-L38).\n", "\n", - "To show the usage of APIs, it is important to have small usage examples embedded in the API documentation. These should be self-contained and run out of the box, so a user can copy and paste them into a Python interpreter and play around with them (e.g., if applicable, they should point to example data). Users often rely on these examples to build their applications. You can use the Sphinx `testcode` primitive to embed\n", - "such examples into the `Examples:` section of the docstrings. 
For an example look [here](https://github.com/ray-project/ray/blob/5e61fb51400bc712449e85a7476fa4fca80f3b41/python/ray/train/torch/torch_predictor.py#L173-L221)\n", - "\n", - "These code snippets will be tested in the CI to make sure they keep working in the future and updated if there are changes to the APIs. You can use the `testoutput` primitive to specify the expected output of the code snippet, and the CI will check the output and give an error if they don't match.\n", - "\n", - "To run the doctests locally, run\n", - "\n", - "```shell\n", - "RAY_MOCK_MODULES=0 make doctest\n", - "```\n", - "\n", - "in the `ray/doc` directory.\n", + "To show the usage of APIs, it is important to have small usage examples embedded in the API documentation. These should be self-contained and run out of the box, so a user can copy and paste them into a Python interpreter and play around with them (e.g., if applicable, they should point to example data). Users often rely on these examples to build their applications. To learn more about writing examples, read [How to write code snippets](writing-code-snippets).\n", "\n", "## Adding code to an `.rST` or `.md` file\n", "\n", "Modifying text in an existing documentation file is easy, but you need to be careful when it comes to adding code.\n", "The reason is that we want to ensure every code snippet on our documentation is tested.\n", - "This requires us to have a process for including and testing code snippets in documents.\n", - "\n", - "In an `.rST` or `.md` file, you can add code snippets using `literalinclude` from the Sphinx system.\n", - "For instance, here's an example from the Tune's \"Key Concepts\" documentation: \n", - "\n", - "```markdown\n", - ".. 
literalinclude:: doc_code/key_concepts.py\n", - " :language: python\n", - " :start-after: __function_api_start__\n", - " :end-before: __function_api_end__\n", - "```\n", - "\n", - "Note that in the whole file there's not a single literal code block, code _has to be_ imported using the `literalinclude` directive.\n", - "The code that gets added to the document by `literalinclude`, including `start-after` and `end-before` tags,\n", - "reads as follows:" + "This requires us to have a process for including and testing code snippets in documents. To learn how to write testable code \n", + "snippets, read [How to write code snippets](writing-code-snippets).\n" ] }, { diff --git a/doc/source/ray-contribute/getting-involved.rst b/doc/source/ray-contribute/getting-involved.rst index 5b882ca9f26d..9bccb133a64d 100644 --- a/doc/source/ray-contribute/getting-involved.rst +++ b/doc/source/ray-contribute/getting-involved.rst @@ -34,7 +34,7 @@ What can I work on? ------------------- We use Github to track issues, feature requests, and bugs. Take a look at the -ones labeled `"good first issue" `__ and `"help wanted" `__ for a place to start. +ones labeled `"good first issue" `__ for a place to start. Setting up your development environment --------------------------------------- @@ -112,7 +112,7 @@ The full suite of tests is too large to run on a single machine. However, you ca This will run all of the tests in the file. To run a specific test, use the following: .. code-block:: shell - + # Directly calling `pytest -v ...` may lose import paths. python -m pytest -v -s test_file.py::name_of_the_test @@ -136,9 +136,9 @@ Code Style In general, we follow the `Google style guide `__ for C++ code and the `Black code style `__ for Python code. Python imports follow `PEP8 style `__. However, it is more important for code to be in a locally consistent style than to strictly follow guidelines. Whenever in doubt, follow the local code style of the component. 
-For Python documentation, we follow a subset of the `Google pydoc format `__. The following code snippet demonstrates the canonical Ray pydoc formatting: +For Python documentation, we follow a subset of the `Google pydoc format `__. The following code snippets demonstrate the canonical Ray pydoc formatting: -.. code-block:: python +.. testcode:: def ray_canonical_doc_style(param1: int, param2: str) -> bool: """First sentence MUST be inline with the quotes and fit on one line. @@ -147,17 +147,19 @@ For Python documentation, we follow a subset of the `Google pydoc format >> # Provide code examples as possible. - >>> ray_canonical_doc_style(41, "hello") - True + .. doctest:: - >>> # A second example. - >>> ray_canonical_doc_style(72, "goodbye") - False + >>> # Provide code examples for key use cases, as possible. + >>> ray_canonical_doc_style(41, "hello") + True + + >>> # A second example. + >>> ray_canonical_doc_style(72, "goodbye") + False Args: param1: The first parameter. Do not include the types in the - docstring (they should be defined only in the signature). + docstring. They should be defined only in the signature. Multi-line parameter docs should be indented by four spaces. param2: The second parameter. @@ -165,6 +167,56 @@ For Python documentation, we follow a subset of the `Google pydoc format str: + """Public property of the class. + + Properties created with the @property decorator + should be documented here. + """ + return "hello" + + def increment_attr1(self) -> None: + """Class methods are similar to regular functions. + + See above about how to document functions. + """ + + self.attr1 = self.attr1 + 1 + +See :ref:`this ` for more details about how to write code snippets in docstrings. 
+ Lint and Formatting ~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/ray-contribute/profiling.rst b/doc/source/ray-contribute/profiling.rst index 88263f9225fd..f9d74b11add3 100644 --- a/doc/source/ray-contribute/profiling.rst +++ b/doc/source/ray-contribute/profiling.rst @@ -1,9 +1,9 @@ .. _ray-core-internal-profiling: -Profiling (internal) -==================== +Profiling for Ray Developers +============================ -This document details, for Ray developers, how to analyze Ray performance. +This guide helps contributors to the Ray project analyze Ray performance. Getting a stack trace of Ray C++ processes ------------------------------------------ diff --git a/doc/source/ray-contribute/stability.rst b/doc/source/ray-contribute/stability.rst index 136ac4aa44d6..a4676b93e1f9 100644 --- a/doc/source/ray-contribute/stability.rst +++ b/doc/source/ray-contribute/stability.rst @@ -42,6 +42,8 @@ but **may** include backwards-incompatible changes to beta components. Backwards-incompatible changes **must** be made only after a reasonable deprecation period to provide users with an opportunity to migrate their code. +.. _api-stability-stable: + Stable ~~~~~~ diff --git a/doc/source/ray-contribute/writing-code-snippets.rst b/doc/source/ray-contribute/writing-code-snippets.rst new file mode 100644 index 000000000000..9c91dc6a913a --- /dev/null +++ b/doc/source/ray-contribute/writing-code-snippets.rst @@ -0,0 +1,325 @@ +.. _writing-code-snippets_ref: + +========================== +How to write code snippets +========================== + +Users learn from example. So, whether you're writing a docstring or a user guide, +include examples that illustrate the relevant APIs. Your examples should run +out-of-the-box so that users can copy them and adapt them to their own needs. + +This page describes how to write code snippets so that they're tested in CI. + +.. note:: + The examples in this guide use reStructuredText. If you're writing + Markdown, use MyST syntax. 
To learn more, read the + `MyST documentation `_. + +----------------- +Types of examples +----------------- + +There are three types of examples: *doctest-style*, *code-output-style*, and *literalinclude*. + +*doctest-style* examples +======================== + +*doctest-style* examples mimic interactive Python sessions. :: + + .. doctest:: + + >>> def is_even(x): + ... return (x % 2) == 0 + >>> is_even(0) + True + >>> is_even(1) + False + +They're rendered like this: + +.. doctest:: + + >>> def is_even(x): + ... return (x % 2) == 0 + >>> is_even(0) + True + >>> is_even(1) + False + +.. tip:: + + If you're writing docstrings, exclude `.. doctest::` to simplify your code. :: + + Example: + >>> def is_even(x): + ... return (x % 2) == 0 + >>> is_even(0) + True + >>> is_even(1) + False + +*code-output-style* examples +============================ + +*code-output-style* examples contain ordinary Python code. :: + + .. testcode:: + + def is_even(x): + return (x % 2) == 0 + + print(is_even(0)) + print(is_even(1)) + + .. testoutput:: + + True + False + +They're rendered like this: + +.. testcode:: + + def is_even(x): + return (x % 2) == 0 + + print(is_even(0)) + print(is_even(1)) + +.. testoutput:: + + True + False + +*literalinclude* examples +========================= + +*literalinclude* examples display Python modules. :: + + .. literalinclude:: ./doc_code/example_module.py + :language: python + :start-after: __is_even_begin__ + :end-before: __is_even_end__ + +.. literalinclude:: ./doc_code/example_module.py + :language: python + +They're rendered like this: + +.. literalinclude:: ./doc_code/example_module.py + :language: python + :start-after: __is_even_begin__ + :end-before: __is_even_end__ + +--------------------------------------- +Which type of example should you write? +--------------------------------------- + +There's no hard rule about which style you should use. Choose the style that best +illustrates your API. + +.. 
tip:: + If you're not sure which style to use, use *code-block-style*. + +When to use *doctest-style* +=========================== + +If you're writing a small example that emphasizes object representations, or if you +want to print intermediate objects, use *doctest-style*. :: + + .. doctest:: + + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.schema() + Schema({'id': DataType(int64)}) + >>> ds.take(5) + [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}] + +When to use *code-block-style* +============================== + +If you're writing a longer example, or if object representations aren't relevant to your example, use *code-block-style*. :: + + .. testcode:: + + import pandas as pd + import ray + from ray.train.batch_predictor import BatchPredictor + + def calculate_accuracy(df): + return pd.DataFrame({"correct": df["preds"] == df["label"]}) + + # Create a batch predictor that returns identity as the predictions. + batch_pred = BatchPredictor.from_pandas_udf( + lambda data: pd.DataFrame({"preds": data["feature_1"]})) + + # Create a dummy dataset. + ds = ray.data.from_pandas(pd.DataFrame({ + "feature_1": [1, 2, 3], "label": [1, 2, 3]})) + + # Execute batch prediction using this predictor. + predictions = batch_pred.predict(ds, + feature_columns=["feature_1"], keep_columns=["label"]) + + # Calculate final accuracy + correct = predictions.map_batches(calculate_accuracy) + print(f"Final accuracy: {correct.sum(on='correct') / correct.count()}") + + .. testoutput:: + + Final accuracy: 1.0 + +When to use *literalinclude* +============================ +If you're writing an end-to-end example and your example doesn't contain outputs, use +*literalinclude*. + +----------------------------------- +How to handle hard-to-test examples +----------------------------------- + +When is it okay to not test an example?
+======================================= + +You don't need to test examples that require GPUs, or examples that depend on external +systems like Weights and Biases. + +Skipping *doctest-style* examples +================================= + +To skip a *doctest-style* example, append `# doctest: +SKIP` to your Python code. :: + + .. doctest:: + + >>> import ray + >>> ray.data.read_images("s3://private-bucket") # doctest: +SKIP + +Skipping *code-block-style* examples +==================================== + +To skip a *code-block-style* example, add `:skipif: True` to the `testcode` block. :: + + .. testcode:: + :skipif: True + + from ray.air.integrations.wandb import WandbLoggerCallback + callback = WandbLoggerCallback( + project="Optimization_Project", + api_key_file=..., + log_config=True + ) + +----------------------------------------------- +How to handle long or non-deterministic outputs +----------------------------------------------- + +If your Python code is non-deterministic, or if your output is excessively long, you may want to skip all or part of an output. + +Ignoring *doctest-style* outputs +================================ + +To ignore parts of a *doctest-style* output, add `:options: +ELLIPSIS` to +the `doctest` directive and replace problematic sections with ellipsis. :: + + .. doctest:: + :options: +ELLIPSIS + + >>> import ray + >>> ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") + Dataset( + num_blocks=..., + num_rows=..., + schema={image: numpy.ndarray(shape=..., dtype=uint8)} + ) + +If you omit the `doctest` directive, append `# doctest: +ELLIPSIS` to your code instead. + + >>> import ray + >>> ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") # doctest: +ELLIPSIS + Dataset( + num_blocks=..., + num_rows=..., + schema={image: numpy.ndarray(shape=..., dtype=uint8)} + ) + +To ignore an output altogether, write a *code-block-style* snippet. Don't use `# doctest: +SKIP`.
+ +Ignoring *code-block-style* outputs +=================================== + +If parts of your output are long or non-deterministic, add `:options: +ELLIPSIS` to +the `testoutput` directive and replace problematic sections with ellipsis. :: + + .. testcode:: + + import ray + ds = ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") + print(ds) + + .. testoutput:: + :options: +ELLIPSIS + + Dataset( + num_blocks=..., + num_rows=..., + schema={image: numpy.ndarray(shape=..., dtype=uint8)} + ) + +If your output is nondeterministic and you want to display a sample output, add +`:options: +SKIP`. :: + + .. testcode:: + + import random + print(random.random()) + + .. testoutput:: + :options: +SKIP + + 0.969461416250246 + +If your output is hard to test and you don't want to display a sample output, add +`:options: +SKIP` and `:hide:`. :: + + .. testcode:: + + print("This output is hidden and untested") + + .. testoutput:: + :hide: + :options: +SKIP + + ... # Add ellipsis. Otherwise, Sphinx can't parse the block. + +-------------------- +How to test examples +-------------------- + +Testing specific examples +========================= + +To test specific examples, install `pytest-sphinx`. + +.. code-block:: bash + + pip install pytest-sphinx + +Then, run pytest on a module, docstring, or user guide. + +.. code-block:: bash + + pytest --doctest-modules python/ray/data/read_api.py + pytest --doctest-modules python/ray/data/read_api.py::ray.data.read_api.range + pytest --doctest-modules doc/source/data/getting-started.rst + +Testing all examples +==================== + +To test all code snippets, run + +.. code-block:: bash + + RAY_MOCK_MODULES=0 make doctest + +in the `ray/doc` directory. 
diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index 62e6fb79d73b..eabad34c03ed 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -268,7 +268,7 @@ def inference( num_gpus = 1 if use_gpu else 0 dataset.map_batches( model_cls, - compute="actors", + compute=ray.data.ActorPoolStrategy(), batch_size=batch_size, batch_format="pandas", num_gpus=num_gpus, diff --git a/doc/source/ray-core/actors.rst b/doc/source/ray-core/actors.rst index 6f34e8b42954..201b21d79d26 100644 --- a/doc/source/ray-core/actors.rst +++ b/doc/source/ray-core/actors.rst @@ -9,81 +9,85 @@ An actor is essentially a stateful worker (or a service). When a new actor is instantiated, a new worker is created, and methods of the actor are scheduled on that specific worker and can access and mutate the state of that worker. -.. tabbed:: Python +.. tab-set:: - The ``ray.remote`` decorator indicates that instances of the ``Counter`` class will be actors. Each actor runs in its own Python process. + .. tab-item:: Python - .. code-block:: python + The ``ray.remote`` decorator indicates that instances of the ``Counter`` class will be actors. Each actor runs in its own Python process. - @ray.remote - class Counter(object): - def __init__(self): - self.value = 0 + .. testcode:: - def increment(self): - self.value += 1 - return self.value + import ray - def get_counter(self): - return self.value + @ray.remote + class Counter: + def __init__(self): + self.value = 0 - # Create an actor from this class. - counter = Counter.remote() + def increment(self): + self.value += 1 + return self.value -.. tabbed:: Java + def get_counter(self): + return self.value - ``Ray.actor`` is used to create actors from regular Java classes. + # Create an actor from this class. + counter = Counter.remote() - .. code-block:: java + .. 
tab-item:: Java - // A regular Java class. - public class Counter { + ``Ray.actor`` is used to create actors from regular Java classes. - private int value = 0; + .. code-block:: java - public int increment() { - this.value += 1; - return this.value; - } - } + // A regular Java class. + public class Counter { - // Create an actor from this class. - // `Ray.actor` takes a factory method that can produce - // a `Counter` object. Here, we pass `Counter`'s constructor - // as the argument. - ActorHandle counter = Ray.actor(Counter::new).remote(); + private int value = 0; -.. tabbed:: C++ + public int increment() { + this.value += 1; + return this.value; + } + } + + // Create an actor from this class. + // `Ray.actor` takes a factory method that can produce + // a `Counter` object. Here, we pass `Counter`'s constructor + // as the argument. + ActorHandle counter = Ray.actor(Counter::new).remote(); - ``ray::Actor`` is used to create actors from regular C++ classes. + .. tab-item:: C++ - .. code-block:: c++ + ``ray::Actor`` is used to create actors from regular C++ classes. - // A regular C++ class. - class Counter { + .. code-block:: c++ - private: - int value = 0; + // A regular C++ class. + class Counter { - public: - int Increment() { - value += 1; - return value; - } - }; + private: + int value = 0; - // Factory function of Counter class. - static Counter *CreateCounter() { - return new Counter(); - }; + public: + int Increment() { + value += 1; + return value; + } + }; + + // Factory function of Counter class. + static Counter *CreateCounter() { + return new Counter(); + }; - RAY_REMOTE(&Counter::Increment, CreateCounter); + RAY_REMOTE(&Counter::Increment, CreateCounter); - // Create an actor from this class. - // `ray::Actor` takes a factory method that can produce - // a `Counter` object. Here, we pass `Counter`'s factory function - // as the argument. - auto counter = ray::Actor(CreateCounter).Remote(); + // Create an actor from this class. 
+ // `ray::Actor` takes a factory method that can produce + // a `Counter` object. Here, we pass `Counter`'s factory function + // as the argument. + auto counter = ray::Actor(CreateCounter).Remote(); Specifying required resources ----------------------------- @@ -92,28 +96,30 @@ Specifying required resources You can specify resource requirements in actors too (see :ref:`resource-requirements` for more details.) -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. code-block:: python + .. testcode:: - # Specify required resources for an actor. - @ray.remote(num_cpus=2, num_gpus=0.5) - class Actor(object): - pass + # Specify required resources for an actor. + @ray.remote(num_cpus=2, num_gpus=0.5) + class Actor: + pass -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - // Specify required resources for an actor. - Ray.actor(Counter::new).setResource("CPU", 2.0).setResource("GPU", 0.5).remote(); + // Specify required resources for an actor. + Ray.actor(Counter::new).setResource("CPU", 2.0).setResource("GPU", 0.5).remote(); -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - // Specify required resources for an actor. - ray::Actor(CreateCounter).SetResource("CPU", 2.0).SetResource("GPU", 0.5).Remote(); + // Specify required resources for an actor. + ray::Actor(CreateCounter).SetResource("CPU", 2.0).SetResource("GPU", 0.5).Remote(); Calling the actor @@ -123,202 +129,233 @@ We can interact with the actor by calling its methods with the ``remote`` operator. We can then call ``get`` on the object ref to retrieve the actual value. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - # Call the actor. - obj_ref = counter.increment.remote() - assert ray.get(obj_ref) == 1 + .. testcode:: -.. tabbed:: Java + # Call the actor. + obj_ref = counter.increment.remote() + print(ray.get(obj_ref)) - .. code-block:: java + .. testoutput:: - // Call the actor. 
- ObjectRef objectRef = counter.task(&Counter::increment).remote(); - Assert.assertTrue(objectRef.get() == 1); + 1 -.. tabbed:: C++ + .. tab-item:: Java - .. code-block:: c++ + .. code-block:: java - // Call the actor. - auto object_ref = counter.Task(&Counter::increment).Remote(); - assert(*object_ref.Get() == 1); + // Call the actor. + ObjectRef objectRef = counter.task(&Counter::increment).remote(); + Assert.assertTrue(objectRef.get() == 1); + + .. tab-item:: C++ + + .. code-block:: c++ + + // Call the actor. + auto object_ref = counter.Task(&Counter::increment).Remote(); + assert(*object_ref.Get() == 1); Methods called on different actors can execute in parallel, and methods called on the same actor are executed serially in the order that they are called. Methods on the same actor will share state with one another, as shown below. -.. tabbed:: Python - - .. code-block:: python - - # Create ten Counter actors. - counters = [Counter.remote() for _ in range(10)] - - # Increment each Counter once and get the results. These tasks all happen in - # parallel. - results = ray.get([c.increment.remote() for c in counters]) - print(results) # prints [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - - # Increment the first Counter five times. These tasks are executed serially - # and share state. - results = ray.get([counters[0].increment.remote() for _ in range(5)]) - print(results) # prints [2, 3, 4, 5, 6] - -.. tabbed:: Java - - .. code-block:: java - - // Create ten Counter actors. - List> counters = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - counters.add(Ray.actor(Counter::new).remote()); - } - - // Increment each Counter once and get the results. These tasks all happen in - // parallel. - List> objectRefs = new ArrayList<>(); - for (ActorHandle counterActor : counters) { - objectRefs.add(counterActor.task(Counter::increment).remote()); - } - // prints [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - System.out.println(Ray.get(objectRefs)); - - // Increment the first Counter five times. 
These tasks are executed serially - // and share state. - objectRefs = new ArrayList<>(); - for (int i = 0; i < 5; i++) { - objectRefs.add(counters.get(0).task(Counter::increment).remote()); - } - // prints [2, 3, 4, 5, 6] - System.out.println(Ray.get(objectRefs)); - -.. tabbed:: C++ - - .. code-block:: c++ - - // Create ten Counter actors. - std::vector> counters; - for (int i = 0; i < 10; i++) { - counters.emplace_back(ray::Actor(CreateCounter).Remote()); - } - - // Increment each Counter once and get the results. These tasks all happen in - // parallel. - std::vector> object_refs; - for (ray::ActorHandle counter_actor : counters) { - object_refs.emplace_back(counter_actor.Task(&Counter::Increment).Remote()); - } - // prints 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - auto results = ray::Get(object_refs); - for (const auto &result : results) { - std::cout << *result; - } - - // Increment the first Counter five times. These tasks are executed serially - // and share state. - object_refs.clear(); - for (int i = 0; i < 5; i++) { - object_refs.emplace_back(counters[0].Task(&Counter::Increment).Remote()); - } - // prints 2, 3, 4, 5, 6 - results = ray::Get(object_refs); - for (const auto &result : results) { - std::cout << *result; - } +.. tab-set:: + + .. tab-item:: Python + + .. testcode:: + + # Create ten Counter actors. + counters = [Counter.remote() for _ in range(10)] + + # Increment each Counter once and get the results. These tasks all happen in + # parallel. + results = ray.get([c.increment.remote() for c in counters]) + print(results) + + # Increment the first Counter five times. These tasks are executed serially + # and share state. + results = ray.get([counters[0].increment.remote() for _ in range(5)]) + print(results) + + .. testoutput:: + + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + [2, 3, 4, 5, 6] + + .. tab-item:: Java + + .. code-block:: java + + // Create ten Counter actors. 
+ List> counters = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + counters.add(Ray.actor(Counter::new).remote()); + } + + // Increment each Counter once and get the results. These tasks all happen in + // parallel. + List> objectRefs = new ArrayList<>(); + for (ActorHandle counterActor : counters) { + objectRefs.add(counterActor.task(Counter::increment).remote()); + } + // prints [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + System.out.println(Ray.get(objectRefs)); + + // Increment the first Counter five times. These tasks are executed serially + // and share state. + objectRefs = new ArrayList<>(); + for (int i = 0; i < 5; i++) { + objectRefs.add(counters.get(0).task(Counter::increment).remote()); + } + // prints [2, 3, 4, 5, 6] + System.out.println(Ray.get(objectRefs)); + + .. tab-item:: C++ + + .. code-block:: c++ + + // Create ten Counter actors. + std::vector> counters; + for (int i = 0; i < 10; i++) { + counters.emplace_back(ray::Actor(CreateCounter).Remote()); + } + + // Increment each Counter once and get the results. These tasks all happen in + // parallel. + std::vector> object_refs; + for (ray::ActorHandle counter_actor : counters) { + object_refs.emplace_back(counter_actor.Task(&Counter::Increment).Remote()); + } + // prints 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + auto results = ray::Get(object_refs); + for (const auto &result : results) { + std::cout << *result; + } + + // Increment the first Counter five times. These tasks are executed serially + // and share state. + object_refs.clear(); + for (int i = 0; i < 5; i++) { + object_refs.emplace_back(counters[0].Task(&Counter::Increment).Remote()); + } + // prints 2, 3, 4, 5, 6 + results = ray::Get(object_refs); + for (const auto &result : results) { + std::cout << *result; + } Passing Around Actor Handles ---------------------------- Actor handles can be passed into other tasks. We can define remote functions (or actor methods) that use actor handles. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. 
tab-item:: Python - import time + .. testcode:: - @ray.remote - def f(counter): - for _ in range(1000): - time.sleep(0.1) - counter.increment.remote() + import time + + @ray.remote + def f(counter): + for _ in range(10): + time.sleep(0.1) + counter.increment.remote() -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - public static class MyRayApp { + public static class MyRayApp { - public static void foo(ActorHandle counter) throws InterruptedException { - for (int i = 0; i < 1000; i++) { - TimeUnit.MILLISECONDS.sleep(100); - counter.task(Counter::increment).remote(); + public static void foo(ActorHandle counter) throws InterruptedException { + for (int i = 0; i < 1000; i++) { + TimeUnit.MILLISECONDS.sleep(100); + counter.task(Counter::increment).remote(); + } + } } - } - } -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - void Foo(ray::ActorHandle counter) { - for (int i = 0; i < 1000; i++) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - counter.Task(&Counter::Increment).Remote(); + void Foo(ray::ActorHandle counter) { + for (int i = 0; i < 1000; i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + counter.Task(&Counter::Increment).Remote(); + } } - } If we instantiate an actor, we can pass the handle around to various tasks. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - counter = Counter.remote() + .. testcode:: - # Start some tasks that use the actor. - [f.remote(counter) for _ in range(3)] + counter = Counter.remote() - # Print the counter value. - for _ in range(10): - time.sleep(1) - print(ray.get(counter.get_counter.remote())) + # Start some tasks that use the actor. + [f.remote(counter) for _ in range(3)] -.. tabbed:: Java + # Print the counter value. + for _ in range(10): + time.sleep(0.1) + print(ray.get(counter.get_counter.remote())) - .. code-block:: java + .. 
testoutput:: + :options: +SKIP - ActorHandle counter = Ray.actor(Counter::new).remote(); + 0 + 3 + 8 + 10 + 15 + 18 + 20 + 25 + 30 + 30 - // Start some tasks that use the actor. - for (int i = 0; i < 3; i++) { - Ray.task(MyRayApp::foo, counter).remote(); - } + .. tab-item:: Java - // Print the counter value. - for (int i = 0; i < 10; i++) { - TimeUnit.SECONDS.sleep(1); - System.out.println(counter.task(Counter::getCounter).remote().get()); - } + .. code-block:: java -.. tabbed:: C++ + ActorHandle counter = Ray.actor(Counter::new).remote(); - .. code-block:: c++ + // Start some tasks that use the actor. + for (int i = 0; i < 3; i++) { + Ray.task(MyRayApp::foo, counter).remote(); + } - auto counter = ray::Actor(CreateCounter).Remote(); + // Print the counter value. + for (int i = 0; i < 10; i++) { + TimeUnit.SECONDS.sleep(1); + System.out.println(counter.task(Counter::getCounter).remote().get()); + } - // Start some tasks that use the actor. - for (int i = 0; i < 3; i++) { - ray::Task(Foo).Remote(counter); - } + .. tab-item:: C++ - // Print the counter value. - for (int i = 0; i < 10; i++) { - std::this_thread::sleep_for(std::chrono::seconds(1)); - std::cout << *counter.Task(&Counter::GetCounter).Remote().Get() << std::endl; - } + .. code-block:: c++ + + auto counter = ray::Actor(CreateCounter).Remote(); + + // Start some tasks that use the actor. + for (int i = 0; i < 3; i++) { + ray::Task(Foo).Remote(counter); + } + + // Print the counter value. + for (int i = 0; i < 10; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + std::cout << *counter.Task(&Counter::GetCounter).Remote().Get() << std::endl; + } Scheduling diff --git a/doc/source/ray-core/actors/actor-utils.rst b/doc/source/ray-core/actors/actor-utils.rst index 6e5b3da20d4d..ccdae25a5973 100644 --- a/doc/source/ray-core/actors/actor-utils.rst +++ b/doc/source/ray-core/actors/actor-utils.rst @@ -4,22 +4,24 @@ Utility Classes Actor Pool ~~~~~~~~~~ -.. tabbed:: Python +.. 
tab-set:: - The ``ray.util`` module contains a utility class, ``ActorPool``. - This class is similar to multiprocessing.Pool and lets you schedule Ray tasks over a fixed pool of actors. + .. tab-item:: Python - .. literalinclude:: ../doc_code/actor-pool.py + The ``ray.util`` module contains a utility class, ``ActorPool``. + This class is similar to multiprocessing.Pool and lets you schedule Ray tasks over a fixed pool of actors. - See the :class:`package reference ` for more information. + .. literalinclude:: ../doc_code/actor-pool.py -.. tabbed:: Java + See the :class:`package reference ` for more information. - Actor pool hasn't been implemented in Java yet. + .. tab-item:: Java -.. tabbed:: C++ + Actor pool hasn't been implemented in Java yet. - Actor pool hasn't been implemented in C++ yet. + .. tab-item:: C++ + + Actor pool hasn't been implemented in C++ yet. Message passing using Ray Queue ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/ray-core/actors/async_api.rst b/doc/source/ray-core/actors/async_api.rst index dba3fca9805c..94eb0d1cb79a 100644 --- a/doc/source/ray-core/actors/async_api.rst +++ b/doc/source/ray-core/actors/async_api.rst @@ -25,14 +25,10 @@ Since Python 3.5, it is possible to write concurrent code using the Ray natively integrates with asyncio. You can use ray alongside with popular async frameworks like aiohttp, aioredis, etc. -You can try it about by running the following snippet in ``ipython`` or a shell -that supports top level ``await``: - -.. code-block:: python +.. testcode:: import ray import asyncio - ray.init() @ray.remote class AsyncActor: @@ -49,8 +45,21 @@ that supports top level ``await``: ray.get([actor.run_concurrent.remote() for _ in range(4)]) # async ray.get - await actor.run_concurrent.remote() - + async def async_get(): + await actor.run_concurrent.remote() + asyncio.run(async_get()) + +.. 
testoutput:: + :options: +SKIP + + (AsyncActor pid=40293) started + (AsyncActor pid=40293) started + (AsyncActor pid=40293) started + (AsyncActor pid=40293) started + (AsyncActor pid=40293) finished + (AsyncActor pid=40293) finished + (AsyncActor pid=40293) finished + (AsyncActor pid=40293) finished ObjectRefs as asyncio.Futures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -60,7 +69,9 @@ applications. Instead of: -.. code-block:: python +.. testcode:: + + import ray @ray.remote def some_task(): @@ -71,23 +82,34 @@ Instead of: you can do: -.. code-block:: python +.. testcode:: + + import ray + import asyncio @ray.remote def some_task(): return 1 - await some_task.remote() - await asyncio.wait([some_task.remote()]) + async def await_obj_ref(): + await some_task.remote() + await asyncio.wait([some_task.remote()]) + + asyncio.run(await_obj_ref()) Please refer to `asyncio doc `__ for more `asyncio` patterns including timeouts and ``asyncio.gather``. If you need to directly access the future object, you can call: -.. code-block:: python +.. testcode:: - fut: asyncio.Future = asyncio.wrap_future(ref.future()) + import asyncio + + async def convert_to_asyncio_future(): + ref = some_task.remote() + fut: asyncio.Future = asyncio.wrap_future(ref.future()) + asyncio.run(convert_to_asyncio_future()) .. _async-ref-to-futures: @@ -96,21 +118,29 @@ ObjectRefs as concurrent.futures.Futures ObjectRefs can also be wrapped into ``concurrent.futures.Future`` objects. This is useful for interfacing with existing ``concurrent.futures`` APIs: -.. code-block:: python +.. testcode:: - refs = [fun.remote() for _ in range(4)] + import concurrent + + refs = [some_task.remote() for _ in range(4)] futs = [ref.future() for ref in refs] for fut in concurrent.futures.as_completed(futs): assert fut.done() print(fut.result()) +.. 
testoutput:: + + 1 + 1 + 1 + 1 Defining an Async Actor ~~~~~~~~~~~~~~~~~~~~~~~ By using `async` method definitions, Ray will automatically detect whether an actor support `async` calls or not. -.. code-block:: python +.. testcode:: import asyncio @@ -118,13 +148,27 @@ By using `async` method definitions, Ray will automatically detect whether an ac class AsyncActor: async def run_task(self): print("started") - await asyncio.sleep(1) # Network, I/O task here + await asyncio.sleep(2) # Network, I/O task here print("ended") actor = AsyncActor.remote() - # All 50 tasks should start at once. After 1 second they should all finish. + # All 5 tasks should start at once. After 2 seconds they should all finish. # they should finish at the same time - ray.get([actor.run_task.remote() for _ in range(50)]) + ray.get([actor.run_task.remote() for _ in range(5)]) + +.. testoutput:: + :options: +SKIP + + (AsyncActor pid=3456) started + (AsyncActor pid=3456) started + (AsyncActor pid=3456) started + (AsyncActor pid=3456) started + (AsyncActor pid=3456) started + (AsyncActor pid=3456) ended + (AsyncActor pid=3456) ended + (AsyncActor pid=3456) ended + (AsyncActor pid=3456) ended + (AsyncActor pid=3456) ended Under the hood, Ray runs all of the methods inside a single python event loop. Please note that running blocking ``ray.get`` or ``ray.wait`` inside async @@ -139,7 +183,7 @@ Setting concurrency in Async Actors You can set the number of "concurrent" task running at once using the ``max_concurrency`` flag. By default, 1000 tasks can be running concurrently. -.. code-block:: python +.. testcode:: import asyncio @@ -150,10 +194,30 @@ You can set the number of "concurrent" task running at once using the await asyncio.sleep(1) # Network, I/O task here print("ended") - actor = AsyncActor.options(max_concurrency=10).remote() - - # Only 10 tasks will be running concurrently. Once 10 finish, the next 10 should run. 
- ray.get([actor.run_task.remote() for _ in range(50)]) + actor = AsyncActor.options(max_concurrency=2).remote() + + # Only 2 tasks will be running concurrently. Once 2 finish, the next 2 should run. + ray.get([actor.run_task.remote() for _ in range(8)]) + +.. testoutput:: + :options: +SKIP + + (AsyncActor pid=5859) started + (AsyncActor pid=5859) started + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) started + (AsyncActor pid=5859) started + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) started + (AsyncActor pid=5859) started + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) started + (AsyncActor pid=5859) started + (AsyncActor pid=5859) ended + (AsyncActor pid=5859) ended .. _threaded-actors: @@ -172,7 +236,7 @@ Instead, you can use the ``max_concurrency`` Actor options without any async met will recognize the actor as AsyncActor instead of ThreadedActor. -.. code-block:: python +.. testcode:: @ray.remote class ThreadedActor: @@ -182,6 +246,11 @@ Instead, you can use the ``max_concurrency`` Actor options without any async met a = ThreadedActor.options(max_concurrency=2).remote() ray.get([a.task_1.remote(), a.task_2.remote()]) +.. testoutput:: + :options: +SKIP + + (ThreadedActor pid=4822) I'm running in a thread! + (ThreadedActor pid=4822) I'm running in another thread! Each invocation of the threaded actor will be running in a thread pool. The size of the threadpool is limited by the ``max_concurrency`` value. @@ -190,7 +259,8 @@ AsyncIO for Remote Tasks We don't support asyncio for remote tasks. The following snippet will fail: -.. code-block:: python +.. testcode:: + :skipif: True @ray.remote async def f(): @@ -198,7 +268,7 @@ We don't support asyncio for remote tasks. The following snippet will fail: Instead, you can wrap the ``async`` function with a wrapper to run the task synchronously: -.. code-block:: python +.. 
testcode:: async def f(): pass @@ -207,7 +277,3 @@ Instead, you can wrap the ``async`` function with a wrapper to run the task sync def wrapper(): import asyncio asyncio.run(f()) - # For python < 3.7: - # asyncio.get_event_loop().run_until_complete(f()) - - diff --git a/doc/source/ray-core/actors/concurrency_group_api.rst b/doc/source/ray-core/actors/concurrency_group_api.rst index f27e4725fd16..9945f530e2de 100644 --- a/doc/source/ray-core/actors/concurrency_group_api.rst +++ b/doc/source/ray-core/actors/concurrency_group_api.rst @@ -17,95 +17,97 @@ into the "compute" group. Note that there is always a default concurrency group, which has a default concurrency of 1000 in Python and 1 in Java. -.. tabbed:: Python +.. tab-set:: - You can define concurrency groups for asyncio actors using the ``concurrency_group`` decorator argument: + .. tab-item:: Python - .. code-block:: python + You can define concurrency groups for asyncio actors using the ``concurrency_group`` decorator argument: - @ray.remote(concurrency_groups={"io": 2, "compute": 4}) - class AsyncIOActor: - def __init__(self): - pass + .. code-block:: python - @ray.method(concurrency_group="io") - async def f1(self): - pass + @ray.remote(concurrency_groups={"io": 2, "compute": 4}) + class AsyncIOActor: + def __init__(self): + pass - @ray.method(concurrency_group="io") - async def f2(self): - pass + @ray.method(concurrency_group="io") + async def f1(self): + pass - @ray.method(concurrency_group="compute") - async def f3(self): - pass + @ray.method(concurrency_group="io") + async def f2(self): + pass - @ray.method(concurrency_group="compute") - async def f4(self): - pass + @ray.method(concurrency_group="compute") + async def f3(self): + pass - async def f5(self): - pass + @ray.method(concurrency_group="compute") + async def f4(self): + pass - a = AsyncIOActor.remote() - a.f1.remote() # executed in the "io" group. - a.f2.remote() # executed in the "io" group. - a.f3.remote() # executed in the "compute" group. 
- a.f4.remote() # executed in the "compute" group. - a.f5.remote() # executed in the default group. + async def f5(self): + pass -.. tabbed:: Java + a = AsyncIOActor.remote() + a.f1.remote() # executed in the "io" group. + a.f2.remote() # executed in the "io" group. + a.f3.remote() # executed in the "compute" group. + a.f4.remote() # executed in the "compute" group. + a.f5.remote() # executed in the default group. - You can define concurrency groups for concurrent actors using the API ``setConcurrencyGroups()`` argument: + .. tab-item:: Java - .. code-block:: java + You can define concurrency groups for concurrent actors using the API ``setConcurrencyGroups()`` argument: - class ConcurrentActor { - public long f1() { - return Thread.currentThread().getId(); - } + .. code-block:: java - public long f2() { - return Thread.currentThread().getId(); - } + class ConcurrentActor { + public long f1() { + return Thread.currentThread().getId(); + } - public long f3(int a, int b) { - return Thread.currentThread().getId(); - } + public long f2() { + return Thread.currentThread().getId(); + } - public long f4() { - return Thread.currentThread().getId(); - } + public long f3(int a, int b) { + return Thread.currentThread().getId(); + } + + public long f4() { + return Thread.currentThread().getId(); + } - public long f5() { - return Thread.currentThread().getId(); + public long f5() { + return Thread.currentThread().getId(); + } } - } - - ConcurrencyGroup group1 = - new ConcurrencyGroupBuilder() - .setName("io") - .setMaxConcurrency(1) - .addMethod(ConcurrentActor::f1) - .addMethod(ConcurrentActor::f2) - .build(); - ConcurrencyGroup group2 = - new ConcurrencyGroupBuilder() - .setName("compute") - .setMaxConcurrency(1) - .addMethod(ConcurrentActor::f3) - .addMethod(ConcurrentActor::f4) - .build(); - - ActorHandle myActor = Ray.actor(ConcurrentActor::new) - .setConcurrencyGroups(group1, group2) - .remote(); - - myActor.task(ConcurrentActor::f1).remote(); // executed in the "io" 
group. - myActor.task(ConcurrentActor::f2).remote(); // executed in the "io" group. - myActor.task(ConcurrentActor::f3, 3, 5).remote(); // executed in the "compute" group. - myActor.task(ConcurrentActor::f4).remote(); // executed in the "compute" group. - myActor.task(ConcurrentActor::f5).remote(); // executed in the "default" group. + + ConcurrencyGroup group1 = + new ConcurrencyGroupBuilder() + .setName("io") + .setMaxConcurrency(1) + .addMethod(ConcurrentActor::f1) + .addMethod(ConcurrentActor::f2) + .build(); + ConcurrencyGroup group2 = + new ConcurrencyGroupBuilder() + .setName("compute") + .setMaxConcurrency(1) + .addMethod(ConcurrentActor::f3) + .addMethod(ConcurrentActor::f4) + .build(); + + ActorHandle myActor = Ray.actor(ConcurrentActor::new) + .setConcurrencyGroups(group1, group2) + .remote(); + + myActor.task(ConcurrentActor::f1).remote(); // executed in the "io" group. + myActor.task(ConcurrentActor::f2).remote(); // executed in the "io" group. + myActor.task(ConcurrentActor::f3, 3, 5).remote(); // executed in the "compute" group. + myActor.task(ConcurrentActor::f4).remote(); // executed in the "compute" group. + myActor.task(ConcurrentActor::f5).remote(); // executed in the "default" group. .. _default-concurrency-group: @@ -116,43 +118,45 @@ Default Concurrency Group By default, methods are placed in a default concurrency group which has a concurrency limit of 1000 in Python, 1 in Java. The concurrency of the default group can be changed by setting the ``max_concurrency`` actor option. -.. tabbed:: Python +.. tab-set:: - The following AsyncIOActor has 2 concurrency groups: "io" and "default". - The max concurrency of "io" is 2, and the max concurrency of "default" is 10. + .. tab-item:: Python - .. code-block:: python + The following AsyncIOActor has 2 concurrency groups: "io" and "default". + The max concurrency of "io" is 2, and the max concurrency of "default" is 10. 
- @ray.remote(concurrency_groups={"io": 2}) - class AsyncIOActor: - async def f1(self): - pass + .. code-block:: python - actor = AsyncIOActor.options(max_concurrency=10).remote() + @ray.remote(concurrency_groups={"io": 2}) + class AsyncIOActor: + async def f1(self): + pass -.. tabbed:: Java + actor = AsyncIOActor.options(max_concurrency=10).remote() - The following concurrent actor has 2 concurrency groups: "io" and "default". - The max concurrency of "io" is 2, and the max concurrency of "default" is 10. + .. tab-item:: Java - .. code-block:: java + The following concurrent actor has 2 concurrency groups: "io" and "default". + The max concurrency of "io" is 2, and the max concurrency of "default" is 10. - class ConcurrentActor: - public long f1() { - return Thread.currentThread().getId(); - } + .. code-block:: java + + class ConcurrentActor: + public long f1() { + return Thread.currentThread().getId(); + } - ConcurrencyGroup group = - new ConcurrencyGroupBuilder() - .setName("io") - .setMaxConcurrency(2) - .addMethod(ConcurrentActor::f1) - .build(); + ConcurrencyGroup group = + new ConcurrencyGroupBuilder() + .setName("io") + .setMaxConcurrency(2) + .addMethod(ConcurrentActor::f1) + .build(); - ActorHandle myActor = Ray.actor(ConcurrentActor::new) - .setConcurrencyGroups(group1) - .setMaxConcurrency(10) - .remote(); + ActorHandle myActor = Ray.actor(ConcurrentActor::new) + .setConcurrencyGroups(group1) + .setMaxConcurrency(10) + .remote(); .. _setting-the-concurrency-group-at-runtime: @@ -165,26 +169,28 @@ You can also dispatch actor methods into a specific concurrency group at runtime The following snippet demonstrates setting the concurrency group of the ``f2`` method dynamically at runtime. -.. tabbed:: Python - - You can use the ``.options`` method. +.. tab-set:: + + .. tab-item:: Python + + You can use the ``.options`` method. - .. code-block:: python + .. code-block:: python - # Executed in the "io" group (as defined in the actor class). 
- a.f2.options().remote() + # Executed in the "io" group (as defined in the actor class). + a.f2.options().remote() - # Executed in the "compute" group. - a.f2.options(concurrency_group="compute").remote() + # Executed in the "compute" group. + a.f2.options(concurrency_group="compute").remote() -.. tabbed:: Java + .. tab-item:: Java - You can use ``setConcurrencyGroup`` method. + You can use ``setConcurrencyGroup`` method. - .. code-block:: java + .. code-block:: java - // Executed in the "io" group (as defined in the actor creation). - myActor.task(ConcurrentActor::f2).remote(); + // Executed in the "io" group (as defined in the actor creation). + myActor.task(ConcurrentActor::f2).remote(); - // Executed in the "compute" group. - myActor.task(ConcurrentActor::f2).setConcurrencyGroup("compute").remote(); + // Executed in the "compute" group. + myActor.task(ConcurrentActor::f2).setConcurrencyGroup("compute").remote(); diff --git a/doc/source/ray-core/actors/named-actors.rst b/doc/source/ray-core/actors/named-actors.rst index 79b7367d24ac..51a624bca1b3 100644 --- a/doc/source/ray-core/actors/named-actors.rst +++ b/doc/source/ray-core/actors/named-actors.rst @@ -9,118 +9,122 @@ access an actor launched by another driver. Note that the actor will still be garbage-collected if no handles to it exist. See :ref:`actor-lifetimes` for more details. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - # Create an actor with a name - counter = Counter.options(name="some_name").remote() + .. code-block:: python - ... + # Create an actor with a name + counter = Counter.options(name="some_name").remote() - # Retrieve the actor later somewhere - counter = ray.get_actor("some_name") + ... -.. tabbed:: Java + # Retrieve the actor later somewhere + counter = ray.get_actor("some_name") - .. code-block:: java + .. tab-item:: Java - // Create an actor with a name. - ActorHandle counter = Ray.actor(Counter::new).setName("some_name").remote(); + .. 
code-block:: java - ... + // Create an actor with a name. + ActorHandle counter = Ray.actor(Counter::new).setName("some_name").remote(); - // Retrieve the actor later somewhere - Optional> counter = Ray.getActor("some_name"); - Assert.assertTrue(counter.isPresent()); + ... -.. tabbed:: C++ + // Retrieve the actor later somewhere + Optional> counter = Ray.getActor("some_name"); + Assert.assertTrue(counter.isPresent()); - .. code-block:: c++ + .. tab-item:: C++ - // Create an actor with a globally unique name - ActorHandle counter = ray::Actor(CreateCounter).SetGlobalName("some_name").Remote(); + .. code-block:: c++ - ... + // Create an actor with a globally unique name + ActorHandle counter = ray::Actor(CreateCounter).SetGlobalName("some_name").Remote(); - // Retrieve the actor later somewhere - boost::optional> counter = ray::GetGlobalActor("some_name"); + ... - We also support non-global named actors in C++, which means that the actor name is only valid within the job and the actor cannot be accessed from another job + // Retrieve the actor later somewhere + boost::optional> counter = ray::GetGlobalActor("some_name"); - .. code-block:: c++ + We also support non-global named actors in C++, which means that the actor name is only valid within the job and the actor cannot be accessed from another job - // Create an actor with a job-scope-unique name - ActorHandle counter = ray::Actor(CreateCounter).SetName("some_name").Remote(); + .. code-block:: c++ - ... + // Create an actor with a job-scope-unique name + ActorHandle counter = ray::Actor(CreateCounter).SetName("some_name").Remote(); - // Retrieve the actor later somewhere in the same job - boost::optional> counter = ray::GetActor("some_name"); + ... + + // Retrieve the actor later somewhere in the same job + boost::optional> counter = ray::GetActor("some_name"); .. note:: Named actors are scoped by namespace. If no namespace is assigned, they will be placed in an anonymous namespace by default. -.. 
tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. code-block:: python + .. code-block:: python - import ray + import ray - @ray.remote - class Actor: - pass + @ray.remote + class Actor: + pass - # driver_1.py - # Job 1 creates an actor, "orange" in the "colors" namespace. - ray.init(address="auto", namespace="colors") - Actor.options(name="orange", lifetime="detached").remote() + # driver_1.py + # Job 1 creates an actor, "orange" in the "colors" namespace. + ray.init(address="auto", namespace="colors") + Actor.options(name="orange", lifetime="detached").remote() - # driver_2.py - # Job 2 is now connecting to a different namespace. - ray.init(address="auto", namespace="fruit") - # This fails because "orange" was defined in the "colors" namespace. - ray.get_actor("orange") - # You can also specify the namespace explicitly. - ray.get_actor("orange", namespace="colors") + # driver_2.py + # Job 2 is now connecting to a different namespace. + ray.init(address="auto", namespace="fruit") + # This fails because "orange" was defined in the "colors" namespace. + ray.get_actor("orange") + # You can also specify the namespace explicitly. + ray.get_actor("orange", namespace="colors") - # driver_3.py - # Job 3 connects to the original "colors" namespace - ray.init(address="auto", namespace="colors") - # This returns the "orange" actor we created in the first job. - ray.get_actor("orange") + # driver_3.py + # Job 3 connects to the original "colors" namespace + ray.init(address="auto", namespace="colors") + # This returns the "orange" actor we created in the first job. + ray.get_actor("orange") -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - import ray + import ray - class Actor { - } + class Actor { + } - // Driver1.java - // Job 1 creates an actor, "orange" in the "colors" namespace. 
- System.setProperty("ray.job.namespace", "colors"); - Ray.init(); - Ray.actor(Actor::new).setName("orange").remote(); + // Driver1.java + // Job 1 creates an actor, "orange" in the "colors" namespace. + System.setProperty("ray.job.namespace", "colors"); + Ray.init(); + Ray.actor(Actor::new).setName("orange").remote(); - // Driver2.java - // Job 2 is now connecting to a different namespace. - System.setProperty("ray.job.namespace", "fruits"); - Ray.init(); - // This fails because "orange" was defined in the "colors" namespace. - Optional> actor = Ray.getActor("orange"); - Assert.assertFalse(actor.isPresent()); // actor.isPresent() is false. + // Driver2.java + // Job 2 is now connecting to a different namespace. + System.setProperty("ray.job.namespace", "fruits"); + Ray.init(); + // This fails because "orange" was defined in the "colors" namespace. + Optional> actor = Ray.getActor("orange"); + Assert.assertFalse(actor.isPresent()); // actor.isPresent() is false. - // Driver3.java - System.setProperty("ray.job.namespace", "colors"); - Ray.init(); - // This returns the "orange" actor we created in the first job. - Optional> actor = Ray.getActor("orange"); - Assert.assertTrue(actor.isPresent()); // actor.isPresent() is true. + // Driver3.java + System.setProperty("ray.job.namespace", "colors"); + Ray.init(); + // This returns the "orange" actor we created in the first job. + Optional> actor = Ray.getActor("orange"); + Assert.assertTrue(actor.isPresent()); // actor.isPresent() is true. Get-Or-Create a Named Actor --------------------------- @@ -133,21 +137,23 @@ If the actor already exists, a handle to the actor will be returned and the arguments will be ignored. Otherwise, a new actor will be created with the specified arguments. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/get_or_create.py + .. literalinclude:: ../doc_code/get_or_create.py -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. 
code-block:: java - // This feature is not yet available in Java. + // This feature is not yet available in Java. -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - // This feature is not yet available in C++. + // This feature is not yet available in C++. .. _actor-lifetimes: @@ -157,47 +163,49 @@ Actor Lifetimes Separately, actor lifetimes can be decoupled from the job, allowing an actor to persist even after the driver process of the job exits. We call these actors *detached*. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python + + .. code-block:: python - .. code-block:: python + counter = Counter.options(name="CounterActor", lifetime="detached").remote() - counter = Counter.options(name="CounterActor", lifetime="detached").remote() + The ``CounterActor`` will be kept alive even after the driver running above script + exits. Therefore it is possible to run the following script in a different + driver: - The ``CounterActor`` will be kept alive even after the driver running above script - exits. Therefore it is possible to run the following script in a different - driver: + .. code-block:: python - .. code-block:: python + counter = ray.get_actor("CounterActor") + print(ray.get(counter.get_counter.remote())) - counter = ray.get_actor("CounterActor") - print(ray.get(counter.get_counter.remote())) + Note that an actor can be named but not detached. If we only specified the + name without specifying ``lifetime="detached"``, then the CounterActor can + only be retrieved as long as the original driver is still running. - Note that an actor can be named but not detached. If we only specified the - name without specifying ``lifetime="detached"``, then the CounterActor can - only be retrieved as long as the original driver is still running. + .. tab-item:: Java -.. tabbed:: Java + .. code-block:: java - .. 
code-block:: java + System.setProperty("ray.job.namespace", "lifetime"); + Ray.init(); + ActorHandle counter = Ray.actor(Counter::new).setName("some_name").setLifetime(ActorLifetime.DETACHED).remote(); - System.setProperty("ray.job.namespace", "lifetime"); - Ray.init(); - ActorHandle counter = Ray.actor(Counter::new).setName("some_name").setLifetime(ActorLifetime.DETACHED).remote(); - - The CounterActor will be kept alive even after the driver running above process - exits. Therefore it is possible to run the following code in a different - driver: + The CounterActor will be kept alive even after the driver running above process + exits. Therefore it is possible to run the following code in a different + driver: - .. code-block:: java + .. code-block:: java - System.setProperty("ray.job.namespace", "lifetime"); - Ray.init(); - Optional> counter = Ray.getActor("some_name"); - Assert.assertTrue(counter.isPresent()); + System.setProperty("ray.job.namespace", "lifetime"); + Ray.init(); + Optional> counter = Ray.getActor("some_name"); + Assert.assertTrue(counter.isPresent()); -.. tabbed:: C++ + .. tab-item:: C++ - Customizing lifetime of an actor hasn't been implemented in C++ yet. + Customizing lifetime of an actor hasn't been implemented in C++ yet. Unlike normal actors, detached actors are not automatically garbage-collected by Ray. diff --git a/doc/source/ray-core/actors/out-of-band-communication.rst b/doc/source/ray-core/actors/out-of-band-communication.rst index 7d9eff73c99f..063b9a26f69d 100644 --- a/doc/source/ray-core/actors/out-of-band-communication.rst +++ b/doc/source/ray-core/actors/out-of-band-communication.rst @@ -22,9 +22,11 @@ HTTP Server You can start a http server inside the actor and expose http endpoints to clients so users outside of the ray cluster can communicate with the actor. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ../doc_code/actor-http-server.py + .. tab-item:: Python + + .. 
literalinclude:: ../doc_code/actor-http-server.py Similarly, you can expose other types of servers as well (e.g., gRPC servers). diff --git a/doc/source/ray-core/actors/task-orders.rst b/doc/source/ray-core/actors/task-orders.rst index 0131abdd66fe..bfd665a519b4 100644 --- a/doc/source/ray-core/actors/task-orders.rst +++ b/doc/source/ray-core/actors/task-orders.rst @@ -11,77 +11,81 @@ them following the submission order. In other words, a given task will not be executed until previously submitted tasks from the same submitter have finished execution. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - import ray + .. code-block:: python - @ray.remote - class Counter: - def __init__(self): - self.value = 0 + import ray - def add(self, addition): - self.value += addition - return self.value + @ray.remote + class Counter: + def __init__(self): + self.value = 0 - counter = Counter.remote() + def add(self, addition): + self.value += addition + return self.value - # For tasks from the same submitter, - # they are executed according to submission order. - value0 = counter.add.remote(1) - value1 = counter.add.remote(2) + counter = Counter.remote() - # Output: 1. The first submitted task is executed first. - print(ray.get(value0)) - # Output: 3. The later submitted task is executed later. - print(ray.get(value1)) + # For tasks from the same submitter, + # they are executed according to submission order. + value0 = counter.add.remote(1) + value1 = counter.add.remote(2) + + # Output: 1. The first submitted task is executed first. + print(ray.get(value0)) + # Output: 3. The later submitted task is executed later. + print(ray.get(value1)) However, the actor does not guarantee the execution order of the tasks from different submitters. For example, suppose an unfulfilled argument blocks a previously submitted task. In this case, the actor can still execute tasks submitted by a different worker. -.. tabbed:: Python +.. tab-set:: + + .. 
tab-item:: Python - .. code-block:: python + .. code-block:: python - import time - import ray + import time + import ray - @ray.remote - class Counter: - def __init__(self): - self.value = 0 + @ray.remote + class Counter: + def __init__(self): + self.value = 0 - def add(self, addition): - self.value += addition - return self.value + def add(self, addition): + self.value += addition + return self.value - counter = Counter.remote() + counter = Counter.remote() - # Submit task from a worker - @ray.remote - def submitter(value): - return ray.get(counter.add.remote(value)) + # Submit task from a worker + @ray.remote + def submitter(value): + return ray.get(counter.add.remote(value)) - # Simulate delayed result resolution. - @ray.remote - def delayed_resolution(value): - time.sleep(5) - return value + # Simulate delayed result resolution. + @ray.remote + def delayed_resolution(value): + time.sleep(5) + return value - # Submit tasks from different workers, with - # the first submitted task waiting for - # dependency resolution. - value0 = submitter.remote(delayed_resolution.remote(1)) - value1 = submitter.remote(2) + # Submit tasks from different workers, with + # the first submitted task waiting for + # dependency resolution. + value0 = submitter.remote(delayed_resolution.remote(1)) + value1 = submitter.remote(2) - # Output: 3. The first submitted task is executed later. - print(ray.get(value0)) - # Output: 2. The later submitted task is executed first. - print(ray.get(value1)) + # Output: 3. The first submitted task is executed later. + print(ray.get(value0)) + # Output: 2. The later submitted task is executed first. + print(ray.get(value1)) Asynchronous or Threaded Actor @@ -90,37 +94,39 @@ Asynchronous or Threaded Actor task execution order. This means the system might execute a task even though previously submitted tasks are pending execution. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. code-block:: python + .. 
code-block:: python - import time - import ray + import time + import ray - @ray.remote - class AsyncCounter: - def __init__(self): - self.value = 0 + @ray.remote + class AsyncCounter: + def __init__(self): + self.value = 0 - async def add(self, addition): - self.value += addition - return self.value + async def add(self, addition): + self.value += addition + return self.value - counter = AsyncCounter.remote() + counter = AsyncCounter.remote() - # Simulate delayed result resolution. - @ray.remote - def delayed_resolution(value): - time.sleep(5) - return value + # Simulate delayed result resolution. + @ray.remote + def delayed_resolution(value): + time.sleep(5) + return value - # Submit tasks from the driver, with - # the first submitted task waiting for - # dependency resolution. - value0 = counter.add.remote(delayed_resolution.remote(1)) - value1 = counter.add.remote(2) + # Submit tasks from the driver, with + # the first submitted task waiting for + # dependency resolution. + value0 = counter.add.remote(delayed_resolution.remote(1)) + value1 = counter.add.remote(2) - # Output: 3. The first submitted task is executed later. - print(ray.get(value0)) - # Output: 2. The later submitted task is executed first. - print(ray.get(value1)) + # Output: 3. The first submitted task is executed later. + print(ray.get(value0)) + # Output: 2. The later submitted task is executed first. + print(ray.get(value1)) diff --git a/doc/source/ray-core/actors/terminating-actors.rst b/doc/source/ray-core/actors/terminating-actors.rst index d71d8f3312bf..11ba78462f2e 100644 --- a/doc/source/ray-core/actors/terminating-actors.rst +++ b/doc/source/ray-core/actors/terminating-actors.rst @@ -18,33 +18,35 @@ be reserved for cases where an actor is unexpectedly hanging or leaking resources, and for :ref:`detached actors `, which must be manually destroyed. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. 
tab-item:: Python - ray.kill(actor_handle) - # This will not go through the normal Python sys.exit - # teardown logic, so any exit handlers installed in - # the actor using ``atexit`` will not be called. + .. code-block:: python + ray.kill(actor_handle) + # This will not go through the normal Python sys.exit + # teardown logic, so any exit handlers installed in + # the actor using ``atexit`` will not be called. -.. tabbed:: Java - .. code-block:: java + .. tab-item:: Java - actorHandle.kill(); - // This will not go through the normal Java System.exit teardown logic, so any - // shutdown hooks installed in the actor using ``Runtime.addShutdownHook(...)`` will - // not be called. + .. code-block:: java -.. tabbed:: C++ + actorHandle.kill(); + // This will not go through the normal Java System.exit teardown logic, so any + // shutdown hooks installed in the actor using ``Runtime.addShutdownHook(...)`` will + // not be called. - .. code-block:: c++ + .. tab-item:: C++ - actor_handle.Kill(); - // This will not go through the normal C++ std::exit - // teardown logic, so any exit handlers installed in - // the actor using ``std::atexit`` will not be called. + .. code-block:: c++ + + actor_handle.Kill(); + // This will not go through the normal C++ std::exit + // teardown logic, so any exit handlers installed in + // the actor using ``std::atexit`` will not be called. This will cause the actor to immediately exit its process, causing any current, @@ -63,37 +65,39 @@ Manual termination within the actor If necessary, you can manually terminate an actor from within one of the actor methods. This will kill the actor process and release resources associated/assigned to the actor. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. code-block:: python + .. code-block:: python - ray.actor.exit_actor() + ray.actor.exit_actor() - This approach should generally not be necessary as actors are automatically garbage - collected. 
The ``ObjectRef`` resulting from the task can be waited on to wait - for the actor to exit (calling ``ray.get()`` on it will raise a ``RayActorError``). + This approach should generally not be necessary as actors are automatically garbage + collected. The ``ObjectRef`` resulting from the task can be waited on to wait + for the actor to exit (calling ``ray.get()`` on it will raise a ``RayActorError``). -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - Ray.exitActor(); + Ray.exitActor(); - Garbage collection for actors haven't been implemented yet, so this is currently the - only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task - can be waited on to wait for the actor to exit (calling ``ObjectRef::get`` on it will - throw a ``RayActorException``). + Garbage collection for actors haven't been implemented yet, so this is currently the + only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task + can be waited on to wait for the actor to exit (calling ``ObjectRef::get`` on it will + throw a ``RayActorException``). -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - ray::ExitActor(); + ray::ExitActor(); - Garbage collection for actors haven't been implemented yet, so this is currently the - only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task - can be waited on to wait for the actor to exit (calling ``ObjectRef::Get`` on it will - throw a ``RayActorException``). + Garbage collection for actors haven't been implemented yet, so this is currently the + only way to terminate an actor gracefully. The ``ObjectRef`` resulting from the task + can be waited on to wait for the actor to exit (calling ``ObjectRef::Get`` on it will + throw a ``RayActorException``). Note that this method of termination will wait until any previously submitted tasks finish executing and then exit the process gracefully with sys.exit. 
diff --git a/doc/source/ray-core/api/core.rst b/doc/source/ray-core/api/core.rst index c78e9afc418c..5bd7776d6dd6 100644 --- a/doc/source/ray-core/api/core.rst +++ b/doc/source/ray-core/api/core.rst @@ -7,6 +7,7 @@ Core API ray.init ray.shutdown ray.is_initialized + ray.job_config.JobConfig Tasks ----- diff --git a/doc/source/ray-core/configure.rst b/doc/source/ray-core/configure.rst index 6a2c5c78272d..5707b75013e2 100644 --- a/doc/source/ray-core/configure.rst +++ b/doc/source/ray-core/configure.rst @@ -19,18 +19,38 @@ Cluster Resources Ray by default detects available resources. -.. code-block:: python +.. testcode:: + :hide: + + import ray + ray.shutdown() + +.. testcode:: + + import ray # This automatically detects available resources in the single machine. ray.init() If not running cluster mode, you can specify cluster resources overrides through ``ray.init`` as follows. -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: # If not connecting to an existing cluster, you can specify resources overrides: ray.init(num_cpus=8, num_gpus=1) +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: + # Specifying custom resources ray.init(num_gpus=1, resources={'Resource1': 4, 'Resource2': 16}) @@ -49,7 +69,8 @@ When starting Ray from the command line, pass the ``--num-cpus`` and ``--num-gpu If using the command line, connect to the Ray cluster as follow: -.. code-block:: python +.. testcode:: + :skipif: True # Connect to ray. Notice if connected to existing cluster, you don't specify resources. ray.init(address=
    ) @@ -96,7 +117,7 @@ Look :ref:`Logging Directory Structure ` for more d Ports configurations -------------------- -Ray requires bi-directional communication among its nodes in a cluster. Each of node is supposed to open specific ports to receive incoming network requests. +Ray requires bi-directional communication among its nodes in a cluster. Each node opens specific ports to receive incoming network requests. All Nodes ~~~~~~~~~ @@ -127,6 +148,7 @@ In addition to ports specified above, the head node needs to open several more p - ``--port``: Port of Ray (GCS server). The head node will start a GCS server listening on this port. Default: 6379. - ``--ray-client-server-port``: Listening port for Ray Client Server. Default: 10001. - ``--redis-shard-ports``: Comma-separated list of ports for non-primary Redis shards. Default: Random values. +- ``--dashboard-grpc-port``: The gRPC port used by the dashboard. Default: Random value. - If ``--include-dashboard`` is true (the default), then the head node must open ``--dashboard-port``. Default: 8265. @@ -172,23 +194,99 @@ TLS Authentication ------------------ Ray can be configured to use TLS on it's gRPC channels. -This means that connecting to the Ray client on the head node will -require an appropriate set of credentials and also that data exchanged between -various processes (client, head, workers) will be encrypted. +This means that connecting to the Ray head requires +an appropriate set of credentials and also that data exchanged between +various processes (client, head, workers) is encrypted. -Enabling TLS will cause a performance hit due to the extra overhead of mutual -authentication and encryption. -Testing has shown that this overhead is large for small workloads and becomes -relatively smaller for large workloads. -The exact overhead will depend on the nature of your workload. +In TLS, the private key and public key are used for encryption and decryption. 
The +former is kept secret by the owner and the latter is shared with the other party. +This pattern ensures that only the intended recipient can read the message. + +A Certificate Authority (CA) is a trusted third party that certifies the identity of the +public key owner. The digital certificate issued by the CA contains the public key itself, +the identity of the public key owner, and the expiration date of the certificate. Note that +if the owner of the public key does not want to obtain a digital certificate from a CA, +they can generate a self-signed certificate with tools like OpenSSL. + +To obtain a digital certificate, the owner of the public key must generate a Certificate Signing +Request (CSR). The CSR contains information about the owner of the public +key and the public key itself. For Ray, some additional steps are required to achieve +successful TLS encryption. + +Here is a step-by-step guide for adding TLS Authentication to a static Kubernetes Ray cluster using +self-signed certificates: + +Step 1: Generate a private key and self-signed certificate for CA +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + openssl req -x509 \ + -sha256 -days 3650 \ + -nodes \ + -newkey rsa:2048 \ + -subj "/CN=*.ray.io/C=US/L=San Francisco" \ + -keyout ca.key -out ca.crt + +Use the following command to encode the private key file and the self-signed certificate file, +then paste the encoded strings into secret.yaml. + +.. code-block:: bash + + cat ca.key | base64 + cat ca.crt | base64 + +# Alternatively, the command automatically encodes and creates the secret for the CA keypair. 
+kubectl create secret generic ca-tls --from-file=ca.crt= --from-file=ca.key= + +Step 2: Generate individual private keys and self-signed certificates for the Ray head and workers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `YAML file +`__, has a ConfigMap named `tls` that +includes two shell scripts: `gencert_head.sh` and `gencert_worker.sh`. These scripts produce the private key +and self-signed certificate files (`tls.key` and `tls.crt`) for both head and worker Pods in the initContainer +of each deployment. By using the initContainer, we can dynamically retrieve the `POD_IP` to the `[alt_names]` section. + +The scripts perform the following steps: first, a 2048-bit RSA private key is generated and saved as +`/etc/ray/tls/tls.key`. Then, a Certificate Signing Request (CSR) is generated using the `tls.key` file +and the `csr.conf` configuration file. Finally, a self-signed certificate (`tls.crt`) is created using +the Certificate Authority's (`ca.key and ca.crt`) keypair and the CSR (`ca.csr`). + +Step 3: Set the environment variables for both Ray head and worker to enable TLS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TLS is enabled by setting environment variables. - ``RAY_USE_TLS``: Either 1 or 0 to use/not-use TLS. If this is set to 1 then all of the environment variables below must be set. Default: 0. -- ``RAY_TLS_SERVER_CERT``: Location of a `certificate file` which is presented to other endpoints so as to achieve mutual authentication. -- ``RAY_TLS_SERVER_KEY``: Location of a `private key file` which is the cryptographic means to prove to other endpoints that you are the authorized user of a given certificate. -- ``RAY_TLS_CA_CERT``: Location of a `CA certificate file` which allows TLS to decide whether an endpoint's certificate has been signed by the correct authority. 
+- ``RAY_TLS_SERVER_CERT``: Location of a `certificate file (tls.crt)`, which is presented to other endpoints to achieve mutual authentication. +- ``RAY_TLS_SERVER_KEY``: Location of a `private key file (tls.key)`, which is the cryptographic means to prove to other endpoints that you are the authorized user of a given certificate. +- ``RAY_TLS_CA_CERT``: Location of a `CA certificate file (ca.crt)`, which allows TLS to decide whether an endpoint's certificate has been signed by the correct authority. +Step 4: Verify TLS authentication +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + # Log in to the worker Pod + kubectl exec -it ${WORKER_POD} -- bash + + # Since the head Pod has the certificate of the fully qualified DNS name for the Ray head service, the connection to the worker Pods + # is established successfully + ray health-check --address service-ray-head.default.svc.cluster.local:6379 + + # Since service-ray-head hasn't been added to the alt_names section in the certificate, the connection fails and an error + # message similar to the following is displayed: "Peer name service-ray-head is not in peer certificate". + ray health-check --address service-ray-head:6379 + + # After you add `DNS.3 = service-ray-head` to the alt_names section and deploy the YAML again, the connection works. + + +Enabling TLS causes a performance hit due to the extra overhead of mutual +authentication and encryption. +Testing has shown that this overhead is large for small workloads and becomes +relatively smaller for large workloads. +The exact overhead depends on the nature of your workload. Java Applications ----------------- diff --git a/doc/source/ray-core/cross-language.rst b/doc/source/ray-core/cross-language.rst index 4c24375a95a7..487d2263daa9 100644 --- a/doc/source/ray-core/cross-language.rst +++ b/doc/source/ray-core/cross-language.rst @@ -10,39 +10,43 @@ Setup the driver We need to set :ref:`code_search_path` in your driver. -.. 
tabbed:: Python +.. tab-set:: - .. literalinclude:: ./doc_code/cross_language.py - :language: python - :start-after: __crosslang_init_start__ - :end-before: __crosslang_init_end__ + .. tab-item:: Python -.. tabbed:: Java + .. literalinclude:: ./doc_code/cross_language.py + :language: python + :start-after: __crosslang_init_start__ + :end-before: __crosslang_init_end__ - .. code-block:: bash + .. tab-item:: Java - java -classpath \ - -Dray.address=
    \ - -Dray.job.code-search-path=/path/to/code/ \ - + .. code-block:: bash + + java -classpath \ + -Dray.address=
    \ + -Dray.job.code-search-path=/path/to/code/ \ + You may want to include multiple directories to load both Python and Java code for workers, if they are placed in different directories. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ./doc_code/cross_language.py - :language: python - :start-after: __crosslang_multidir_start__ - :end-before: __crosslang_multidir_end__ + .. literalinclude:: ./doc_code/cross_language.py + :language: python + :start-after: __crosslang_multidir_start__ + :end-before: __crosslang_multidir_end__ -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: bash + .. code-block:: bash - java -classpath \ - -Dray.address=
    \ - -Dray.job.code-search-path=/path/to/jars:/path/to/pys \ - + java -classpath \ + -Dray.address=
    \ + -Dray.job.code-search-path=/path/to/jars:/path/to/pys \ + Python calling Java ------------------- diff --git a/doc/source/ray-core/examples/batch_prediction.ipynb b/doc/source/ray-core/examples/batch_prediction.ipynb index 382979127766..9cb708dd8212 100644 --- a/doc/source/ray-core/examples/batch_prediction.ipynb +++ b/doc/source/ray-core/examples/batch_prediction.ipynb @@ -213,7 +213,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the ActorPool is fixed in size, unlike task-based approach where the number of parallel tasks can be dynamic (as long as it's not exceeding max_in_flight_tasks). To have autoscaling actor pool, you will need to use the {doc}`Ray Datasets batch prediction `." + "Note that the ActorPool is fixed in size, unlike task-based approach where the number of parallel tasks can be dynamic (as long as it's not exceeding max_in_flight_tasks). To have autoscaling actor pool, you will need to use the {doc}`Ray Data batch prediction `." ] }, { diff --git a/doc/source/ray-core/examples/lm/lm-cluster.yaml b/doc/source/ray-core/examples/lm/lm-cluster.yaml index e53cf692f687..021b85c4eca6 100644 --- a/doc/source/ray-core/examples/lm/lm-cluster.yaml +++ b/doc/source/ray-core/examples/lm/lm-cluster.yaml @@ -91,7 +91,7 @@ setup_commands: # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions + - pip install boto3>=1.4.8 # 1.4.8 adds InstanceMarketOptions # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/doc/source/ray-core/examples/overview.rst b/doc/source/ray-core/examples/overview.rst index 00753e8b2090..19d746baae7c 100644 --- a/doc/source/ray-core/examples/overview.rst +++ b/doc/source/ray-core/examples/overview.rst @@ -7,60 +7,57 @@ Ray Tutorials and Examples Machine Learning Examples ------------------------- -.. 
panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/timeseries.png - - +++ - .. link-button:: automl_for_time_series - :type: ref - :text: Build Simple AutoML for Time Series Using Ray - :classes: btn-link btn-block stretched-link - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: batch_prediction - :type: ref - :text: Build Batch Prediction Using Ray - :classes: btn-link btn-block stretched-link - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: batch_training - :type: ref - :text: Build Batch Training Using Ray - :classes: btn-link btn-block stretched-link - --- - :img-top: images/param_actor.png - - +++ - .. link-button:: plot_parameter_server - :type: ref - :text: Build a Simple Parameter Server Using Ray - :classes: btn-link btn-block stretched-link - --- - :img-top: images/hyperparameter.png - - +++ - .. link-button:: plot_hyperparameter - :type: ref - :text: Simple Parallel Model Selection - :classes: btn-link btn-block stretched-link - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: plot_example-lm - :type: ref - :text: Fault-Tolerant Fairseq Training - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /images/timeseries.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: automl_for_time_series + + Build Simple AutoML for Time Series Using Ray + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: batch_prediction + + Build Batch Prediction Using Ray + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: batch_training + + Build Batch Training Using Ray + + .. grid-item-card:: + :img-top: images/param_actor.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: plot_parameter_server + + Build a Simple Parameter Server Using Ray + + .. grid-item-card:: + :img-top: images/hyperparameter.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: plot_hyperparameter + + Simple Parallel Model Selection + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: plot_example-lm + + Fault-Tolerant Fairseq Training Reinforcement Learning Examples @@ -70,61 +67,54 @@ These are simple examples that show you how to leverage Ray Core. For Ray's production-grade reinforcement learning library, see `RLlib `__. -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: images/pong.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: images/pong.png + .. button-ref:: plot_pong_example - +++ - .. link-button:: plot_pong_example - :type: ref - :text: Learning to Play Pong - :classes: btn-link btn-block stretched-link + Learning to Play Pong - --- - :img-top: images/a3c.png + .. grid-item-card:: + :img-top: images/a3c.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - +++ - .. link-button:: plot_example-a3c - :type: ref - :text: Asynchronous Advantage Actor Critic (A3C) - :classes: btn-link btn-block stretched-link + .. button-ref:: plot_example-a3c + + Asynchronous Advantage Actor Critic (A3C) Basic Examples -------------- -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. 
link-button:: gentle_walkthrough - :type: ref - :text: A Gentle Introduction to Ray Core by Example - :classes: btn-link btn-block stretched-link - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: highly_parallel - :type: ref - :text: Using Ray for Highly Parallelizable Tasks - :classes: btn-link btn-block stretched-link - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: map_reduce - :type: ref - :text: Running a Simple MapReduce Example with Ray Core - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: gentle_walkthrough + + A Gentle Introduction to Ray Core by Example + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: highly_parallel + + Using Ray for Highly Parallelizable Tasks + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: map_reduce + + Running a Simple MapReduce Example with Ray Core diff --git a/doc/source/ray-core/fault_tolerance/gcs.rst b/doc/source/ray-core/fault_tolerance/gcs.rst index d94f9db27b85..9f995e518a80 100644 --- a/doc/source/ray-core/fault_tolerance/gcs.rst +++ b/doc/source/ray-core/fault_tolerance/gcs.rst @@ -22,33 +22,35 @@ However, running Ray tasks and actors remain alive and any existing objects will Setting up Redis ---------------- -.. tabbed:: KubeRay (officially supported) +.. tab-set:: - If you are using :ref:`KubeRay `, please refer to `KubeRay docs on GCS Fault Tolerance `_. + .. tab-item:: KubeRay (officially supported) -.. tabbed:: ray start + If you are using :ref:`KubeRay `, please refer to `KubeRay docs on GCS Fault Tolerance `_. 
- If you are using :ref:`ray start ` to start the Ray head node, - set the OS environment ``RAY_REDIS_ADDRESS`` to - the Redis address, and supply the ``--redis-password`` flag with the password when calling ``ray start``: + .. tab-item:: ray start - .. code-block:: shell + If you are using :ref:`ray start ` to start the Ray head node, + set the OS environment ``RAY_REDIS_ADDRESS`` to + the Redis address, and supply the ``--redis-password`` flag with the password when calling ``ray start``: - RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD + .. code-block:: shell -.. tabbed:: ray up + RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD - If you are using :ref:`ray up ` to start the Ray cluster, change :ref:`head_start_ray_commands ` field to add ``RAY_REDIS_ADDRESS`` and ``--redis-password`` to the ``ray start`` command: + .. tab-item:: ray up - .. code-block:: yaml + If you are using :ref:`ray up ` to start the Ray cluster, change :ref:`head_start_ray_commands ` field to add ``RAY_REDIS_ADDRESS`` and ``--redis-password`` to the ``ray start`` command: - head_start_ray_commands: - - ray stop - - ulimit -n 65536; RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 + .. code-block:: yaml -.. tabbed:: Kubernetes + head_start_ray_commands: + - ray stop + - ulimit -n 65536; RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 - If you are using Kubernetes but not :ref:`KubeRay `, please refer to :ref:`this doc `. + .. tab-item:: Kubernetes + + If you are using Kubernetes but not :ref:`KubeRay `, please refer to :ref:`this doc `. 
Once the GCS is backed by Redis, when it restarts, it'll recover the diff --git a/doc/source/ray-core/handling-dependencies.rst b/doc/source/ray-core/handling-dependencies.rst index 9bdd069e6509..940209475219 100644 --- a/doc/source/ray-core/handling-dependencies.rst +++ b/doc/source/ray-core/handling-dependencies.rst @@ -66,22 +66,30 @@ In contrast with the base cluster environment, a runtime environment will only b Runtime environments also allow you to set dependencies per-task, per-actor, and per-job on a long-running Ray cluster. -.. - TODO(architkulkarni): run working_dir doc example in CI +.. testcode:: + :hide: -.. code-block:: python + import ray + ray.shutdown() + +.. testcode:: import ray - import requests - runtime_env = {"working_dir": "/data/my_files", "pip": ["requests", "pendulum==2.1.2"]} + runtime_env = {"pip": ["emoji"]} ray.init(runtime_env=runtime_env) @ray.remote def f(): - open("my_datafile.txt").read() - return requests.get("https://www.ray.io") + import emoji + return emoji.emojize('Python is :thumbs_up:') + + print(ray.get(f.remote())) + +.. testoutput:: + + Python is 👍 A runtime environment can be described by a Python `dict`: @@ -117,7 +125,8 @@ You can specify a runtime environment for your whole job, whether running a scri :start-after: __ray_init_start__ :end-before: __ray_init_end__ -.. code-block:: python +.. testcode:: + :skipif: True # Option 2: Using Ray Jobs API (Python SDK) from ray.job_submission import JobSubmissionClient @@ -129,14 +138,14 @@ You can specify a runtime environment for your whole job, whether running a scri ) .. code-block:: bash - + # Option 3: Using Ray Jobs API (CLI). (Note: can use --runtime-env to pass a YAML file instead of an inline JSON string.) $ ray job submit --address="http://:8265" --runtime-env-json='{"working_dir": "/data/my_files", "pip": ["emoji"]}' -- python my_ray_script.py .. 
warning:: - If using the Ray Jobs API (either the Python SDK or the CLI), specify the ``runtime_env`` argument in the ``submit_job`` call or the ``ray job submit``, not in the ``ray.init()`` call in the entrypoint script (in this example, ``my_ray_script.py``). - + If using the Ray Jobs API (either the Python SDK or the CLI), specify the ``runtime_env`` argument in the ``submit_job`` call or the ``ray job submit``, not in the ``ray.init()`` call in the entrypoint script (in this example, ``my_ray_script.py``). + This ensures the runtime environment is installed on the cluster before the entrypoint script is run. .. note:: @@ -182,24 +191,36 @@ For a development workflow, these might live on your local machine, but when it The following simple example explains how to get your local files on the cluster. -.. code-block:: python +.. testcode:: + :hide: - # /path/to/files is a directory on the local machine. - # /path/to/files/hello.txt contains the string "Hello World!" + import ray + ray.shutdown() +.. testcode:: + + import os import ray + os.makedirs("/tmp/runtime_env_working_dir", exist_ok=True) + with open("/tmp/runtime_env_working_dir/hello.txt", "w") as hello_file: + hello_file.write("Hello World!") + # Specify a runtime environment for the entire Ray job - ray.init(runtime_env={"working_dir": "/path/to/files"}) + ray.init(runtime_env={"working_dir": "/tmp/runtime_env_working_dir"}) # Create a Ray task, which inherits the above runtime env. @ray.remote def f(): # The function will have its working directory changed to its node's - # local copy of /path/to/files. + # local copy of /tmp/runtime_env_working_dir. return open("hello.txt").read() - print(ray.get(f.remote())) # Hello World! + print(ray.get(f.remote())) + +.. testoutput:: + + Hello World! .. 
note:: The example above is written to run on a local machine, but as for all of these examples, it also works when specifying a Ray cluster to connect to @@ -218,7 +239,13 @@ Ray ordinarily expects all imported packages to be preinstalled on every node of However, using runtime environments you can dynamically specify packages to be automatically downloaded and installed in a virtual environment for your Ray job, or for specific Ray tasks or actors. -.. code-block:: python +.. testcode:: + :hide: + + import ray + ray.shutdown() + +.. testcode:: import ray import requests @@ -229,20 +256,24 @@ However, using runtime environments you can dynamically specify packages to be a @ray.remote def reqs(): - return requests.get("https://www.ray.io/") + return requests.get("https://www.ray.io/").status_code - print(ray.get(reqs.remote())) # + print(ray.get(reqs.remote())) +.. testoutput:: -You may also specify your ``pip`` dependencies either via a Python list or a ``requirements.txt`` file. -Alternatively, you can specify a ``conda`` environment, either as a Python dictionary or via a ``environment.yml`` file. This conda environment can include ``pip`` packages. + 200 + + +You may also specify your ``pip`` dependencies either via a Python list or a local ``requirements.txt`` file. +Alternatively, you can specify a ``conda`` environment, either as a Python dictionary or via a local ``environment.yml`` file. This conda environment can include ``pip`` packages. For details, head to the :ref:`API Reference `. .. warning:: Since the packages in the ``runtime_env`` are installed at runtime, be cautious when specifying ``conda`` or ``pip`` packages whose installations involve building from source, as this can be slow. -.. note:: +.. note:: When using the ``"pip"`` field, the specified packages will be installed "on top of" the base environment using ``virtualenv``, so existing packages on your cluster will still be importable. 
By contrast, when using the ``conda`` field, your Ray tasks and actors will run in an isolated environment. The ``conda`` and ``pip`` fields cannot both be used in a single ``runtime_env``. @@ -250,7 +281,7 @@ For details, head to the :ref:`API Reference `. The ``ray[default]`` package itself will automatically be installed in the environment. For the ``conda`` field only, if you are using any other Ray libraries (for example, Ray Serve), then you will need to specify the library in the runtime environment (e.g. ``runtime_env = {"conda": {"dependencies": ["pytorch", "pip", {"pip": ["requests", "ray[serve]"]}]}}``.) -.. note:: +.. note:: ``conda`` environments must have the same Python version as the Ray cluster. Do not list ``ray`` in the ``conda`` dependencies, as it will be automatically installed. @@ -266,7 +297,8 @@ A typical iteration cycle will involve To ensure your local changes show up across all Ray workers and can be imported properly, use the ``py_modules`` field. -.. code-block:: python +.. testcode:: + :skipif: True import ray import my_module @@ -336,7 +368,7 @@ The ``runtime_env`` is a Python dictionary or a Python class :class:`ray.runtime - Example: ``{"working_dir": "/Users/my_working_dir/", "excludes": ["my_file.txt", "/subdir/, "path/to/dir", "*.log"]}`` -- ``pip`` (dict | List[str] | str): Either (1) a list of pip `requirements specifiers `_, (2) a string containing the path to a pip +- ``pip`` (dict | List[str] | str): Either (1) a list of pip `requirements specifiers `_, (2) a string containing the path to a local pip `“requirements.txt” `_ file, or (3) a python dictionary that has three fields: (a) ``packages`` (required, List[str]): a list of pip packages, (b) ``pip_check`` (optional, bool): whether to enable `pip check `_ at the end of pip install, defaults to ``False``. (c) ``pip_version`` (optional, str): the version of pip; Ray will spell the package name "pip" in front of the ``pip_version`` to form the final requirement string. 
@@ -351,9 +383,10 @@ The ``runtime_env`` is a Python dictionary or a Python class :class:`ray.runtime - Example: ``{"packages":["tensorflow", "requests"], "pip_check": False, "pip_version": "==22.0.2;python_version=='3.8.11'"}`` - When specifying a ``requirements.txt`` file, referencing local files `within` that file is not supported (e.g. ``-r ./my-laptop/more-requirements.txt``, ``./my-pkg.whl``). + When specifying a path to a ``requirements.txt`` file, the file must be present on your local machine and it must be a valid absolute path or relative filepath relative to your local current working directory, *not* relative to the `working_dir` specified in the `runtime_env`. + Furthermore, referencing local files `within` a `requirements.txt` file is not supported (e.g., ``-r ./my-laptop/more-requirements.txt``, ``./my-pkg.whl``). -- ``conda`` (dict | str): Either (1) a dict representing the conda environment YAML, (2) a string containing the path to a +- ``conda`` (dict | str): Either (1) a dict representing the conda environment YAML, (2) a string containing the path to a local `conda “environment.yml” `_ file, or (3) the name of a local conda environment already installed on each node in your cluster (e.g., ``"pytorch_p36"``). In the first two cases, the Ray and Python dependencies will be automatically injected into the environment to ensure compatibility, so there is no need to manually include them. @@ -366,11 +399,19 @@ The ``runtime_env`` is a Python dictionary or a Python class :class:`ray.runtime - Example: ``"pytorch_p36"`` + When specifying a path to a ``environment.yml`` file, the file must be present on your local machine and it must be a valid absolute path or a relative filepath relative to your local current working directory, *not* relative to the `working_dir` specified in the `runtime_env`. + Furthermore, referencing local files `within` a `environment.yml` file is not supported. + - ``env_vars`` (Dict[str, str]): Environment variables to set. 
Environment variables already set on the cluster will still be visible to the Ray workers; so there is no need to include ``os.environ`` or similar in the ``env_vars`` field. + By default, these environment variables override the same name environment variables on the cluster. + You can also reference existing environment variables using ${ENV_VAR} to achieve the appending behavior. + Only PATH, LD_LIBRARY_PATH, DYLD_LIBRARY_PATH, and LD_PRELOAD are supported. See below for an example: - Example: ``{"OMP_NUM_THREADS": "32", "TF_WARNINGS": "none"}`` + - Example: ``{"LD_LIBRARY_PATH": "${LD_LIBRARY_PATH}:/home/admin/my_lib"}`` + - ``container`` (dict): Require a given (Docker) image, and the worker process will run in a container with this image. The `worker_path` is the default_worker.py path. It is required only if ray installation directory in the container is different from raylet host. The `run_options` list spec is `here `__. @@ -416,7 +457,7 @@ If an actor or task specifies a new ``runtime_env``, it will override the parent Example: -.. code-block:: python +.. testcode:: # Parent's `runtime_env` {"pip": ["requests", "chess"], @@ -424,11 +465,11 @@ Example: # Child's specified `runtime_env` {"pip": ["torch", "ray[serve]"], - "env_vars": {"B": "new", "C", "c"}} + "env_vars": {"B": "new", "C": "c"}} # Child's actual `runtime_env` (merged with parent's) {"pip": ["torch", "ray[serve]"], - "env_vars": {"A": "a", "B": "new", "C", "c"}} + "env_vars": {"A": "a", "B": "new", "C": "c"}} .. _runtime-env-faq: @@ -456,8 +497,8 @@ Any local files downloaded by the environments are cached at ``/tmp/ray/session_ How long does it take to install or to load from cache? """"""""""""""""""""""""""""""""""""""""""""""""""""""" -The install time usually mostly consists of the time it takes to run ``pip install`` or ``conda create`` / ``conda activate``, or to upload/download a ``working_dir``, depending on which ``runtime_env`` options you're using. 
-This could take seconds or minutes. +The install time usually mostly consists of the time it takes to run ``pip install`` or ``conda create`` / ``conda activate``, or to upload/download a ``working_dir``, depending on which ``runtime_env`` options you're using. +This could take seconds or minutes. On the other hand, loading a runtime environment from the cache should be nearly as fast as the ordinary Ray worker startup time, which is on the order of a few seconds. A new Ray worker is started for every Ray actor or task that requires a new runtime environment. (Note that loading a cached ``conda`` environment could still be slow, since the ``conda activate`` command sometimes takes a few seconds.) @@ -467,7 +508,7 @@ You can set ``setup_timeout_seconds`` config to avoid the installation hanging f What is the relationship between runtime environments and Docker? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -They can be used independently or together. +They can be used independently or together. A container image can be specified in the :ref:`Cluster Launcher ` for large or static dependencies, and runtime environments can be specified per-job or per-task/actor for more dynamic use cases. The runtime environment will inherit packages, files, and environment variables from the container image. @@ -490,9 +531,10 @@ The contents of this directory will be directly accessed as the ``working_dir`` For example, suppose you want to use the contents in your local ``/some_path/example_dir`` directory as your ``working_dir``. If you want to specify this directory as a local path, your ``runtime_env`` dictionary should contain: -.. code-block:: python +.. testcode:: + :skipif: True - runtime_env = {..., "working_dir": "/some_path/example_dir", ...} + runtime_env = {..., "working_dir": "/some_path/example_dir", ...} Suppose instead you want to host your files in your ``/some_path/example_dir`` directory remotely and provide a remote URI. 
You would need to first compress the ``example_dir`` directory into a zip file. @@ -521,9 +563,10 @@ You can check that the zip file contains a single top-level directory by running Suppose you upload the compressed ``example_dir`` directory to AWS S3 at the S3 URI ``s3://example_bucket/example.zip``. Your ``runtime_env`` dictionary should contain: -.. code-block:: python +.. testcode:: + :skipif: True - runtime_env = {..., "working_dir": "s3://example_bucket/example.zip", ...} + runtime_env = {..., "working_dir": "s3://example_bucket/example.zip", ...} .. warning:: @@ -626,7 +669,7 @@ To create the URL, pick a URL template below that fits your use case, and fill i For instance, suppose your GitHub username is ``example_user``, the repository's name is ``example_repository``, and the desired commit hash is ``abcdefg``. If ``example_repository`` is public and you want to retrieve the ``abcdefg`` commit (which matches the first example use case), the URL would be: -.. code-block:: python +.. testcode:: runtime_env = {"working_dir": ("https://github.com" "/example_user/example_repository/archive/abcdefg.zip")} @@ -635,28 +678,28 @@ Here is a list of different use cases and corresponding URLs: - Example: Retrieve package from a specific commit hash on a public GitHub repository -.. code-block:: python +.. testcode:: runtime_env = {"working_dir": ("https://github.com" "/[username]/[repository]/archive/[commit hash].zip")} - Example: Retrieve package from a private GitHub repository using a Personal Access Token -.. code-block:: python +.. testcode:: runtime_env = {"working_dir": ("https://[username]:[personal access token]@github.com" "/[username]/[private repository]/archive/[commit hash].zip")} - Example: Retrieve package from a public GitHub repository's latest commit -.. code-block:: python +.. 
testcode:: runtime_env = {"working_dir": ("https://github.com" "/[username]/[repository]/archive/HEAD.zip")} - Example: Retrieve package from a specific commit hash on a public Bitbucket repository -.. code-block:: python +.. testcode:: runtime_env = {"working_dir": ("https://bitbucket.org" "/[owner]/[repository]/get/[commit hash].tar.gz")} @@ -678,7 +721,10 @@ If runtime_env cannot be set up (e.g., network issues, download failures, etc.), that require the runtime_env. If you call ``ray.get``, it will raise ``RuntimeEnvSetupError`` with the error message in detail. -.. code-block:: python +.. testcode:: + + import ray + import time @ray.remote def f(): @@ -693,11 +739,23 @@ the error message in detail. bad_env = {"conda": {"dependencies": ["this_doesnt_exist"]}} # [Tasks] will raise `RuntimeEnvSetupError`. - ray.get(f.options(runtime_env=bad_env).remote()) + try: + ray.get(f.options(runtime_env=bad_env).remote()) + except ray.exceptions.RuntimeEnvSetupError: + print("Task fails with RuntimeEnvSetupError") # [Actors] will raise `RuntimeEnvSetupError`. a = A.options(runtime_env=bad_env).remote() - ray.get(a.f.remote()) + try: + ray.get(a.f.remote()) + except ray.exceptions.RuntimeEnvSetupError: + print("Actor fails with RuntimeEnvSetupError") + +.. testoutput:: + + Task fails with RuntimeEnvSetupError + Actor fails with RuntimeEnvSetupError + Full logs can always be found in the file ``runtime_env_setup-[job_id].log`` for per-actor, per-task and per-job environments, or in ``runtime_env_setup-ray_client_server_[port].log`` for per-job environments when using Ray Client. @@ -707,9 +765,17 @@ This will print the full ``runtime_env`` setup log messages to the driver (the s Example log output: -.. code-block:: text +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: + + ray.init(runtime_env={"pip": ["requests"]}) - >>> ray.init(runtime_env={"pip" ["requests"]}) +.. 
testoutput:: + :options: +SKIP (pid=runtime_env) 2022-02-28 14:12:33,653 INFO pip.py:188 -- Creating virtualenv at /tmp/ray/session_2022-02-28_14-12-29_909064_87908/runtime_resources/pip/0cc818a054853c3841171109300436cad4dcf594/virtualenv, current python dir /Users/user/anaconda3/envs/ray-py38 (pid=runtime_env) 2022-02-28 14:12:33,653 INFO utils.py:76 -- Run cmd[1] ['/Users/user/anaconda3/envs/ray-py38/bin/python', '-m', 'virtualenv', '--app-data', '/tmp/ray/session_2022-02-28_14-12-29_909064_87908/runtime_resources/pip/0cc818a054853c3841171109300436cad4dcf594/virtualenv_app_data', '--reset-app-data', '--no-periodic-update', '--system-site-packages', '--no-download', '/tmp/ray/session_2022-02-28_14-12-29_909064_87908/runtime_resources/pip/0cc818a054853c3841171109300436cad4dcf594/virtualenv'] diff --git a/doc/source/ray-core/miscellaneous.rst b/doc/source/ray-core/miscellaneous.rst index 7eebbdf76e61..0629fcf961c5 100644 --- a/doc/source/ray-core/miscellaneous.rst +++ b/doc/source/ray-core/miscellaneous.rst @@ -13,7 +13,9 @@ You can dynamically adjust resource requirements or return values of ``ray.remot For example, here we instantiate many copies of the same actor with varying resource requirements. Note that to create these actors successfully, Ray will need to be started with sufficient CPU resources and the relevant custom resources: -.. code-block:: python +.. testcode:: + + import ray @ray.remote(num_cpus=4) class Counter(object): @@ -30,21 +32,28 @@ For example, here we instantiate many copies of the same actor with varying reso You can specify different resource requirements for tasks (but not for actor methods): -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. 
testcode:: + + ray.init(num_cpus=1, num_gpus=1) @ray.remote def g(): return ray.get_gpu_ids() object_gpu_ids = g.remote() - assert ray.get(object_gpu_ids) == [0] + assert ray.get(object_gpu_ids) == [] dynamic_object_gpu_ids = g.options(num_cpus=1, num_gpus=1).remote() assert ray.get(dynamic_object_gpu_ids) == [0] And vary the number of return values for tasks (and actor methods too): -.. code-block:: python +.. testcode:: @ray.remote def f(n): @@ -56,7 +65,7 @@ And vary the number of return values for tasks (and actor methods too): And specify a name for tasks (and actor methods too) at task submission time: -.. code-block:: python +.. testcode:: import setproctitle @@ -154,16 +163,21 @@ To get information about the current nodes in your cluster, you can use ``ray.no .. autofunction:: ray.nodes :noindex: +.. testcode:: + :hide: -.. code-block:: python + ray.shutdown() + +.. testcode:: import ray ray.init() - print(ray.nodes()) - """ +.. testoutput:: + :options: +SKIP + [{'NodeID': '2691a0c1aed6f45e262b2372baf58871734332d7', 'Alive': True, 'NodeManagerAddress': '192.168.1.82', @@ -175,7 +189,6 @@ To get information about the current nodes in your cluster, you can use ``ray.no 'MetricsExportPort': 64860, 'alive': True, 'Resources': {'CPU': 16.0, 'memory': 100.0, 'object_store_memory': 34.0, 'node:192.168.1.82': 1.0}}] - """ The above information includes: diff --git a/doc/source/ray-core/namespaces.rst b/doc/source/ray-core/namespaces.rst index 636cc138dab2..c74c263ebecb 100644 --- a/doc/source/ray-core/namespaces.rst +++ b/doc/source/ray-core/namespaces.rst @@ -9,117 +9,121 @@ named, its name must be unique within the namespace. In order to set your applications namespace, it should be specified when you first connect to the cluster. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ./doc_code/namespaces.py - :language: python - :start-after: __init_namespace_start__ - :end-before: __init_namespace_end__ + .. tab-item:: Python -.. tabbed:: Java + .. 
literalinclude:: ./doc_code/namespaces.py + :language: python + :start-after: __init_namespace_start__ + :end-before: __init_namespace_end__ - .. code-block:: java + .. tab-item:: Java - System.setProperty("ray.job.namespace", "hello"); // set it before Ray.init() - Ray.init(); + .. code-block:: java -.. tabbed:: C++ + System.setProperty("ray.job.namespace", "hello"); // set it before Ray.init() + Ray.init(); - .. code-block:: c++ + .. tab-item:: C++ - ray::RayConfig config; - config.ray_namespace = "hello"; - ray::Init(config); + .. code-block:: c++ + + ray::RayConfig config; + config.ray_namespace = "hello"; + ray::Init(config); Please refer to `Driver Options `__ for ways of configuring a Java application. Named actors are only accessible within their namespaces. -.. tabbed:: Python - - .. literalinclude:: ./doc_code/namespaces.py - :language: python - :start-after: __actor_namespace_start__ - :end-before: __actor_namespace_end__ - -.. tabbed:: Java - - .. code-block:: java - - // `ray start --head` has been run to launch a local cluster. - - // Job 1 creates two actors, "orange" and "purple" in the "colors" namespace. - System.setProperty("ray.address", "localhost:10001"); - System.setProperty("ray.job.namespace", "colors"); - try { - Ray.init(); - Ray.actor(Actor::new).setName("orange").remote(); - Ray.actor(Actor::new).setName("purple").remote(); - } finally { - Ray.shutdown(); - } - - // Job 2 is now connecting to a different namespace. - System.setProperty("ray.address", "localhost:10001"); - System.setProperty("ray.job.namespace", "fruits"); - try { - Ray.init(); +.. tab-set:: + + .. tab-item:: Python + + .. literalinclude:: ./doc_code/namespaces.py + :language: python + :start-after: __actor_namespace_start__ + :end-before: __actor_namespace_end__ + + .. tab-item:: Java + + .. code-block:: java + + // `ray start --head` has been run to launch a local cluster. + + // Job 1 creates two actors, "orange" and "purple" in the "colors" namespace. 
+                System.setProperty("ray.address", "localhost:10001");
+                System.setProperty("ray.job.namespace", "colors");
+                try {
+                    Ray.init();
+                    Ray.actor(Actor::new).setName("orange").remote();
+                    Ray.actor(Actor::new).setName("purple").remote();
+                } finally {
+                    Ray.shutdown();
+                }
+
+                // Job 2 is now connecting to a different namespace.
+                System.setProperty("ray.address", "localhost:10001");
+                System.setProperty("ray.job.namespace", "fruits");
+                try {
+                    Ray.init();
+                    // This fails because "orange" was defined in the "colors" namespace.
+                    Ray.getActor("orange").isPresent(); // return false
+                    // This succeeds because the name "orange" is unused in this namespace.
+                    Ray.actor(Actor::new).setName("orange").remote();
+                    Ray.actor(Actor::new).setName("watermelon").remote();
+                } finally {
+                    Ray.shutdown();
+                }
+
+                // Job 3 connects to the original "colors" namespace.
+                System.setProperty("ray.address", "localhost:10001");
+                System.setProperty("ray.job.namespace", "colors");
+                try {
+                    Ray.init();
+                    // This fails because "watermelon" was in the fruits namespace.
+                    Ray.getActor("watermelon").isPresent(); // return false
+                    // This returns the "orange" actor we created in the first job, not the second.
+                    Ray.getActor("orange").isPresent(); // return true
+                } finally {
+                    Ray.shutdown();
+                }
+
+    .. tab-item:: C++
+
+        .. code-block:: c++
+
+            // `ray start --head` has been run to launch a local cluster.
+
+            // Job 1 creates two actors, "orange" and "purple" in the "colors" namespace.
+            ray::RayConfig config;
+            config.ray_namespace = "colors";
+            ray::Init(config);
+            ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("orange").Remote();
+            ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("purple").Remote();
+            ray::Shutdown();
+
+            // Job 2 is now connecting to a different namespace.
+            ray::RayConfig config;
+            config.ray_namespace = "fruits";
+            ray::Init(config);
             // This fails because "orange" was defined in the "colors" namespace.
- Ray.getActor("orange").isPresent(); // return false - // This succceeds because the name "orange" is unused in this namespace. - Ray.actor(Actor::new).setName("orange").remote(); - Ray.actor(Actor::new).setName("watermelon").remote(); - } finally { - Ray.shutdown(); - } - - // Job 3 connects to the original "colors" namespace. - System.setProperty("ray.address", "localhost:10001"); - System.setProperty("ray.job.namespace", "colors"); - try { - Ray.init(); + ray::GetActor("orange"); // return nullptr; + // This succeeds because the name "orange" is unused in this namespace. + ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("orange").Remote(); + ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("watermelon").Remote(); + ray::Shutdown(); + + // Job 3 connects to the original "colors" namespace. + ray::RayConfig config; + config.ray_namespace = "colors"; + ray::Init(config); // This fails because "watermelon" was in the fruits namespace. - Ray.getActor("watermelon").isPresent(); // return false + ray::GetActor("watermelon"); // return nullptr; // This returns the "orange" actor we created in the first job, not the second. - Ray.getActor("orange").isPresent(); // return true - } finally { - Ray.shutdown(); - } - -.. tabbed:: C++ - - .. code-block:: c++ - - // `ray start --head` has been run to launch a local cluster. - - // Job 1 creates two actors, "orange" and "purple" in the "colors" namespace. - ray::RayConfig config; - config.ray_namespace = "colors"; - ray::Init(config); - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("orange").Remote(); - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("purple").Remote(); - ray::Shutdown(); - - // Job 2 is now connecting to a different namespace. - ray::RayConfig config; - config.ray_namespace = "fruits"; - ray::Init(config); - // This fails because "orange" was defined in the "colors" namespace. 
- ray::GetActor("orange"); // return nullptr; - // This succceeds because the name "orange" is unused in this namespace. - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("orange").Remote(); - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("watermelon").Remote(); - ray::Shutdown(); - - // Job 3 connects to the original "colors" namespace. - ray::RayConfig config; - config.ray_namespace = "colors"; - ray::Init(config); - // This fails because "watermelon" was in the fruits namespace. - ray::GetActor("watermelon"); // return nullptr; - // This returns the "orange" actor we created in the first job, not the second. - ray::GetActor("orange"); - ray::Shutdown(); + ray::GetActor("orange"); + ray::Shutdown(); Specifying namespace for named actors ------------------------------------- @@ -127,45 +131,47 @@ Specifying namespace for named actors You can specify a namespace for a named actor while creating it. The created actor belongs to the specified namespace, no matter what namespace of the current job is. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ./doc_code/namespaces.py - :language: python - :start-after: __specify_actor_namespace_start__ - :end-before: __specify_actor_namespace_end__ + .. tab-item:: Python + .. literalinclude:: ./doc_code/namespaces.py + :language: python + :start-after: __specify_actor_namespace_start__ + :end-before: __specify_actor_namespace_end__ -.. tabbed:: Java - .. code-block:: java + .. tab-item:: Java - // `ray start --head` has been run to launch a local cluster. + .. code-block:: java - System.setProperty("ray.address", "localhost:10001"); - try { - Ray.init(); - // Create an actor with specified namespace. - Ray.actor(Actor::new).setName("my_actor", "actor_namespace").remote(); - // It is accessible in its namespace. - Ray.getActor("my_actor", "actor_namespace").isPresent(); // return true + // `ray start --head` has been run to launch a local cluster. 
-        } finally {
-          Ray.shutdown();
-        }
+            System.setProperty("ray.address", "localhost:10001");
+            try {
+                Ray.init();
+                // Create an actor with specified namespace.
+                Ray.actor(Actor::new).setName("my_actor", "actor_namespace").remote();
+                // It is accessible in its namespace.
+                Ray.getActor("my_actor", "actor_namespace").isPresent(); // return true
-.. tabbed:: C++
+            } finally {
+                Ray.shutdown();
+            }
-    .. code-block:: c++
+    .. tab-item:: C++
-        // `ray start --head` has been run to launch a local cluster.
+        .. code-block:: c++
+
+            // `ray start --head` has been run to launch a local cluster.
+            ray::RayConfig config;
+            ray::Init(config);
+            // Create an actor with specified namespace.
+            ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor", "actor_namespace").Remote();
+            // It is accessible in its namespace.
+            ray::GetActor("my_actor", "actor_namespace");
+            ray::Shutdown();
-        ray::RayConfig config;
-        ray::Init(config);
-        // Create an actor with specified namespace.
-        ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor", "actor_namespace").Remote();
-        // It is accessible in its namespace.
-        ray::GetActor("orange");
-        ray::Shutdown();

 Anonymous namespaces
 --------------------
@@ -174,56 +180,58 @@ When a namespace is not specified, Ray will place your job in an anonymous
 namespace. In an anonymous namespace, your job will have its own namespace and
 will not have access to actors in other namespaces.

-.. tabbed:: Python
+.. tab-set:: 

-    .. literalinclude:: ./doc_code/namespaces.py
-        :language: python
-        :start-after: __anonymous_namespace_start__
-        :end-before: __anonymous_namespace_end__
+    .. tab-item:: Python

-.. tabbed:: Java
+        .. literalinclude:: ./doc_code/namespaces.py
+            :language: python
+            :start-after: __anonymous_namespace_start__
+            :end-before: __anonymous_namespace_end__

-    .. code-block:: java
+    .. tab-item:: Java

-        // `ray start --head` has been run to launch a local cluster.
+        .. code-block:: java

-        // Job 1 connects to an anonymous namespace by default.
- System.setProperty("ray.address", "localhost:10001"); - try { - Ray.init(); - Ray.actor(Actor::new).setName("my_actor").remote(); - } finally { - Ray.shutdown(); - } + // `ray start --head` has been run to launch a local cluster. - // Job 2 connects to a _different_ anonymous namespace by default - System.setProperty("ray.address", "localhost:10001"); - try { - Ray.init(); - // This succeeds because the second job is in its own namespace. - Ray.actor(Actor::new).setName("my_actor").remote(); - } finally { - Ray.shutdown(); - } + // Job 1 connects to an anonymous namespace by default. + System.setProperty("ray.address", "localhost:10001"); + try { + Ray.init(); + Ray.actor(Actor::new).setName("my_actor").remote(); + } finally { + Ray.shutdown(); + } -.. tabbed:: C++ + // Job 2 connects to a _different_ anonymous namespace by default + System.setProperty("ray.address", "localhost:10001"); + try { + Ray.init(); + // This succeeds because the second job is in its own namespace. + Ray.actor(Actor::new).setName("my_actor").remote(); + } finally { + Ray.shutdown(); + } - .. code-block:: c++ + .. tab-item:: C++ - // `ray start --head` has been run to launch a local cluster. + .. code-block:: c++ - // Job 1 connects to an anonymous namespace by default. - ray::RayConfig config; - ray::Init(config); - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor").Remote(); - ray::Shutdown(); + // `ray start --head` has been run to launch a local cluster. - // Job 2 connects to a _different_ anonymous namespace by default - ray::RayConfig config; - ray::Init(config); - // This succeeds because the second job is in its own namespace. - ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor").Remote(); - ray::Shutdown(); + // Job 1 connects to an anonymous namespace by default. 
+ ray::RayConfig config; + ray::Init(config); + ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor").Remote(); + ray::Shutdown(); + + // Job 2 connects to a _different_ anonymous namespace by default + ray::RayConfig config; + ray::Init(config); + // This succeeds because the second job is in its own namespace. + ray::Actor(RAY_FUNC(Counter::FactoryCreate)).SetName("my_actor").Remote(); + ray::Shutdown(); .. note:: @@ -236,34 +244,36 @@ Getting the current namespace ----------------------------- You can access to the current namespace using :ref:`runtime_context APIs `. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ./doc_code/namespaces.py - :language: python - :start-after: __get_namespace_start__ - :end-before: __get_namespace_end__ + .. tab-item:: Python + .. literalinclude:: ./doc_code/namespaces.py + :language: python + :start-after: __get_namespace_start__ + :end-before: __get_namespace_end__ -.. tabbed:: Java - .. code-block:: java + .. tab-item:: Java - System.setProperty("ray.job.namespace", "colors"); - try { - Ray.init(); - // Will print namespace name "colors". - System.out.println(Ray.getRuntimeContext().getNamespace()); - } finally { - Ray.shutdown(); - } + .. code-block:: java -.. tabbed:: C++ + System.setProperty("ray.job.namespace", "colors"); + try { + Ray.init(); + // Will print namespace name "colors". + System.out.println(Ray.getRuntimeContext().getNamespace()); + } finally { + Ray.shutdown(); + } - .. code-block:: c++ + .. tab-item:: C++ - ray::RayConfig config; - config.ray_namespace = "colors"; - ray::Init(config); - // Will print namespace name "colors". - std::cout << ray::GetNamespace() << std::endl; - ray::Shutdown(); + .. code-block:: c++ + + ray::RayConfig config; + config.ray_namespace = "colors"; + ray::Init(config); + // Will print namespace name "colors". 
+ std::cout << ray::GetNamespace() << std::endl; + ray::Shutdown(); diff --git a/doc/source/ray-core/objects.rst b/doc/source/ray-core/objects.rst index 721924c12c2c..78294535d420 100644 --- a/doc/source/ray-core/objects.rst +++ b/doc/source/ray-core/objects.rst @@ -14,29 +14,33 @@ Object refs can be created in two ways. 1. They are returned by remote function calls. 2. They are returned by :func:`ray.put() `. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - # Put an object in Ray's object store. - y = 1 - object_ref = ray.put(y) + .. testcode:: -.. tabbed:: Java + import ray - .. code-block:: java + # Put an object in Ray's object store. + y = 1 + object_ref = ray.put(y) - // Put an object in Ray's object store. - int y = 1; - ObjectRef objectRef = Ray.put(y); + .. tab-item:: Java -.. tabbed:: C++ + .. code-block:: java - .. code-block:: c++ + // Put an object in Ray's object store. + int y = 1; + ObjectRef objectRef = Ray.put(y); - // Put an object in Ray's object store. - int y = 1; - ray::ObjectRef object_ref = ray::Put(y); + .. tab-item:: C++ + + .. code-block:: c++ + + // Put an object in Ray's object store. + int y = 1; + ray::ObjectRef object_ref = ray::Put(y); .. note:: @@ -51,82 +55,91 @@ Fetching Object Data You can use the :func:`ray.get() ` method to fetch the result of a remote object from an object ref. If the current node's object store does not contain the object, the object is downloaded. -.. tabbed:: Python - - If the object is a `numpy array `__ - or a collection of numpy arrays, the ``get`` call is zero-copy and returns arrays backed by shared object store memory. - Otherwise, we deserialize the object data into a Python object. - - .. code-block:: python - - # Get the value of one object ref. - obj_ref = ray.put(1) - assert ray.get(obj_ref) == 1 - - # Get the values of multiple object refs in parallel. 
- assert ray.get([ray.put(i) for i in range(3)]) == [0, 1, 2] - - # You can also set a timeout to return early from a ``get`` - # that's blocking for too long. - from ray.exceptions import GetTimeoutError - # ``GetTimeoutError`` is a subclass of ``TimeoutError``. - - @ray.remote - def long_running_function(): - time.sleep(8) - - obj_ref = long_running_function.remote() - try: - ray.get(obj_ref, timeout=4) - except GetTimeoutError: # You can capture the standard "TimeoutError" instead - print("`get` timed out.") - -.. tabbed:: Java - - .. code-block:: java - - // Get the value of one object ref. - ObjectRef objRef = Ray.put(1); - Assert.assertTrue(objRef.get() == 1); - // You can also set a timeout(ms) to return early from a ``get`` that's blocking for too long. - Assert.assertTrue(objRef.get(1000) == 1); - - // Get the values of multiple object refs in parallel. - List> objectRefs = new ArrayList<>(); - for (int i = 0; i < 3; i++) { - objectRefs.add(Ray.put(i)); - } - List results = Ray.get(objectRefs); - Assert.assertEquals(results, ImmutableList.of(0, 1, 2)); - - // Ray.get timeout example: Ray.get will throw an RayTimeoutException if time out. - public class MyRayApp { - public static int slowFunction() throws InterruptedException { - TimeUnit.SECONDS.sleep(10); - return 1; - } - } - Assert.assertThrows(RayTimeoutException.class, - () -> Ray.get(Ray.task(MyRayApp::slowFunction).remote(), 3000)); - -.. tabbed:: C++ - - .. code-block:: c++ - - // Get the value of one object ref. - ray::ObjectRef obj_ref = ray::Put(1); - assert(*obj_ref.Get() == 1); - - // Get the values of multiple object refs in parallel. - std::vector> obj_refs; - for (int i = 0; i < 3; i++) { - obj_refs.emplace_back(ray::Put(i)); - } - auto results = ray::Get(obj_refs); - assert(results.size() == 3); - assert(*results[0] == 0); - assert(*results[1] == 1); - assert(*results[2] == 2); +.. tab-set:: + + .. 
tab-item:: Python + + If the object is a `numpy array `__ + or a collection of numpy arrays, the ``get`` call is zero-copy and returns arrays backed by shared object store memory. + Otherwise, we deserialize the object data into a Python object. + + .. testcode:: + + import ray + import time + + # Get the value of one object ref. + obj_ref = ray.put(1) + assert ray.get(obj_ref) == 1 + + # Get the values of multiple object refs in parallel. + assert ray.get([ray.put(i) for i in range(3)]) == [0, 1, 2] + + # You can also set a timeout to return early from a ``get`` + # that's blocking for too long. + from ray.exceptions import GetTimeoutError + # ``GetTimeoutError`` is a subclass of ``TimeoutError``. + + @ray.remote + def long_running_function(): + time.sleep(8) + + obj_ref = long_running_function.remote() + try: + ray.get(obj_ref, timeout=4) + except GetTimeoutError: # You can capture the standard "TimeoutError" instead + print("`get` timed out.") + + .. testoutput:: + + `get` timed out. + + .. tab-item:: Java + + .. code-block:: java + + // Get the value of one object ref. + ObjectRef objRef = Ray.put(1); + Assert.assertTrue(objRef.get() == 1); + // You can also set a timeout(ms) to return early from a ``get`` that's blocking for too long. + Assert.assertTrue(objRef.get(1000) == 1); + + // Get the values of multiple object refs in parallel. + List> objectRefs = new ArrayList<>(); + for (int i = 0; i < 3; i++) { + objectRefs.add(Ray.put(i)); + } + List results = Ray.get(objectRefs); + Assert.assertEquals(results, ImmutableList.of(0, 1, 2)); + + // Ray.get timeout example: Ray.get will throw an RayTimeoutException if time out. + public class MyRayApp { + public static int slowFunction() throws InterruptedException { + TimeUnit.SECONDS.sleep(10); + return 1; + } + } + Assert.assertThrows(RayTimeoutException.class, + () -> Ray.get(Ray.task(MyRayApp::slowFunction).remote(), 3000)); + + .. tab-item:: C++ + + .. code-block:: c++ + + // Get the value of one object ref. 
+ ray::ObjectRef obj_ref = ray::Put(1); + assert(*obj_ref.Get() == 1); + + // Get the values of multiple object refs in parallel. + std::vector> obj_refs; + for (int i = 0; i < 3; i++) { + obj_refs.emplace_back(ray::Put(i)); + } + auto results = ray::Get(obj_refs); + assert(results.size() == 3); + assert(*results[0] == 0); + assert(*results[1] == 1); + assert(*results[2] == 2); Passing Object Arguments ------------------------ @@ -145,7 +158,17 @@ There are two different ways one can pass an object to a Ray task or method. Dep The top-level vs not top-level passing convention also applies to actor constructors and actor method calls: -.. code-block:: python +.. testcode:: + + @ray.remote + class Actor: + def __init__(self, arg): + pass + + def method(self, arg): + pass + + obj = ray.put(2) # Examples of passing objects to actor constructors. actor_handle = Actor.remote(obj) # by-value @@ -167,7 +190,7 @@ Nested Objects Ray also supports nested object references. This allows you to build composite objects that themselves hold references to further sub-objects. -.. code-block:: python +.. testcode:: # Objects can be nested within each other. Ray will keep the inner object # alive via reference counting until all outer object references are deleted. diff --git a/doc/source/ray-core/objects/object-spilling.rst b/doc/source/ray-core/objects/object-spilling.rst index b8eba3be732c..845215aa2a44 100644 --- a/doc/source/ray-core/objects/object-spilling.rst +++ b/doc/source/ray-core/objects/object-spilling.rst @@ -11,7 +11,13 @@ Ray uses object spilling by default. Without any setting, objects are spilled to To configure the directory where objects are spilled to, use: -.. code-block:: python +.. testcode:: + :hide: + + import ray + ray.shutdown() + +.. 
testcode:: import json import ray @@ -27,7 +33,12 @@ To configure the directory where objects are spilled to, use: You can also specify multiple directories for spilling to spread the IO load and disk space usage across multiple physical devices if needed (e.g., SSD devices): -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: import json import ray @@ -59,7 +70,12 @@ usage across multiple physical devices if needed (e.g., SSD devices): If you are using an HDD, it is recommended that you specify a large buffer size (> 1MB) to reduce IO requests during spilling. -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: import json import ray @@ -82,7 +98,12 @@ To prevent running out of disk space, local object spilling will throw ``OutOfDi If multiple physical devices are used, any physical device's over-usage will trigger the ``OutOfDiskError``. The default threshold is 0.95 (95%). You can adjust the threshold by setting ``local_fs_capacity_threshold``, or set it to 1 to disable the protection. -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: import json import ray @@ -97,6 +118,7 @@ The default threshold is 0.95 (95%). You can adjust the threshold by setting ``l "type": "filesystem", "params": { "directory_path": "/tmp/spill", + } }, ) }, @@ -105,7 +127,13 @@ The default threshold is 0.95 (95%). You can adjust the threshold by setting ``l To enable object spilling to remote storage (any URI supported by `smart_open `__): -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: + :skipif: True import json import ray @@ -130,7 +158,13 @@ It is recommended that you specify a large buffer size (> 1MB) to reduce IO requ Spilling to multiple remote storages is also supported. -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. 
testcode:: + :skipif: True import json import ray diff --git a/doc/source/ray-core/objects/serialization.rst b/doc/source/ray-core/objects/serialization.rst index b73447c8d3d1..f7007adce862 100644 --- a/doc/source/ray-core/objects/serialization.rst +++ b/doc/source/ray-core/objects/serialization.rst @@ -3,7 +3,7 @@ Serialization ============= -Since Ray processes do not share memory space, data transferred between workers and nodes will need to **serialized** and **deserialized**. Ray uses the `Plasma object store `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). +Since Ray processes do not share memory space, data transferred between workers and nodes will need to **serialized** and **deserialized**. Ray uses the `Plasma object store `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). Overview -------- @@ -47,9 +47,11 @@ Serialization notes - For non-native objects, Ray will always keep a single copy even it is referred multiple times in an object: - .. code-block:: python + .. testcode:: + import ray import numpy as np + obj = [np.zeros(42)] * 99 l = ray.get(ray.put(obj)) assert l[0] is l[1] # no problem! @@ -72,13 +74,11 @@ There are at least 3 ways to define your custom serialization process: function inside the corresponding class. This is commonly done by most Python libraries. Example code: - .. code-block:: python + .. testcode:: import ray import sqlite3 - ray.init() - class DBConnection: def __init__(self, path): self.path = path @@ -96,11 +96,18 @@ There are at least 3 ways to define your custom serialization process: copied = ray.get(ray.put(original)) print(copied.conn) + .. testoutput:: + :options: +ELLIPSIS + + + + + 2. 
If you want to customize the serialization of a type of objects, but you cannot access or modify the corresponding class, you can register the class with the serializer you use: - .. code-block:: python + .. testcode:: import ray import threading @@ -110,7 +117,10 @@ There are at least 3 ways to define your custom serialization process: self.x = x self.lock = threading.Lock() # could not be serialized! - ray.get(ray.put(A(1))) # fail! + try: + ray.get(ray.put(A(1))) # fail! + except TypeError: + pass def custom_serializer(a): return a.x @@ -125,7 +135,10 @@ There are at least 3 ways to define your custom serialization process: # You can deregister the serializer at any time. ray.util.deregister_serializer(A) - ray.get(ray.put(A(1))) # fail! + try: + ray.get(ray.put(A(1))) # fail! + except TypeError: + pass # Nothing happens when deregister an unavailable serializer. ray.util.deregister_serializer(A) @@ -141,7 +154,7 @@ There are at least 3 ways to define your custom serialization process: 3. We also provide you an example, if you want to customize the serialization of a specific object: - .. code-block:: python + .. testcode:: import threading @@ -150,7 +163,10 @@ There are at least 3 ways to define your custom serialization process: self.x = x self.lock = threading.Lock() # could not serialize! - ray.get(ray.put(A(1))) # fail! + try: + ray.get(ray.put(A(1))) # fail! + except TypeError: + pass class SerializationHelperForA: """A helper class for serialization.""" @@ -163,7 +179,10 @@ There are at least 3 ways to define your custom serialization process: ray.get(ray.put(SerializationHelperForA(A(1)))) # success! # the serializer only works for a specific object, not all A # instances, so we still expect failure here. - ray.get(ray.put(A(1))) # still fail! + try: + ray.get(ray.put(A(1))) # still fail! + except TypeError: + pass Troubleshooting @@ -173,7 +192,7 @@ Use ``ray.util.inspect_serializability`` to identify tricky pickling issues. 
Thi Below, we demonstrate this behavior on a function with a non-serializable object (threading lock): -.. code-block:: python +.. testcode:: from ray.util import inspect_serializability import threading @@ -187,24 +206,26 @@ Below, we demonstrate this behavior on a function with a non-serializable object The resulting output is: - -.. code-block:: bash +.. testoutput:: + :options: +SKIP ============================================================= - Checking Serializability of + Checking Serializability of ============================================================= - !!! FAIL serialization: can't pickle _thread.lock objects + !!! FAIL serialization: cannot pickle '_thread.lock' object Detected 1 global variables. Checking serializability... - Serializing 'lock' ... - !!! FAIL serialization: can't pickle _thread.lock objects - WARNING: Did not find non-serializable object in . This may be an oversight. + Serializing 'lock' ... + !!! FAIL serialization: cannot pickle '_thread.lock' object + WARNING: Did not find non-serializable object in . This may be an oversight. ============================================================= Variable: - lock [obj=, parent=] + FailTuple(lock [obj=, parent=]) was found to be non-serializable. There may be multiple other undetected variables that were non-serializable. Consider either removing the instantiation/imports of these variables or moving the instantiation into the scope of the function/class. + ============================================================= + Check https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting for more information. 
If you have any suggestions on how to improve this error message, please reach out to the Ray developers on github.com/ray-project/ray/issues/ ============================================================= diff --git a/doc/source/ray-core/patterns/pipelining.rst b/doc/source/ray-core/patterns/pipelining.rst index cf9e63bdb2ce..e11e26be2455 100644 --- a/doc/source/ray-core/patterns/pipelining.rst +++ b/doc/source/ray-core/patterns/pipelining.rst @@ -7,7 +7,7 @@ you can use the `pipelining .. note:: Pipelining is an important technique to improve the performance and is heavily used by Ray libraries. - See :ref:`Ray Dataset pipelines ` as an example. + See :ref:`DatasetPipelines ` as an example. .. figure:: ../images/pipelining.svg diff --git a/doc/source/ray-core/ray-dag.rst b/doc/source/ray-core/ray-dag.rst index c1645f0b32f0..80fe6c4fdc63 100644 --- a/doc/source/ray-core/ray-dag.rst +++ b/doc/source/ray-core/ray-dag.rst @@ -40,12 +40,14 @@ functions to form more complex DAGs. Any IR node can be executed directly ``dag_node.execute()`` that acts as root of the DAG, where all other non-reachable nodes from the root will be igored. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ./doc_code/ray-dag.py - :language: python - :start-after: __dag_tasks_begin__ - :end-before: __dag_tasks_end__ + .. tab-item:: Python + + .. literalinclude:: ./doc_code/ray-dag.py + :language: python + :start-after: __dag_tasks_begin__ + :end-before: __dag_tasks_end__ Ray DAG with classes and class methods @@ -59,13 +61,15 @@ function calls specific to the parent actor instance. DAG IR nodes generated from a function, class or classmethod can be combined together to form a DAG. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python + + .. literalinclude:: ./doc_code/ray-dag.py + :language: python + :start-after: __dag_actors_begin__ + :end-before: __dag_actors_end__ - .. 
literalinclude:: ./doc_code/ray-dag.py - :language: python - :start-after: __dag_actors_begin__ - :end-before: __dag_actors_end__ - Ray DAG with custom InputNode @@ -75,12 +79,14 @@ Ray DAG with custom InputNode runtime. It should be used within a context manager with no args, and called as args of ``dag_node.execute()`` -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ./doc_code/ray-dag.py - :language: python - :start-after: __dag_input_node_begin__ - :end-before: __dag_input_node_end__ + .. literalinclude:: ./doc_code/ray-dag.py + :language: python + :start-after: __dag_input_node_begin__ + :end-before: __dag_input_node_end__ More Resources -------------- diff --git a/doc/source/ray-core/ray-dashboard.rst b/doc/source/ray-core/ray-dashboard.rst index f24a050e693b..ef9e0553a17b 100644 --- a/doc/source/ray-core/ray-dashboard.rst +++ b/doc/source/ray-core/ray-dashboard.rst @@ -3,7 +3,7 @@ Ray Dashboard ============= Ray provides a web-based dashboard for monitoring and debugging Ray applications. -The dashboard provides a visual representation of the system state, allowing users to track the performance +The dashboard provides a visual representation of the system state, allowing users to track the performance of their applications and troubleshoot issues. .. raw:: html @@ -36,20 +36,32 @@ To use the dashboard, you should use the `ray[default]` installation: You can access the dashboard through a URL printed when Ray is initialized (the default URL is **http://localhost:8265**) or via the context object returned from `ray.init`. -.. code-block:: python +.. testcode:: + :hide: + + import ray + ray.shutdown() + +.. testcode:: + + import ray context = ray.init() print(context.dashboard_url) +.. testoutput:: + + 127.0.0.1:8265 + .. code-block:: text INFO worker.py:1487 -- Connected to Ray cluster. View the dashboard at 127.0.0.1:8265. Ray cluster comes with the dashboard. See :ref:`Cluster Monitoring ` for more details. -.. note:: +.. 
note:: - When using the Ray dashboard, it is highly recommended to also set up Prometheus and Grafana. + When using the Ray dashboard, it is highly recommended to also set up Prometheus and Grafana. They are necessary for critical features such as :ref:`Metrics View `. See :ref:`Ray Metrics ` to learn how to set up Prometheus and Grafana. @@ -68,7 +80,7 @@ View the application logs and errors If the Ray job is submitted by :ref:`Ray job API `, the job logs are available from the dashboard. The log file follows the following format; ``job-driver-.log``. -.. note:: +.. note:: If the driver is executed directly on the head node of the Ray cluster (without the job API) or run via :ref:`Ray client `, the driver logs are not accessible from the dashboard. In this case, see the terminal output to view the driver logs. @@ -81,7 +93,7 @@ If the Ray job is submitted by :ref:`Ray job API `, the job log :align: center Task and actor logs are accessible from the :ref:`task and actor table view `. Click the log button. -You can see the worker logs (``worker-[worker_id]-[job_id]-[pid].[out|err]``) that execute the task and actor. ``.out`` (stdout) and ``.err`` (stderr) logs contain the logs emitted from the tasks and actors. +You can see the worker logs (``worker-[worker_id]-[job_id]-[pid].[out|err]``) that execute the task and actor. ``.out`` (stdout) and ``.err`` (stderr) logs contain the logs emitted from the tasks and actors. The core worker logs (``python-core-worker-[worker_id]_[pid].log``) contain the system-level logs for the corresponding worker. **Task and Actor Errors** @@ -89,7 +101,7 @@ The core worker logs (``python-core-worker-[worker_id]_[pid].log``) contain the .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/failed_task_progress-bar.png :align: center -You can easily identify failed tasks or actors by looking at the job progress bar, which links to the table. 
+You can easily identify failed tasks or actors by looking at the job progress bar, which links to the table. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/task_error_button.png :align: center @@ -104,8 +116,8 @@ The table displays the name of the failed tasks or actors and provides access to Analyze the CPU and memory usage of tasks and actors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The :ref:`Metrics View ` in the Ray dashboard provides a "per-component CPU/memory usage graph" that displays CPU and memory usage over time for each task and actor in the application (as well as system components). -This allows users to identify tasks and actors that may be consuming more resources than expected and optimize the performance of the application. +The :ref:`Metrics View ` in the Ray dashboard provides a "per-component CPU/memory usage graph" that displays CPU and memory usage over time for each task and actor in the application (as well as system components). +This allows users to identify tasks and actors that may be consuming more resources than expected and optimize the performance of the application. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/node_cpu_by_comp.png :align: center @@ -128,7 +140,7 @@ Additionally, users can see a snapshot of hardware utilization from the :ref:`cl View the Resource Utilization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Ray requires users to specify the number of :ref:`resources ` their tasks and actors will use through arguments such as ``num_cpus``, ``num_gpus``, ``memory``, and ``resource``. +Ray requires users to specify the number of :ref:`resources ` their tasks and actors will use through arguments such as ``num_cpus``, ``num_gpus``, ``memory``, and ``resource``. These values are used for scheduling, but may not always match the actual resource utilization (physical resource utilization). 
- You can see the logical and physical resource utilization over time from the :ref:`Metrics View `. @@ -191,7 +203,7 @@ A job is a ray workload that uses Ray APIs (e.g., ``ray.init``). It can be submi .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/job_list.png :align: center -The job page displays a list of active, finished, and failed jobs, and clicking on an ID allows users to view detailed information about that job. +The job page displays a list of active, finished, and failed jobs, and clicking on an ID allows users to view detailed information about that job. For more information on Ray jobs, see the Ray Job Overview section. Job Profiling @@ -210,7 +222,7 @@ Advanced Task and Actor Breakdown .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/advanced-progress.png :align: left -The job page allows you to see tasks and actors broken down by their states. +The job page allows you to see tasks and actors broken down by their states. Tasks and actors are grouped and nested by default. You can see the nested entries by clicking the expand button. Tasks and actors are grouped and nested by the following criteria. @@ -222,7 +234,7 @@ Tasks and actors are grouped and nested by the following criteria. - Child actors (actors created within an actor) are nested under their parent actor's row. - Actor tasks (remote methods within an actor) are nested under the actor for the corresponding actor method. -.. note:: +.. note:: Ray dashboard can only display or retrieve up to 10K tasks at a time. If there are more than 10K tasks from your job, they are unaccounted. The number of unaccounted tasks is available from the task breakdown. @@ -230,7 +242,7 @@ Tasks and actors are grouped and nested by the following criteria. Task Timeline ~~~~~~~~~~~~~ -The :ref:`timeline API ` is available from the dashboard. 
+The :ref:`timeline API ` is available from the dashboard. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/profile-button.png :align: center @@ -245,7 +257,7 @@ Second, you can use tools like ``chrome://tracing`` or the `Perfetto UI `. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/task-table.png @@ -383,12 +395,12 @@ Actor Detail Page .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/actor-list-id.png :align: center -By clicking the ID, you can also see the detail view of the actor. +By clicking the ID, you can also see the detail view of the actor. .. image:: https://raw.githubusercontent.com/ray-project/Images/master/docs/new-dashboard-v2/dashboard-pics/actor-detail.png :align: center -From the actor detail page, you can see the metadata, state, and the all tasks that have run from this actor. +From the actor detail page, you can see the metadata, state, and the all tasks that have run from this actor. .. _dash-metrics-view: @@ -407,7 +419,7 @@ Ray exports default metrics which are available from the :ref:`Metrics View ` for available metrics. -.. note:: +.. note:: The metrics view required the Prometheus and Grafana setup. See :ref:`Ray Metrics ` to learn how to set up Prometheus and Grafana. @@ -440,26 +452,28 @@ Advanced Usage Changing Dashboard Ports ~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: Single-node local cluster +.. tab-set:: - **CLI** + .. tab-item:: Single-node local cluster - To customize the port on which the dashboard runs, you can pass - the ``--dashboard-port`` argument with ``ray start`` in the command line. + **CLI** - **ray.init** + To customize the port on which the dashboard runs, you can pass + the ``--dashboard-port`` argument with ``ray start`` in the command line. 
- If you need to customize the port on which the dashboard will run, you can pass the - keyword argument ``dashboard_port`` in your call to ``ray.init()``. + **ray.init** -.. tabbed:: VM Cluster Launcher + If you need to customize the port on which the dashboard will run, you can pass the + keyword argument ``dashboard_port`` in your call to ``ray.init()``. - To disable the dashboard while using the "VM cluster launcher", include the "ray start --head --include-dashboard=False" argument - and specify the desired port number in the "head_start_ray_commands" section of the `cluster launcher's YAML file `_. + .. tab-item:: VM Cluster Launcher -.. tabbed:: Kuberay + To disable the dashboard while using the "VM cluster launcher", include the "ray start --head --include-dashboard=False" argument + and specify the desired port number in the "head_start_ray_commands" section of the `cluster launcher's YAML file `_. - See the `Specifying non-default ports `_ page. + .. tab-item:: Kuberay + + See the `Specifying non-default ports `_ page. Viewing Built-in Dashboard API Metrics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -521,28 +535,35 @@ Dashboard is included in the `ray[default]` installation by default and automati To disable the dashboard, use the following arguments `--include-dashboard`. -.. tabbed:: Single-node local cluster +.. tab-set:: + + .. tab-item:: Single-node local cluster + + **CLI** + + .. code-block:: bash - **CLI** + ray start --include-dashboard=False - .. code-block:: bash + **ray.init** - ray start --include-dashboard=False + .. testcode:: + :hide: - **ray.init** + ray.shutdown() - .. code-block:: python + .. testcode:: - ray.init(include_dashboard=False) + ray.init(include_dashboard=False) -.. tabbed:: VM Cluster Launcher + .. 
tab-item:: VM Cluster Launcher - To disable the dashboard while using the "VM cluster launcher", include the "ray start --head --include-dashboard=False" argument - in the "head_start_ray_commands" section of the `cluster launcher's YAML file `_. + To disable the dashboard while using the "VM cluster launcher", include the "ray start --head --include-dashboard=False" argument + in the "head_start_ray_commands" section of the `cluster launcher's YAML file `_. -.. tabbed:: Kuberay + .. tab-item:: Kuberay - TODO + TODO .. _dash-reference: diff --git a/doc/source/ray-core/scheduling/placement-group.rst b/doc/source/ray-core/scheduling/placement-group.rst index ab18bd297f96..bc74029d3da8 100644 --- a/doc/source/ray-core/scheduling/placement-group.rst +++ b/doc/source/ray-core/scheduling/placement-group.rst @@ -49,89 +49,93 @@ Bundles are specified by a list of dictionaries, e.g., ``[{"CPU": 1}, {"CPU": 1, Placement group scheduling is asynchronous. The `ray.util.placement_group` returns immediately. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __create_pg_start__ - :end-before: __create_pg_end__ + .. tab-item:: Python + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __create_pg_start__ + :end-before: __create_pg_end__ -.. tabbed:: Java - .. code-block:: java + .. tab-item:: Java - // Initialize Ray. - Ray.init(); + .. code-block:: java - // Construct a list of bundles. - Map bundle = ImmutableMap.of("CPU", 1.0); - List> bundles = ImmutableList.of(bundle); + // Initialize Ray. + Ray.init(); - // Make a creation option with bundles and strategy. - PlacementGroupCreationOptions options = - new PlacementGroupCreationOptions.Builder() - .setBundles(bundles) - .setStrategy(PlacementStrategy.STRICT_SPREAD) - .build(); + // Construct a list of bundles. 
+ Map bundle = ImmutableMap.of("CPU", 1.0); + List> bundles = ImmutableList.of(bundle); - PlacementGroup pg = PlacementGroups.createPlacementGroup(options); + // Make a creation option with bundles and strategy. + PlacementGroupCreationOptions options = + new PlacementGroupCreationOptions.Builder() + .setBundles(bundles) + .setStrategy(PlacementStrategy.STRICT_SPREAD) + .build(); -.. tabbed:: C++ + PlacementGroup pg = PlacementGroups.createPlacementGroup(options); - .. code-block:: c++ + .. tab-item:: C++ - // Initialize Ray. - ray::Init(); + .. code-block:: c++ - // Construct a list of bundles. - std::vector> bundles{{{"CPU", 1.0}}}; + // Initialize Ray. + ray::Init(); - // Make a creation option with bundles and strategy. - ray::internal::PlacementGroupCreationOptions options{ - false, "my_pg", bundles, ray::internal::PlacementStrategy::PACK}; + // Construct a list of bundles. + std::vector> bundles{{{"CPU", 1.0}}}; - ray::PlacementGroup pg = ray::CreatePlacementGroup(options); + // Make a creation option with bundles and strategy. + ray::internal::PlacementGroupCreationOptions options{ + false, "my_pg", bundles, ray::internal::PlacementStrategy::PACK}; + + ray::PlacementGroup pg = ray::CreatePlacementGroup(options); You can block your program until the placement group is ready using one of two APIs: * :func:`ready `, which is compatible with ``ray.get`` * :func:`wait `, which blocks the program until the placement group is ready) -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __ready_pg_start__ - :end-before: __ready_pg_end__ + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __ready_pg_start__ + :end-before: __ready_pg_end__ -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - // Wait for the placement group to be ready within the specified time(unit is seconds). 
- boolean ready = pg.wait(60); - Assert.assertTrue(ready); + // Wait for the placement group to be ready within the specified time(unit is seconds). + boolean ready = pg.wait(60); + Assert.assertTrue(ready); - // You can look at placement group states using this API. - List allPlacementGroup = PlacementGroups.getAllPlacementGroups(); - for (PlacementGroup group: allPlacementGroup) { - System.out.println(group); - } + // You can look at placement group states using this API. + List allPlacementGroup = PlacementGroups.getAllPlacementGroups(); + for (PlacementGroup group: allPlacementGroup) { + System.out.println(group); + } -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - // Wait for the placement group to be ready within the specified time(unit is seconds). - bool ready = pg.Wait(60); - assert(ready); + // Wait for the placement group to be ready within the specified time(unit is seconds). + bool ready = pg.Wait(60); + assert(ready); - // You can look at placement group states using this API. - std::vector all_placement_group = ray::GetAllPlacementGroups(); - for (const ray::PlacementGroup &group : all_placement_group) { - std::cout << group.GetName() << std::endl; - } + // You can look at placement group states using this API. + std::vector all_placement_group = ray::GetAllPlacementGroups(); + for (const ray::PlacementGroup &group : all_placement_group) { + std::cout << group.GetName() << std::endl; + } Let's verify the placement group is successfully created. @@ -163,12 +167,14 @@ Placement groups are atomically created; if a bundle cannot fit in any of the cu the entire placement group is not ready and no resources are reserved. To illustrate, let's create another placement group that requires ``{"CPU":1}, {"GPU": 2}`` (2 bundles). -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. 
literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __create_pg_failed_start__ - :end-before: __create_pg_failed_end__ + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __create_pg_failed_start__ + :end-before: __create_pg_failed_end__ You can verify the new placement group is pending creation. @@ -234,68 +240,70 @@ Now let's schedule an actor to the placement group. You can schedule actors or tasks to a placement group using :class:`options(scheduling_strategy=PlacementGroupSchedulingStrategy(...)) `. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __schedule_pg_start__ - :end-before: __schedule_pg_end__ + .. tab-item:: Python -.. tabbed:: Java + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __schedule_pg_start__ + :end-before: __schedule_pg_end__ - .. code-block:: java + .. tab-item:: Java - public static class Counter { - private int value; + .. code-block:: java - public Counter(int initValue) { - this.value = initValue; - } + public static class Counter { + private int value; - public int getValue() { - return value; - } + public Counter(int initValue) { + this.value = initValue; + } - public static String ping() { - return "pong"; - } - } + public int getValue() { + return value; + } - // Create GPU actors on a gpu bundle. - for (int index = 0; index < 1; index++) { - Ray.actor(Counter::new, 1) - .setPlacementGroup(pg, 0) - .remote(); - } + public static String ping() { + return "pong"; + } + } -.. tabbed:: C++ + // Create GPU actors on a gpu bundle. + for (int index = 0; index < 1; index++) { + Ray.actor(Counter::new, 1) + .setPlacementGroup(pg, 0) + .remote(); + } - .. code-block:: c++ + .. 
tab-item:: C++ - class Counter { - public: - Counter(int init_value) : value(init_value){} - int GetValue() {return value;} - std::string Ping() { - return "pong"; - } - private: - int value; - }; + .. code-block:: c++ - // Factory function of Counter class. - static Counter *CreateCounter() { - return new Counter(); - }; + class Counter { + public: + Counter(int init_value) : value(init_value){} + int GetValue() {return value;} + std::string Ping() { + return "pong"; + } + private: + int value; + }; - RAY_REMOTE(&Counter::Ping, &Counter::GetValue, CreateCounter); + // Factory function of Counter class. + static Counter *CreateCounter() { + return new Counter(); + }; - // Create GPU actors on a gpu bundle. - for (int index = 0; index < 1; index++) { - ray::Actor(CreateCounter) - .SetPlacementGroup(pg, 0) - .Remote(1); - } + RAY_REMOTE(&Counter::Ping, &Counter::GetValue, CreateCounter); + + // Create GPU actors on a gpu bundle. + for (int index = 0; index < 1; index++) { + ray::Actor(CreateCounter) + .SetPlacementGroup(pg, 0) + .Remote(1); + } .. note:: @@ -364,12 +372,14 @@ For example, a placement group of 2 bundles ``[{"CPU": 1}, {"GPU": 1}]`` has ind and index 1 bundle ``{"GPU": 1}``. Since we only have 1 bundle, we only have index 0. If you don't specify a bundle, the actor (or task) is scheduled on a random bundle that has unallocated reserved resources. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __schedule_pg_3_start__ - :end-before: __schedule_pg_3_end__ + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __schedule_pg_3_start__ + :end-before: __schedule_pg_3_end__ We succeed to schedule the GPU actor! The below image describes 2 actors scheduled into the placement group. 
@@ -460,30 +470,32 @@ group using the :func:`remove_placement_group ` When you remove the placement group, actors or tasks that still use the reserved resources are forcefully killed. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __remove_pg_start__ - :end-before: __remove_pg_end__ + .. tab-item:: Python -.. tabbed:: Java + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __remove_pg_start__ + :end-before: __remove_pg_end__ - .. code-block:: java + .. tab-item:: Java - PlacementGroups.removePlacementGroup(placementGroup.getId()); + .. code-block:: java - PlacementGroup removedPlacementGroup = PlacementGroups.getPlacementGroup(placementGroup.getId()); - Assert.assertEquals(removedPlacementGroup.getState(), PlacementGroupState.REMOVED); + PlacementGroups.removePlacementGroup(placementGroup.getId()); -.. tabbed:: C++ + PlacementGroup removedPlacementGroup = PlacementGroups.getPlacementGroup(placementGroup.getId()); + Assert.assertEquals(removedPlacementGroup.getState(), PlacementGroupState.REMOVED); - .. code-block:: c++ + .. tab-item:: C++ - ray::RemovePlacementGroup(placement_group.GetID()); + .. code-block:: c++ - ray::PlacementGroup removed_placement_group = ray::GetPlacementGroup(placement_group.GetID()); - assert(removed_placement_group.GetState(), ray::PlacementGroupState::REMOVED); + ray::RemovePlacementGroup(placement_group.GetID()); + + ray::PlacementGroup removed_placement_group = ray::GetPlacementGroup(placement_group.GetID()); + assert(removed_placement_group.GetState(), ray::PlacementGroupState::REMOVED); .. _ray-placement-group-observability-ref: @@ -496,39 +508,41 @@ Ray provides several useful tools to inspect the placement group states and reso - **Ray Dashboard** is a UI tool for inspecting placement group states. - **Ray State API** is a CLI for inspecting placement group states. -.. tabbed:: ray status (CLI) +.. 
tab-set:: + + .. tab-item:: ray status (CLI) - The CLI command ``ray status`` provides the autoscaling status of the cluster. - It provides the "resource demands" from unscheduled placement groups as well as the resource reservation status. + The CLI command ``ray status`` provides the autoscaling status of the cluster. + It provides the "resource demands" from unscheduled placement groups as well as the resource reservation status. - .. code-block:: bash + .. code-block:: bash - Resources - --------------------------------------------------------------- - Usage: - 1.0/2.0 CPU (1.0 used of 1.0 reserved in placement groups) - 0.0/2.0 GPU (0.0 used of 1.0 reserved in placement groups) - 0B/4.29GiB memory - 0B/2.00GiB object_store_memory + Resources + --------------------------------------------------------------- + Usage: + 1.0/2.0 CPU (1.0 used of 1.0 reserved in placement groups) + 0.0/2.0 GPU (0.0 used of 1.0 reserved in placement groups) + 0B/4.29GiB memory + 0B/2.00GiB object_store_memory -.. tabbed:: Dashboard + .. tab-item:: Dashboard - The :ref:`dashboard job view ` provides the placement group table that displays the scheduling state and metadata of the placement group. + The :ref:`dashboard job view ` provides the placement group table that displays the scheduling state and metadata of the placement group. - .. note:: + .. note:: - Ray dashboard is only available when you install Ray is with ``pip install "ray[default]"``. + Ray dashboard is only available when you install Ray with ``pip install "ray[default]"``. -.. tabbed:: Ray State API + .. tab-item:: Ray State API - :ref:`Ray state API ` is a CLI tool for inspecting the state of Ray resources (tasks, actors, placement groups, etc.). + :ref:`Ray state API ` is a CLI tool for inspecting the state of Ray resources (tasks, actors, placement groups, etc.). - ``ray list placement-groups`` provides the metadata and the scheduling state of the placement group. 
- ``ray list placement-groups --detail`` provides statistics and scheduling state in a greater detail. + ``ray list placement-groups`` provides the metadata and the scheduling state of the placement group. + ``ray list placement-groups --detail`` provides statistics and scheduling state in a greater detail. - .. note:: + .. note:: - State API is only available when you install Ray is with ``pip install "ray[default]"`` + State API is only available when you install Ray with ``pip install "ray[default]"`` Inspect Placement Group Scheduling State ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -548,16 +562,18 @@ By default, child actors and tasks don't share the same placement group that the To automatically schedule child actors or tasks to the same placement group, set ``placement_group_capture_child_tasks`` to True. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/placement_group_capture_child_tasks_example.py - :language: python - :start-after: __child_capture_pg_start__ - :end-before: __child_capture_pg_end__ + .. literalinclude:: ../doc_code/placement_group_capture_child_tasks_example.py + :language: python + :start-after: __child_capture_pg_start__ + :end-before: __child_capture_pg_end__ -.. tabbed:: Java + .. tab-item:: Java - It's not implemented for Java APIs yet. + It's not implemented for Java APIs yet. When ``placement_group_capture_child_tasks`` is True, but you don't want to schedule child tasks and actors to the same placement group, specify ``PlacementGroupSchedulingStrategy(placement_group=None)``. @@ -577,74 +593,76 @@ the actor or task that needs it, or if you are trying to access a placement group launched by another driver. Note that the placement group is still destroyed if its lifetime isn't `detached`. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __get_pg_start__ - :end-before: __get_pg_end__ + .. 
tab-item:: Python -.. tabbed:: Java + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __get_pg_start__ + :end-before: __get_pg_end__ - .. code-block:: java + .. tab-item:: Java - // Create a placement group with a unique name. - Map bundle = ImmutableMap.of("CPU", 1.0); - List> bundles = ImmutableList.of(bundle); + .. code-block:: java - PlacementGroupCreationOptions options = - new PlacementGroupCreationOptions.Builder() - .setBundles(bundles) - .setStrategy(PlacementStrategy.STRICT_SPREAD) - .setName("global_name") - .build(); + // Create a placement group with a unique name. + Map bundle = ImmutableMap.of("CPU", 1.0); + List> bundles = ImmutableList.of(bundle); - PlacementGroup pg = PlacementGroups.createPlacementGroup(options); - pg.wait(60); + PlacementGroupCreationOptions options = + new PlacementGroupCreationOptions.Builder() + .setBundles(bundles) + .setStrategy(PlacementStrategy.STRICT_SPREAD) + .setName("global_name") + .build(); - ... + PlacementGroup pg = PlacementGroups.createPlacementGroup(options); + pg.wait(60); - // Retrieve the placement group later somewhere. - PlacementGroup group = PlacementGroups.getPlacementGroup("global_name"); - Assert.assertNotNull(group); + ... -.. tabbed:: C++ + // Retrieve the placement group later somewhere. + PlacementGroup group = PlacementGroups.getPlacementGroup("global_name"); + Assert.assertNotNull(group); - .. code-block:: c++ + .. tab-item:: C++ - // Create a placement group with a globally unique name. - std::vector> bundles{{{"CPU", 1.0}}}; + .. code-block:: c++ - ray::PlacementGroupCreationOptions options{ - true/*global*/, "global_name", bundles, ray::PlacementStrategy::STRICT_SPREAD}; + // Create a placement group with a globally unique name. 
+ std::vector> bundles{{{"CPU", 1.0}}}; - ray::PlacementGroup pg = ray::CreatePlacementGroup(options); - pg.Wait(60); + ray::PlacementGroupCreationOptions options{ + true/*global*/, "global_name", bundles, ray::PlacementStrategy::STRICT_SPREAD}; - ... + ray::PlacementGroup pg = ray::CreatePlacementGroup(options); + pg.Wait(60); - // Retrieve the placement group later somewhere. - ray::PlacementGroup group = ray::GetGlobalPlacementGroup("global_name"); - assert(!group.Empty()); + ... - We also support non-global named placement group in C++, which means that the placement group name is only valid within the job and cannot be accessed from another job. + // Retrieve the placement group later somewhere. + ray::PlacementGroup group = ray::GetGlobalPlacementGroup("global_name"); + assert(!group.Empty()); - .. code-block:: c++ + We also support non-global named placement group in C++, which means that the placement group name is only valid within the job and cannot be accessed from another job. - // Create a placement group with a job-scope-unique name. - std::vector> bundles{{{"CPU", 1.0}}}; + .. code-block:: c++ - ray::PlacementGroupCreationOptions options{ - false/*non-global*/, "non_global_name", bundles, ray::PlacementStrategy::STRICT_SPREAD}; + // Create a placement group with a job-scope-unique name. + std::vector> bundles{{{"CPU", 1.0}}}; - ray::PlacementGroup pg = ray::CreatePlacementGroup(options); - pg.Wait(60); + ray::PlacementGroupCreationOptions options{ + false/*non-global*/, "non_global_name", bundles, ray::PlacementStrategy::STRICT_SPREAD}; - ... + ray::PlacementGroup pg = ray::CreatePlacementGroup(options); + pg.Wait(60); - // Retrieve the placement group later somewhere in the same job. - ray::PlacementGroup group = ray::GetPlacementGroup("non_global_name"); - assert(!group.Empty()); + ... + + // Retrieve the placement group later somewhere in the same job. 
+ ray::PlacementGroup group = ray::GetPlacementGroup("non_global_name"); + assert(!group.Empty()); .. _placement-group-detached: @@ -659,16 +677,18 @@ By default, the lifetimes of placement groups belong to the driver and actor. To keep the placement group alive regardless of its job or detached actor, specify `lifetime="detached"`. For example: -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/placement_group_example.py - :language: python - :start-after: __detached_pg_start__ - :end-before: __detached_pg_end__ + .. literalinclude:: ../doc_code/placement_group_example.py + :language: python + :start-after: __detached_pg_start__ + :end-before: __detached_pg_end__ -.. tabbed:: Java + .. tab-item:: Java - The lifetime argument is not implemented for Java APIs yet. + The lifetime argument is not implemented for Java APIs yet. Let's terminate the current script and start a new Python script. Call ``ray list placement-groups``, and you can see the placement group is not removed. diff --git a/doc/source/ray-core/scheduling/resources.rst b/doc/source/ray-core/scheduling/resources.rst index e2455cc27032..8b4ffdfbc5f7 100644 --- a/doc/source/ray-core/scheduling/resources.rst +++ b/doc/source/ray-core/scheduling/resources.rst @@ -76,49 +76,51 @@ By default, logical resources are configured by the following rule. However, you can always override that by manually specifying the quantities of pre-defined resources and adding custom resources. There are several ways to do that depending on how you start the Ray cluster: -.. tabbed:: ray.init() +.. tab-set:: - If you are using :func:`ray.init() ` to start a single node Ray cluster, you can do the following to manually specify node resources: + .. tab-item:: ray.init() - .. 
literalinclude:: ../doc_code/resources.py - :language: python - :start-after: __specifying_node_resources_start__ - :end-before: __specifying_node_resources_end__ + If you are using :func:`ray.init() ` to start a single node Ray cluster, you can do the following to manually specify node resources: -.. tabbed:: ray start + .. literalinclude:: ../doc_code/resources.py + :language: python + :start-after: __specifying_node_resources_start__ + :end-before: __specifying_node_resources_end__ - If you are using :ref:`ray start ` to start a Ray node, you can run: + .. tab-item:: ray start - .. code-block:: shell + If you are using :ref:`ray start ` to start a Ray node, you can run: - ray start --head --num-cpus=3 --num-gpus=4 --resources='{"special_hardware": 1, "custom_label": 1}' + .. code-block:: shell -.. tabbed:: ray up + ray start --head --num-cpus=3 --num-gpus=4 --resources='{"special_hardware": 1, "custom_label": 1}' - If you are using :ref:`ray up ` to start a Ray cluster, you can set the :ref:`resources field ` in the yaml file: + .. tab-item:: ray up - .. code-block:: yaml + If you are using :ref:`ray up ` to start a Ray cluster, you can set the :ref:`resources field ` in the yaml file: - available_node_types: - head: - ... - resources: - CPU: 3 - GPU: 4 - special_hardware: 1 - custom_label: 1 + .. code-block:: yaml -.. tabbed:: KubeRay + available_node_types: + head: + ... + resources: + CPU: 3 + GPU: 4 + special_hardware: 1 + custom_label: 1 - If you are using :ref:`KubeRay ` to start a Ray cluster, you can set the :ref:`rayStartParams field ` in the yaml file: + .. tab-item:: KubeRay - .. code-block:: yaml + If you are using :ref:`KubeRay ` to start a Ray cluster, you can set the :ref:`rayStartParams field ` in the yaml file: - headGroupSpec: - rayStartParams: - num-cpus: "3" - num-gpus: "4" - resources: '"{\"special_hardware\": 1, \"custom_label\": 1}"' + .. 
code-block:: yaml + + headGroupSpec: + rayStartParams: + num-cpus: "3" + num-gpus: "4" + resources: '"{\"special_hardware\": 1, \"custom_label\": 1}"' .. _resource-requirements: @@ -139,30 +141,32 @@ If resources are specified explicitly, they are required for both scheduling and You can also explicitly specify a task's or actor's resource requirements (for example, one task may require a GPU) instead of using default ones via :func:`ray.remote() ` and :meth:`task.options() `/:meth:`actor.options() `. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: ../doc_code/resources.py - :language: python - :start-after: __specifying_resource_requirements_start__ - :end-before: __specifying_resource_requirements_end__ + .. literalinclude:: ../doc_code/resources.py + :language: python + :start-after: __specifying_resource_requirements_start__ + :end-before: __specifying_resource_requirements_end__ -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - // Specify required resources. - Ray.task(MyRayApp::myFunction).setResource("CPU", 1.0).setResource("GPU", 1.0).setResource("special_hardware", 1.0).remote(); + // Specify required resources. + Ray.task(MyRayApp::myFunction).setResource("CPU", 1.0).setResource("GPU", 1.0).setResource("special_hardware", 1.0).remote(); - Ray.actor(Counter::new).setResource("CPU", 2.0).setResource("GPU", 1.0).remote(); + Ray.actor(Counter::new).setResource("CPU", 2.0).setResource("GPU", 1.0).remote(); -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - // Specify required resources. - ray::Task(MyFunction).SetResource("CPU", 1.0).SetResource("GPU", 1.0).SetResource("special_hardware", 1.0).Remote(); + // Specify required resources. 
+ ray::Task(MyFunction).SetResource("CPU", 1.0).SetResource("GPU", 1.0).SetResource("special_hardware", 1.0).Remote(); - ray::Actor(CreateCounter).SetResource("CPU", 2.0).SetResource("GPU", 1.0).Remote(); + ray::Actor(CreateCounter).SetResource("CPU", 2.0).SetResource("GPU", 1.0).Remote(); Task and actor resource requirements have implications for the Ray's scheduling concurrency. In particular, the sum of the resource requirements of all of the diff --git a/doc/source/ray-core/starting-ray.rst b/doc/source/ray-core/starting-ray.rst index e04a8a399ffd..711049d940f5 100644 --- a/doc/source/ray-core/starting-ray.rst +++ b/doc/source/ray-core/starting-ray.rst @@ -35,114 +35,131 @@ Calling ``ray.init()`` starts a local Ray instance on your laptop/machine. This In recent versions of Ray (>=1.5), ``ray.init()`` will automatically be called on the first use of a Ray remote API. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - import ray - # Other Ray APIs will not work until `ray.init()` is called. - ray.init() + .. testcode:: + :hide: -.. tabbed:: Java + import ray + ray.shutdown() - .. code-block:: java + .. testcode:: - import io.ray.api.Ray; + import ray + # Other Ray APIs will not work until `ray.init()` is called. + ray.init() - public class MyRayApp { + .. tab-item:: Java - public static void main(String[] args) { - // Other Ray APIs will not work until `Ray.init()` is called. - Ray.init(); - ... - } - } + .. code-block:: java + + import io.ray.api.Ray; -.. tabbed:: C++ + public class MyRayApp { - .. code-block:: c++ + public static void main(String[] args) { + // Other Ray APIs will not work until `Ray.init()` is called. + Ray.init(); + ... + } + } + + .. tab-item:: C++ - #include - // Other Ray APIs will not work until `ray::Init()` is called. - ray::Init() + .. code-block:: c++ + + #include + // Other Ray APIs will not work until `ray::Init()` is called. 
+ ray::Init() When the process calling ``ray.init()`` terminates, the Ray runtime will also terminate. To explicitly stop or restart Ray, use the shutdown API. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - import ray - ray.init() - ... # ray program - ray.shutdown() + .. testcode:: + :hide: -.. tabbed:: Java + ray.shutdown() - .. code-block:: java + .. testcode:: - import io.ray.api.Ray; + import ray + ray.init() + ... # ray program + ray.shutdown() - public class MyRayApp { + .. tab-item:: Java - public static void main(String[] args) { - Ray.init(); - ... // ray program - Ray.shutdown(); - } - } + .. code-block:: java -.. tabbed:: C++ + import io.ray.api.Ray; - .. code-block:: c++ + public class MyRayApp { - #include - ray::Init() - ... // ray program - ray::Shutdown() + public static void main(String[] args) { + Ray.init(); + ... // ray program + Ray.shutdown(); + } + } + + .. tab-item:: C++ + + .. code-block:: c++ + + #include + ray::Init() + ... // ray program + ray::Shutdown() To check if Ray is initialized, use the ``is_initialized`` API. -.. tabbed:: Python +.. tab-set:: - .. code-block:: python + .. tab-item:: Python - import ray - ray.init() - assert ray.is_initialized() + .. testcode:: - ray.shutdown() - assert not ray.is_initialized() + import ray + ray.init() + assert ray.is_initialized() -.. tabbed:: Java + ray.shutdown() + assert not ray.is_initialized() - .. code-block:: java + .. tab-item:: Java - import io.ray.api.Ray; + .. code-block:: java - public class MyRayApp { + import io.ray.api.Ray; - public static void main(String[] args) { - Ray.init(); - Assert.assertTrue(Ray.isInitialized()); - Ray.shutdown(); - Assert.assertFalse(Ray.isInitialized()); + public class MyRayApp { + + public static void main(String[] args) { + Ray.init(); + Assert.assertTrue(Ray.isInitialized()); + Ray.shutdown(); + Assert.assertFalse(Ray.isInitialized()); + } } - } -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. 
code-block:: c++ - #include + #include - int main(int argc, char **argv) { - ray::Init(); - assert(ray::IsInitialized()); + int main(int argc, char **argv) { + ray::Init(); + assert(ray::IsInitialized()); - ray::Shutdown(); - assert(!ray::IsInitialized()); - } + ray::Shutdown(); + assert(!ray::IsInitialized()); + } See the `Configuration `__ documentation for the various ways to configure Ray. @@ -170,47 +187,49 @@ Use ``ray start`` from the CLI to start a 1 node ray runtime on a machine. This You can connect to this Ray instance by starting a driver process on the same node as where you ran ``ray start``. ``ray.init()`` will now automatically connect to the latest Ray instance. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. code-block:: python + .. testcode:: - import ray - ray.init() + import ray + ray.init() -.. tabbed:: java + .. tab-item:: java - .. code-block:: java + .. code-block:: java - import io.ray.api.Ray; + import io.ray.api.Ray; - public class MyRayApp { + public class MyRayApp { - public static void main(String[] args) { - Ray.init(); - ... - } - } + public static void main(String[] args) { + Ray.init(); + ... + } + } - .. code-block:: bash + .. code-block:: bash - java -classpath \ - -Dray.address=
    \ - + java -classpath \ + -Dray.address=
    \ + -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - #include + #include - int main(int argc, char **argv) { - ray::Init(); - ... - } + int main(int argc, char **argv) { + ray::Init(); + ... + } - .. code-block:: bash + .. code-block:: bash - RAY_ADDRESS=
    ./ + RAY_ADDRESS=
    ./ You can connect other nodes to the head node, creating a Ray cluster by also calling ``ray start`` on those nodes. See :ref:`on-prem` for more details. Calling ``ray.init()`` on any of the cluster machines will connect to the same Ray cluster. @@ -227,9 +246,14 @@ Your code **only** needs to execute on one machine in the cluster (usually the h To connect to the Ray cluster, call ``ray.init`` from one of the machines in the cluster. This will connect to the latest Ray cluster: -.. code-block:: python +.. testcode:: + :hide: + + ray.shutdown() + +.. testcode:: - ray.init() + ray.init() Note that the machine calling ``ray up`` will not be considered as part of the Ray cluster, and therefore calling ``ray.init`` on that same machine will not attach to the cluster. diff --git a/doc/source/ray-core/tasks.rst b/doc/source/ray-core/tasks.rst index 429f7f715445..2a5177dcee41 100644 --- a/doc/source/ray-core/tasks.rst +++ b/doc/source/ray-core/tasks.rst @@ -5,79 +5,81 @@ Tasks Ray enables arbitrary functions to be executed asynchronously on separate Python workers. Such functions are called **Ray remote functions** and their asynchronous invocations are called **Ray tasks**. Here is an example. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __tasks_start__ - :end-before: __tasks_end__ + .. tab-item:: Python - See the `ray.remote package reference `__ page for specific documentation on how to use ``ray.remote``. + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __tasks_start__ + :end-before: __tasks_end__ -.. tabbed:: Java + See the `ray.remote package reference `__ page for specific documentation on how to use ``ray.remote``. - .. code-block:: java + .. tab-item:: Java - public class MyRayApp { - // A regular Java static method. - public static int myFunction() { - return 1; - } - } + .. code-block:: java - // Invoke the above method as a Ray task. 
- // This will immediately return an object ref (a future) and then create - // a task that will be executed on a worker process. - ObjectRef res = Ray.task(MyRayApp::myFunction).remote(); + public class MyRayApp { + // A regular Java static method. + public static int myFunction() { + return 1; + } + } + + // Invoke the above method as a Ray task. + // This will immediately return an object ref (a future) and then create + // a task that will be executed on a worker process. + ObjectRef res = Ray.task(MyRayApp::myFunction).remote(); - // The result can be retrieved with ``ObjectRef::get``. - Assert.assertTrue(res.get() == 1); + // The result can be retrieved with ``ObjectRef::get``. + Assert.assertTrue(res.get() == 1); - public class MyRayApp { - public static int slowFunction() throws InterruptedException { - TimeUnit.SECONDS.sleep(10); - return 1; - } - } + public class MyRayApp { + public static int slowFunction() throws InterruptedException { + TimeUnit.SECONDS.sleep(10); + return 1; + } + } - // Ray tasks are executed in parallel. - // All computation is performed in the background, driven by Ray's internal event loop. - for(int i = 0; i < 4; i++) { - // This doesn't block. - Ray.task(MyRayApp::slowFunction).remote(); - } + // Ray tasks are executed in parallel. + // All computation is performed in the background, driven by Ray's internal event loop. + for(int i = 0; i < 4; i++) { + // This doesn't block. + Ray.task(MyRayApp::slowFunction).remote(); + } -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. code-block:: c++ - // A regular C++ function. - int MyFunction() { - return 1; - } - // Register as a remote function by `RAY_REMOTE`. - RAY_REMOTE(MyFunction); + // A regular C++ function. + int MyFunction() { + return 1; + } + // Register as a remote function by `RAY_REMOTE`. + RAY_REMOTE(MyFunction); - // Invoke the above method as a Ray task. 
- // This will immediately return an object ref (a future) and then create - // a task that will be executed on a worker process. - auto res = ray::Task(MyFunction).Remote(); + // Invoke the above method as a Ray task. + // This will immediately return an object ref (a future) and then create + // a task that will be executed on a worker process. + auto res = ray::Task(MyFunction).Remote(); - // The result can be retrieved with ``ray::ObjectRef::Get``. - assert(*res.Get() == 1); + // The result can be retrieved with ``ray::ObjectRef::Get``. + assert(*res.Get() == 1); - int SlowFunction() { - std::this_thread::sleep_for(std::chrono::seconds(10)); - return 1; - } - RAY_REMOTE(SlowFunction); + int SlowFunction() { + std::this_thread::sleep_for(std::chrono::seconds(10)); + return 1; + } + RAY_REMOTE(SlowFunction); - // Ray tasks are executed in parallel. - // All computation is performed in the background, driven by Ray's internal event loop. - for(int i = 0; i < 4; i++) { - // This doesn't block. - ray::Task(SlowFunction).Remote(); - } + // Ray tasks are executed in parallel. + // All computation is performed in the background, driven by Ray's internal event loop. + for(int i = 0; i < 4; i++) { + // This doesn't block. + ray::Task(SlowFunction).Remote(); + } Specifying required resources @@ -85,26 +87,28 @@ Specifying required resources You can specify resource requirements in tasks (see :ref:`resource-requirements` for more details.) -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __resource_start__ - :end-before: __resource_end__ + .. tab-item:: Python -.. tabbed:: Java + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __resource_start__ + :end-before: __resource_end__ - .. code-block:: java + .. tab-item:: Java - // Specify required resources. - Ray.task(MyRayApp::myFunction).setResource("CPU", 4.0).setResource("GPU", 2.0).remote(); + .. code-block:: java -.. 
tabbed:: C++ + // Specify required resources. + Ray.task(MyRayApp::myFunction).setResource("CPU", 4.0).setResource("GPU", 2.0).remote(); - .. code-block:: c++ + .. tab-item:: C++ - // Specify required resources. - ray::Task(MyFunction).SetResource("CPU", 4.0).SetResource("GPU", 2.0).Remote(); + .. code-block:: c++ + + // Specify required resources. + ray::Task(MyFunction).SetResource("CPU", 4.0).SetResource("GPU", 2.0).Remote(); .. _ray-object-refs: @@ -113,45 +117,47 @@ Passing object refs to Ray tasks In addition to values, `Object refs `__ can also be passed into remote functions. When the task gets executed, inside the function body **the argument will be the underlying value**. For example, take this function: -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __pass_by_ref_start__ - :end-before: __pass_by_ref_end__ + .. tab-item:: Python -.. tabbed:: Java + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __pass_by_ref_start__ + :end-before: __pass_by_ref_end__ - .. code-block:: java + .. tab-item:: Java - public class MyRayApp { - public static int functionWithAnArgument(int value) { - return value + 1; + .. code-block:: java + + public class MyRayApp { + public static int functionWithAnArgument(int value) { + return value + 1; + } } - } - ObjectRef objRef1 = Ray.task(MyRayApp::myFunction).remote(); - Assert.assertTrue(objRef1.get() == 1); + ObjectRef objRef1 = Ray.task(MyRayApp::myFunction).remote(); + Assert.assertTrue(objRef1.get() == 1); - // You can pass an object ref as an argument to another Ray task. - ObjectRef objRef2 = Ray.task(MyRayApp::functionWithAnArgument, objRef1).remote(); - Assert.assertTrue(objRef2.get() == 2); + // You can pass an object ref as an argument to another Ray task. + ObjectRef objRef2 = Ray.task(MyRayApp::functionWithAnArgument, objRef1).remote(); + Assert.assertTrue(objRef2.get() == 2); -.. tabbed:: C++ + .. tab-item:: C++ - .. 
code-block:: c++ + .. code-block:: c++ - static int FunctionWithAnArgument(int value) { - return value + 1; - } - RAY_REMOTE(FunctionWithAnArgument); + static int FunctionWithAnArgument(int value) { + return value + 1; + } + RAY_REMOTE(FunctionWithAnArgument); - auto obj_ref1 = ray::Task(MyFunction).Remote(); - assert(*obj_ref1.Get() == 1); + auto obj_ref1 = ray::Task(MyFunction).Remote(); + assert(*obj_ref1.Get() == 1); - // You can pass an object ref as an argument to another Ray task. - auto obj_ref2 = ray::Task(FunctionWithAnArgument).Remote(obj_ref1); - assert(*obj_ref2.Get() == 2); + // You can pass an object ref as an argument to another Ray task. + auto obj_ref2 = ray::Task(FunctionWithAnArgument).Remote(obj_ref1); + assert(*obj_ref2.Get() == 2); Note the following behaviors: @@ -167,26 +173,28 @@ Calling **ray.get** on Ray task results will block until the task finished execu finished executing without blocking on all of them. This could be achieved by :func:`ray.wait() `. The function works as follows. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __wait_start__ - :end-before: __wait_end__ + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __wait_start__ + :end-before: __wait_end__ -.. tabbed:: Java + .. tab-item:: Java - .. code-block:: java + .. code-block:: java - WaitResult waitResult = Ray.wait(objectRefs, /*num_returns=*/0, /*timeoutMs=*/1000); - System.out.println(waitResult.getReady()); // List of ready objects. - System.out.println(waitResult.getUnready()); // list of unready objects. + WaitResult waitResult = Ray.wait(objectRefs, /*num_returns=*/0, /*timeoutMs=*/1000); + System.out.println(waitResult.getReady()); // List of ready objects. + System.out.println(waitResult.getUnready()); // list of unready objects. -.. tabbed:: C++ + .. tab-item:: C++ - .. code-block:: c++ + .. 
code-block:: c++ - ray::WaitResult wait_result = ray::Wait(object_refs, /*num_objects=*/0, /*timeout_ms=*/1000); + ray::WaitResult wait_result = ray::Wait(object_refs, /*num_objects=*/0, /*timeout_ms=*/1000); .. _ray-task-returns: @@ -195,21 +203,25 @@ Multiple returns By default, a Ray task only returns a single Object Ref. However, you can configure Ray tasks to return multiple Object Refs, by setting the ``num_returns`` option. -.. tabbed:: Python +.. tab-set:: - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __multiple_returns_start__ - :end-before: __multiple_returns_end__ + .. tab-item:: Python + + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __multiple_returns_start__ + :end-before: __multiple_returns_end__ For tasks that return multiple objects, Ray also supports remote generators that allow a task to return one object at a time to reduce memory usage at the worker. Ray also supports an option to set the number of return values dynamically, which can be useful when the task caller does not know how many return values to expect. See the :ref:`user guide ` for more details on use cases. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __generator_start__ - :end-before: __generator_end__ + .. literalinclude:: doc_code/tasks.py + :language: python + :start-after: __generator_start__ + :end-before: __generator_end__ Cancelling tasks @@ -217,12 +229,14 @@ Cancelling tasks Ray tasks can be canceled by calling :func:`ray.cancel() ` on the returned Object ref. -.. tabbed:: Python +.. tab-set:: + + .. tab-item:: Python - .. literalinclude:: doc_code/tasks.py - :language: python - :start-after: __cancel_start__ - :end-before: __cancel_end__ + .. 
literalinclude:: doc_code/tasks.py + :language: python + :start-after: __cancel_start__ + :end-before: __cancel_end__ Scheduling diff --git a/doc/source/ray-core/walkthrough.rst b/doc/source/ray-core/walkthrough.rst index 80b42f8ac665..e4f721a7d59e 100644 --- a/doc/source/ray-core/walkthrough.rst +++ b/doc/source/ray-core/walkthrough.rst @@ -65,34 +65,34 @@ Next Steps Ray's key primitives are simple, but can be composed together to express almost any kind of distributed computation. Learn more about Ray's :ref:`key concepts ` with the following user guides: -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-50 d-block mx-auto - - --- - :img-top: /images/tasks.png - - .. link-button:: ray-remote-functions - :type: ref - :text: Using remote functions (Tasks) - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/actors.png - - .. link-button:: ray-remote-classes - :type: ref - :text: Using remote classes (Actors) - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/objects.png - - .. link-button:: objects-in-ray - :type: ref - :text: Working with Ray Objects - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 3 + :gutter: 1 + :class-container: container pb-3 + + + .. grid-item-card:: + :img-top: /images/tasks.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ray-remote-functions + + Using remote functions (Tasks) + + .. grid-item-card:: + :img-top: /images/actors.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ray-remote-classes + + Using remote classes (Actors) + + .. grid-item-card:: + :img-top: /images/objects.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: objects-in-ray + + Working with Ray Objects .. 
include:: /_includes/core/announcement_bottom.rst diff --git a/doc/source/ray-observability/api/state/api.rst b/doc/source/ray-observability/api/state/api.rst index eda123718a72..5056422e5bd7 100644 --- a/doc/source/ray-observability/api/state/api.rst +++ b/doc/source/ray-observability/api/state/api.rst @@ -22,9 +22,9 @@ Summary APIs .. autosummary:: :toctree: doc/ - ray.experimental.state.api.summarize_actors - ray.experimental.state.api.summarize_objects - ray.experimental.state.api.summarize_tasks + ray.util.state.summarize_actors + ray.util.state.summarize_objects + ray.util.state.summarize_tasks List APIs ~~~~~~~~~~ @@ -32,14 +32,14 @@ List APIs .. autosummary:: :toctree: doc/ - ray.experimental.state.api.list_actors - ray.experimental.state.api.list_placement_groups - ray.experimental.state.api.list_nodes - ray.experimental.state.api.list_jobs - ray.experimental.state.api.list_workers - ray.experimental.state.api.list_tasks - ray.experimental.state.api.list_objects - ray.experimental.state.api.list_runtime_envs + ray.util.state.list_actors + ray.util.state.list_placement_groups + ray.util.state.list_nodes + ray.util.state.list_jobs + ray.util.state.list_workers + ray.util.state.list_tasks + ray.util.state.list_objects + ray.util.state.list_runtime_envs Get APIs ~~~~~~~~~ @@ -47,12 +47,12 @@ Get APIs .. autosummary:: :toctree: doc/ - ray.experimental.state.api.get_actor - ray.experimental.state.api.get_placement_group - ray.experimental.state.api.get_node - ray.experimental.state.api.get_worker - ray.experimental.state.api.get_task - ray.experimental.state.api.get_objects + ray.util.state.get_actor + ray.util.state.get_placement_group + ray.util.state.get_node + ray.util.state.get_worker + ray.util.state.get_task + ray.util.state.get_objects Log APIs ~~~~~~~~ @@ -60,8 +60,8 @@ Log APIs .. autosummary:: :toctree: doc/ - ray.experimental.state.api.list_logs - ray.experimental.state.api.get_log + ray.util.state.list_logs + ray.util.state.get_log .. 
_state-api-schema: @@ -72,21 +72,21 @@ State APIs Schema :toctree: doc/ :template: autosummary/class_without_autosummary.rst - ray.experimental.state.common.ActorState - ray.experimental.state.common.TaskState - ray.experimental.state.common.NodeState - ray.experimental.state.common.PlacementGroupState - ray.experimental.state.common.WorkerState - ray.experimental.state.common.ObjectState - ray.experimental.state.common.RuntimeEnvState - ray.experimental.state.common.JobState - ray.experimental.state.common.StateSummary - ray.experimental.state.common.TaskSummaries - ray.experimental.state.common.TaskSummaryPerFuncOrClassName - ray.experimental.state.common.ActorSummaries - ray.experimental.state.common.ActorSummaryPerClass - ray.experimental.state.common.ObjectSummaries - ray.experimental.state.common.ObjectSummaryPerKey + ray.util.state.common.ActorState + ray.util.state.common.TaskState + ray.util.state.common.NodeState + ray.util.state.common.PlacementGroupState + ray.util.state.common.WorkerState + ray.util.state.common.ObjectState + ray.util.state.common.RuntimeEnvState + ray.util.state.common.JobState + ray.util.state.common.StateSummary + ray.util.state.common.TaskSummaries + ray.util.state.common.TaskSummaryPerFuncOrClassName + ray.util.state.common.ActorSummaries + ray.util.state.common.ActorSummaryPerClass + ray.util.state.common.ObjectSummaries + ray.util.state.common.ObjectSummaryPerKey State APIs Exceptions --------------------- @@ -94,4 +94,4 @@ State APIs Exceptions .. 
autosummary:: :toctree: doc/ - ray.experimental.state.exception.RayStateApiException + ray.util.state.exception.RayStateApiException diff --git a/doc/source/ray-observability/api/state/cli.rst b/doc/source/ray-observability/api/state/cli.rst index 7b38592eb795..e12dfc45fdb5 100644 --- a/doc/source/ray-observability/api/state/cli.rst +++ b/doc/source/ray-observability/api/state/cli.rst @@ -13,19 +13,19 @@ This section contains commands to access the :ref:`live state of Ray resources ( State CLI allows users to access the state of various resources (e.g., actor, task, object). -.. click:: ray.experimental.state.state_cli:task_summary +.. click:: ray.util.state.state_cli:task_summary :prog: ray summary tasks -.. click:: ray.experimental.state.state_cli:actor_summary +.. click:: ray.util.state.state_cli:actor_summary :prog: ray summary actors -.. click:: ray.experimental.state.state_cli:object_summary +.. click:: ray.util.state.state_cli:object_summary :prog: ray summary objects -.. click:: ray.experimental.state.state_cli:ray_list +.. click:: ray.util.state.state_cli:ray_list :prog: ray list -.. click:: ray.experimental.state.state_cli:ray_get +.. click:: ray.util.state.state_cli:ray_get :prog: ray get .. _ray-logs-api-cli-ref: @@ -41,5 +41,5 @@ This section contains commands to :ref:`access logs ` from Ra Log CLI allows users to access the log from the cluster. Note that only the logs from alive nodes are available through this API. -.. click:: ray.experimental.state.state_cli:logs_state_cli_group +.. 
click:: ray.util.state.state_cli:logs_state_cli_group :prog: ray logs \ No newline at end of file diff --git a/doc/source/ray-observability/images/coloring-actor-log-prefixes.png b/doc/source/ray-observability/images/coloring-actor-log-prefixes.png new file mode 100644 index 000000000000..1acc3ceca339 Binary files /dev/null and b/doc/source/ray-observability/images/coloring-actor-log-prefixes.png differ diff --git a/doc/source/ray-observability/monitoring-debugging/monitoring-debugging.rst b/doc/source/ray-observability/monitoring-debugging/monitoring-debugging.rst index 4dd9b646d0f7..6d882c34fe59 100644 --- a/doc/source/ray-observability/monitoring-debugging/monitoring-debugging.rst +++ b/doc/source/ray-observability/monitoring-debugging/monitoring-debugging.rst @@ -13,15 +13,8 @@ See :ref:`Getting Help ` if your problem is not s ../overview ../../ray-core/ray-dashboard ../state/state-api - ../ray-debugging ../ray-logging ../ray-metrics profiling - ../ray-tracing - troubleshoot-failures - troubleshoot-hangs - troubleshoot-performance gotchas getting-help - ../../ray-contribute/debugging.rst - ../../ray-contribute/profiling.rst diff --git a/doc/source/ray-observability/overview.rst b/doc/source/ray-observability/overview.rst index 3e073c51a43d..8919d3f29b5a 100644 --- a/doc/source/ray-observability/overview.rst +++ b/doc/source/ray-observability/overview.rst @@ -1,3 +1,5 @@ +.. 
_observability-overview: + Overview ======== diff --git a/doc/source/ray-observability/ray-logging.rst b/doc/source/ray-observability/ray-logging.rst index 69c40509813f..ac238cc4a715 100644 --- a/doc/source/ray-observability/ray-logging.rst +++ b/doc/source/ray-observability/ray-logging.rst @@ -109,6 +109,15 @@ This produces the following output: (MyActor(index=2) pid=482120) hello there (MyActor(index=1) pid=482119) hello there +Coloring Actor log prefixes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +By default, Ray prints Actor log prefixes in light blue. +Users may instead activate multi-color prefixes by setting the environment variable ``RAY_COLOR_PREFIX=1``. +This will index into an array of colors modulo the PID of each process. + +.. image:: ./images/coloring-actor-log-prefixes.png + :align: center + Distributed progress bars (tqdm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/ray-observability/ray-metrics.rst b/doc/source/ray-observability/ray-metrics.rst index 7b22fd9ff01a..bcedba3a3f77 100644 --- a/doc/source/ray-observability/ray-metrics.rst +++ b/doc/source/ray-observability/ray-metrics.rst @@ -23,7 +23,7 @@ Ray exposes its metrics in Prometheus format. This allows us to easily scrape th First, `download Prometheus `_. Make sure to download the correct binary for your operating system. (Ex: darwin for mac osx) -Then, unzip the the archive into a local directory using the following command. +Then, unzip the archive into a local directory using the following command. .. code-block:: bash @@ -278,9 +278,21 @@ to `RAY_GRAFANA_HOST=http://55.66.77.88:3000`. Troubleshooting --------------- +Getting Prometheus and Grafana to use the Ray configurations when installed via Homebrew on macOS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With Homebrew, Prometheus and Grafana are installed as services that are automatically launched for you.
+Therefore, to configure these services, you cannot simply pass in the config files as command line arguments. + +Instead, follow these instructions: +1. Change the --config.file line in `/usr/local/etc/prometheus.args` to read `--config.file /tmp/ray/session_latest/metrics/prometheus/prometheus.yml`. +2. Update the `/usr/local/etc/grafana/grafana.ini` file so that it matches the contents of `/tmp/ray/session_latest/metrics/grafana/grafana.ini`. + +You can then start or restart the services with `brew services start grafana` and `brew services start prometheus`. + .. _unverified-developer: -Mac does not trust the developer when installing prometheus or grafana +macOS does not trust the developer to install Prometheus or Grafana ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You may have received an error that looks like this: @@ -295,6 +307,10 @@ See `these instructions `_! -Ray state APIs allow users to conveniently access the current state (snapshot) of Ray through CLI or Python SDK. +Ray state APIs allow users to conveniently access the current state (snapshot) of Ray through CLI or Python SDK (developer APIs). .. note:: - APIs are :ref:`alpha `. This feature requires a full installation of Ray using ``pip install "ray[default]"``. This feature also requires the dashboard component to be available. The dashboard component needs to be included when starting the ray cluster, which is the default behavior for ``ray start`` and ``ray.init()``. For more in-depth debugging, you could check the dashboard log at ``/dashboard.log``, which is usually ``/tmp/ray/session_latest/logs/dashboard.log``. + This feature requires a full installation of Ray using ``pip install "ray[default]"``. This feature also requires the dashboard component to be available. The dashboard component needs to be included when starting the Ray cluster, which is the default behavior for ``ray start`` and ``ray.init()``.
For more in-depth debugging, check the dashboard log at ``/dashboard.log``, which is usually ``/tmp/ray/session_latest/logs/dashboard.log``. + +.. note:: + + State API CLI commands are :ref:`stable `, while python SDKs are :ref:`DeveloperAPI `. CLI usage is recommended over Python SDKs. Getting Started --------------- @@ -43,18 +47,20 @@ Run any workload. In this example, you will use the following script that runs 2 Now, let's see the summarized states of tasks. If it doesn't return the output immediately, retry the command. -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray summary tasks + .. code-block:: bash -.. tabbed:: Python SDK + ray summary tasks - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import summarize_tasks - print(summarize_tasks()) + .. code-block:: python + + from ray.util.state import summarize_tasks + print(summarize_tasks()) .. code-block:: text @@ -74,18 +80,20 @@ Now, let's see the summarized states of tasks. If it doesn't return the output i Let's list all actors. -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray list actors + ray list actors -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import list_actors - print(list_actors()) + from ray.util.state import list_actors + print(list_actors()) .. code-block:: text @@ -102,20 +110,22 @@ Let's list all actors. You can get the state of a single task using the get API. -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - # In this case, 31405554844820381c2f0f8501000000 - ray get actors + # In this case, 31405554844820381c2f0f8501000000 + ray get actors -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. 
code-block:: python + .. code-block:: python - from ray.experimental.state.api import get_actor - # In this case, 31405554844820381c2f0f8501000000 - print(get_actor(id=)) + from ray.util.state import get_actor + # In this case, 31405554844820381c2f0f8501000000 + print(get_actor(id=)) .. code-block:: text @@ -133,23 +143,25 @@ You can get the state of a single task using the get API. You can also access logs through ``ray logs`` API. -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray list actors - # In this case, ACTOR_ID is 31405554844820381c2f0f8501000000 - ray logs actor --id + .. code-block:: bash -.. tabbed:: Python SDK + ray list actors + # In this case, ACTOR_ID is 31405554844820381c2f0f8501000000 + ray logs actor --id - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import get_log + .. code-block:: python - # In this case, ACTOR_ID is 31405554844820381c2f0f8501000000 - for line in get_log(actor_id=): - print(line) + from ray.util.state import get_log + + # In this case, ACTOR_ID is 31405554844820381c2f0f8501000000 + for line in get_log(actor_id=): + print(line) .. code-block:: text @@ -180,34 +192,38 @@ you can use ``list`` or ``get`` APIs to get more details for an individual abnor E.g., Summarize all actors ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray summary actors + ray summary actors -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import summarize_actors - print(summarize_actors()) + from ray.util.state import summarize_actors + print(summarize_actors()) E.g., Summarize all tasks ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. 
code-block:: bash - ray summary tasks + ray summary tasks -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import summarize_tasks - print(summarize_tasks()) + from ray.util.state import summarize_tasks + print(summarize_tasks()) E.g., Summarize all objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -218,65 +234,70 @@ E.g., Summarize all objects To get callsite info, set env variable `RAY_record_ref_creation_sites=1` when starting the ray cluster RAY_record_ref_creation_sites=1 ray start --head +.. tabs:: -.. tabbed:: CLI + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray summary objects + ray summary objects -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import summarize_objects - print(summarize_objects()) + from ray.util.state import summarize_objects + print(summarize_objects()) List ---- Get a list of resources, possible resources include: -- :ref:`Actors `, e.g., actor id, state, pid, death_cause. (:class:`output schema `) -- :ref:`Tasks `, e.g., name, scheduling state, type, runtime env info (:class:`output schema `) -- :ref:`Objects `, e.g., object id, callsites, reference types. (:class:`output schema `) -- :ref:`Jobs `, e.g., start/end time, entrypoint, status. (:class:`output schema `) -- :ref:`Placement Groups `, e.g., name, bundles, stats. (:class:`output schema `) -- Nodes (Ray worker nodes), e.g., node id, node ip, node state. (:class:`output schema `) -- Workers (Ray worker processes), e.g., worker id, type, exit type and details. (:class:`output schema `) -- :ref:`Runtime environments `, e.g., runtime envs, creation time, nodes (:class:`output schema `) +- :ref:`Actors `, e.g., actor id, state, pid, death_cause. 
(:class:`output schema `) +- :ref:`Tasks `, e.g., name, scheduling state, type, runtime env info (:class:`output schema `) +- :ref:`Objects `, e.g., object id, callsites, reference types. (:class:`output schema `) +- :ref:`Jobs `, e.g., start/end time, entrypoint, status. (:class:`output schema `) +- :ref:`Placement Groups `, e.g., name, bundles, stats. (:class:`output schema `) +- Nodes (Ray worker nodes), e.g., node id, node ip, node state. (:class:`output schema `) +- Workers (Ray worker processes), e.g., worker id, type, exit type and details. (:class:`output schema `) +- :ref:`Runtime environments `, e.g., runtime envs, creation time, nodes (:class:`output schema `) E.g., List all nodes ~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray list nodes + .. code-block:: bash -.. tabbed:: Python SDK + ray list nodes - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import list_nodes() - list_nodes() + .. code-block:: python + + from ray.util.state import list_nodes() + list_nodes() E.g., List all placement groups ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray list placement-groups + ray list placement-groups -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import list_placement_groups - list_placement_groups() + from ray.util.state import list_placement_groups + list_placement_groups() E.g., List local referenced objects created by a process @@ -284,100 +305,112 @@ E.g., List local referenced objects created by a process .. tip:: You can list resources with one or multiple filters: using `--filter` or `-f` -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. 
code-block:: bash - ray list objects -f pid= -f reference_type=LOCAL_REFERENCE + ray list objects -f pid= -f reference_type=LOCAL_REFERENCE -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import list_objects - list_objects(filters=[("pid", "=", ), ("reference_type", "=", "LOCAL_REFERENCE")]) + from ray.util.state import list_objects + list_objects(filters=[("pid", "=", ), ("reference_type", "=", "LOCAL_REFERENCE")]) E.g., List alive actors ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray list actors -f state=ALIVE + .. code-block:: bash -.. tabbed:: Python SDK + ray list actors -f state=ALIVE - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import list_actors - list_actors(filters=[("state", "=", "ALIVE")]) + .. code-block:: python + + from ray.util.state import list_actors + list_actors(filters=[("state", "=", "ALIVE")]) E.g., List running tasks ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray list tasks -f state=RUNNING + ray list tasks -f state=RUNNING -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import list_tasks - list_tasks(filters=[("state", "=", "RUNNING")]) + from ray.util.state import list_tasks + list_tasks(filters=[("state", "=", "RUNNING")]) E.g., List non-running tasks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray list tasks -f state!=RUNNING + ray list tasks -f state!=RUNNING -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. 
code-block:: python - from ray.experimental.state.api import list_tasks - list_tasks(filters=[("state", "!=", "RUNNING")]) + from ray.util.state import list_tasks + list_tasks(filters=[("state", "!=", "RUNNING")]) E.g., List running tasks that have a name func ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray list tasks -f state=RUNNING -f name="task_running_300_seconds()" + .. code-block:: bash -.. tabbed:: Python SDK + ray list tasks -f state=RUNNING -f name="task_running_300_seconds()" - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import list_tasks - list_tasks(filters=[("state", "=", "RUNNING"), ("name", "=", "task_running_300_seconds()")]) + .. code-block:: python + + from ray.util.state import list_tasks + list_tasks(filters=[("state", "=", "RUNNING"), ("name", "=", "task_running_300_seconds()")]) E.g., List tasks with more details ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. tip:: When ``--detail`` is specified, the API can query more data sources to obtain state information in details. -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray list tasks --detail + ray list tasks --detail -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import list_tasks - list_tasks(detail=True) + from ray.util.state import list_tasks + list_tasks(detail=True) Get --- @@ -385,34 +418,38 @@ Get E.g., Get a task info ~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray get tasks + ray get tasks -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. 
code-block:: python - from ray.experimental.state.api import get_task - get_task(id=) + from ray.util.state import get_task + get_task(id=) E.g., Get a node info ~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - ray get nodes + .. code-block:: bash -.. tabbed:: Python SDK + ray get nodes - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import get_node - get_node(id=) + .. code-block:: python + + from ray.util.state import get_node + get_node(id=) Logs ---- @@ -425,110 +462,120 @@ By default, the API prints log from a head node. E.g., Get all retrievable log file names from a head node in a cluster ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray logs cluster + ray logs cluster -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - # You could get the node id / node ip from `ray list nodes` - from ray.experimental.state.api import list_logs - # `ray logs` by default print logs from a head node. - # So in order to list the same logs, you should provide the head node id. - # You could get the node id / node ip from `ray list nodes` - list_logs(node_id=) + # You could get the node id / node ip from `ray list nodes` + from ray.util.state import list_logs + # `ray logs` by default print logs from a head node. + # So in order to list the same logs, you should provide the head node id. + # You could get the node id / node ip from `ray list nodes` + list_logs(node_id=) E.g., Get a particular log file from a node ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. 
code-block:: bash - # You could get the node id / node ip from `ray list nodes` - ray logs cluster gcs_server.out --node-id - # `ray logs cluster` is alias to `ray logs` when querying with globs. - ray logs gcs_server.out --node-id + # You could get the node id / node ip from `ray list nodes` + ray logs cluster gcs_server.out --node-id + # `ray logs cluster` is alias to `ray logs` when querying with globs. + ray logs gcs_server.out --node-id -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import get_log + from ray.util.state import get_log - # Node IP could be retrieved from list_nodes() or ray.nodes() - for line in get_log(filename="gcs_server.out", node_id=): - print(line) + # Node IP could be retrieved from list_nodes() or ray.nodes() + for line in get_log(filename="gcs_server.out", node_id=): + print(line) E.g., Stream a log file from a node ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: - .. code-block:: bash + .. group-tab:: CLI (Recommended) - # You could get the node id / node ip from `ray list nodes` - ray logs raylet.out --node-ip --follow - # Or, - ray logs cluster raylet.out --node-ip --follow + .. code-block:: bash + # You could get the node id / node ip from `ray list nodes` + ray logs raylet.out --node-ip --follow + # Or, + ray logs cluster raylet.out --node-ip --follow -.. tabbed:: Python SDK - .. code-block:: python + .. group-tab:: Python SDK (Internal Developer API) - from ray.experimental.state.api import get_log + .. 
code-block:: python - # Node IP could be retrieved from list_nodes() or ray.nodes() - # The loop will block with `follow=True` - for line in get_log(filename="raylet.out", node_ip=, follow=True): - print(line) + from ray.util.state import get_log + + # Node IP could be retrieved from list_nodes() or ray.nodes() + # The loop will block with `follow=True` + for line in get_log(filename="raylet.out", node_ip=, follow=True): + print(line) E.g., Stream log from an actor with actor id ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray logs actor --id= --follow + ray logs actor --id= --follow -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. code-block:: python - from ray.experimental.state.api import get_log + from ray.util.state import get_log - # You could get the actor's ID from the output of `ray list actors`. - # The loop will block with `follow=True` - for line in get_log(actor_id=, follow=True): - print(line) + # You could get the actor's ID from the output of `ray list actors`. + # The loop will block with `follow=True` + for line in get_log(actor_id=, follow=True): + print(line) E.g., Stream log from a pid ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. tabbed:: CLI +.. tabs:: + + .. group-tab:: CLI (Recommended) - .. code-block:: bash + .. code-block:: bash - ray logs worker --pid= --follow + ray logs worker --pid= --follow -.. tabbed:: Python SDK + .. group-tab:: Python SDK (Internal Developer API) - .. code-block:: python + .. 
code-block:: python - from ray.experimental.state.api import get_log + from ray.util.state import get_log - # Node IP could be retrieved from list_nodes() or ray.nodes() - # You could get the pid of the worker running the actor easily when output - # of worker being directed to the driver (default) - # The loop will block with `follow=True` - for line in get_log(pid=, node_ip=, follow=True): - print(line) + # Node IP could be retrieved from list_nodes() or ray.nodes() + # You could get the pid of the worker running the actor easily when output + # of worker being directed to the driver (default) + # The loop will block with `follow=True` + for line in get_log(pid=, node_ip=, follow=True): + print(line) Failure Semantics ----------------- diff --git a/doc/source/ray-observability/user-guides/index.md b/doc/source/ray-observability/user-guides/index.md new file mode 100644 index 000000000000..a8772bfe58a1 --- /dev/null +++ b/doc/source/ray-observability/user-guides/index.md @@ -0,0 +1,9 @@ +(observability-user-guides)= + +# User Guides + +These guides help you monitor and debug your Ray applications and clusters. + +The guides include: +* {ref}`observability-troubleshoot-user-guides` +* {ref}`ray-tracing` \ No newline at end of file diff --git a/doc/source/ray-observability/ray-tracing.rst b/doc/source/ray-observability/user-guides/ray-tracing.rst similarity index 86% rename from doc/source/ray-observability/ray-tracing.rst rename to doc/source/ray-observability/user-guides/ray-tracing.rst index afc1c45ff8b0..7e0de9e7e490 100644 --- a/doc/source/ray-observability/ray-tracing.rst +++ b/doc/source/ray-observability/user-guides/ray-tracing.rst @@ -54,29 +54,31 @@ Below is an example tracing startup hook that sets up the default tracing provid For open-source users who want to experiment with tracing, Ray has a default tracing startup hook that exports spans to the folder ``/tmp/spans``. 
To run using this default hook, you can run the following code sample to set up tracing and trace a simple Ray task. -.. tabbed:: ray start +.. tab-set:: - .. code-block:: shell + .. tab-item:: ray start - $ ray start --head --tracing-startup-hook=ray.util.tracing.setup_local_tmp_tracing:setup_tracing - $ python - >>> ray.init() - >>> @ray.remote - def my_function(): - return 1 + .. code-block:: shell - obj_ref = my_function.remote() + $ ray start --head --tracing-startup-hook=ray.util.tracing.setup_local_tmp_tracing:setup_tracing + $ python + >>> ray.init() + >>> @ray.remote + def my_function(): + return 1 -.. tabbed:: ray.init() + obj_ref = my_function.remote() - .. code-block:: python + .. tab-item:: ray.init() - >>> ray.init(_tracing_startup_hook="ray.util.tracing.setup_local_tmp_tracing:setup_tracing") - >>> @ray.remote - def my_function(): - return 1 + .. code-block:: python - obj_ref = my_function.remote() + >>> ray.init(_tracing_startup_hook="ray.util.tracing.setup_local_tmp_tracing:setup_tracing") + >>> @ray.remote + def my_function(): + return 1 + + obj_ref = my_function.remote() If you want to provide your own custom tracing startup hook, provide your startup hook in the format of ``module:attribute`` where the attribute is the ``setup_tracing`` function to be run. 
diff --git a/doc/source/ray-observability/user-guides/troubleshoot-apps/index.md b/doc/source/ray-observability/user-guides/troubleshoot-apps/index.md new file mode 100644 index 000000000000..cd6562375a40 --- /dev/null +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/index.md @@ -0,0 +1,10 @@ +(observability-troubleshoot-user-guides)= + +# Troubleshooting Applications + +These guides help you perform common debugging or optimization tasks for your distributed application on Ray: +* {ref}`observability-troubleshoot-failures` +* {ref}`observability-troubleshoot-hangs` +* {ref}`observability-optimize-performance` +* {ref}`ray-debugger` +* {ref}`ray-core-profiling` \ No newline at end of file diff --git a/doc/source/ray-observability/monitoring-debugging/troubleshoot-performance.rst b/doc/source/ray-observability/user-guides/troubleshoot-apps/optimize-performance.rst similarity index 97% rename from doc/source/ray-observability/monitoring-debugging/troubleshoot-performance.rst rename to doc/source/ray-observability/user-guides/troubleshoot-apps/optimize-performance.rst index 1127ea5d1e71..465f7b6b5c52 100644 --- a/doc/source/ray-observability/monitoring-debugging/troubleshoot-performance.rst +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/optimize-performance.rst @@ -1,5 +1,7 @@ -Troubleshooting Performance -=========================== +.. 
_observability-optimize-performance: + +Optimizing Performance +====================== No Speedup ---------- diff --git a/doc/source/ray-observability/monitoring-debugging/profiling.rst b/doc/source/ray-observability/user-guides/troubleshoot-apps/profiling.rst similarity index 95% rename from doc/source/ray-observability/monitoring-debugging/profiling.rst rename to doc/source/ray-observability/user-guides/troubleshoot-apps/profiling.rst index c2ff41687fe5..756005eaec21 100644 --- a/doc/source/ray-observability/monitoring-debugging/profiling.rst +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/profiling.rst @@ -21,21 +21,23 @@ First, install ``memray``. But in this example, we will write them to `/tmp/ray/session_latest/logs` because Ray dashboard allows you to download files inside the log folder. This will allow you to download profiling files from other nodes. -.. tabbed:: Actors +.. tab-set:: - .. literalinclude:: ../doc_code/memray_profiling.py - :language: python - :start-after: __memray_profiling_start__ - :end-before: __memray_profiling_end__ + .. tab-item:: Actors -.. tabbed:: Tasks + .. literalinclude:: ../doc_code/memray_profiling.py + :language: python + :start-after: __memray_profiling_start__ + :end-before: __memray_profiling_end__ - Note that tasks have a shorter lifetime, so there could be lots of memory profiling files. + .. tab-item:: Tasks - .. literalinclude:: ../doc_code/memray_profiling.py - :language: python - :start-after: __memray_profiling_task_start__ - :end-before: __memray_profiling_task_end__ + Note that tasks have a shorter lifetime, so there could be lots of memory profiling files. + + .. literalinclude:: ../doc_code/memray_profiling.py + :language: python + :start-after: __memray_profiling_task_start__ + :end-before: __memray_profiling_task_end__ Once the task or actor runs, go to the :ref:`Logs View ` of the dashboard. Find and click the log file name. 
diff --git a/doc/source/ray-observability/ray-debugging.rst b/doc/source/ray-observability/user-guides/troubleshoot-apps/ray-debugging.rst similarity index 99% rename from doc/source/ray-observability/ray-debugging.rst rename to doc/source/ray-observability/user-guides/troubleshoot-apps/ray-debugging.rst index 99002237c6ff..afc98df070a6 100644 --- a/doc/source/ray-observability/ray-debugging.rst +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/ray-debugging.rst @@ -1,7 +1,7 @@ .. _ray-debugger: -Ray Debugger -============= +Using the Ray Debugger +====================== Ray has a built in debugger that allows you to debug your distributed applications. It allows to set breakpoints in your Ray tasks and actors and when hitting the breakpoint you can diff --git a/doc/source/ray-observability/monitoring-debugging/troubleshoot-failures.rst b/doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-failures.rst similarity index 99% rename from doc/source/ray-observability/monitoring-debugging/troubleshoot-failures.rst rename to doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-failures.rst index 0659c4f7e435..046c3e1bb2d2 100644 --- a/doc/source/ray-observability/monitoring-debugging/troubleshoot-failures.rst +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-failures.rst @@ -1,3 +1,5 @@ +.. 
_observability-troubleshoot-failures: + Troubleshooting Failures ======================== diff --git a/doc/source/ray-observability/monitoring-debugging/troubleshoot-hangs.rst b/doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-hangs.rst similarity index 93% rename from doc/source/ray-observability/monitoring-debugging/troubleshoot-hangs.rst rename to doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-hangs.rst index 3a4519eb7ec2..0725e8863bb1 100644 --- a/doc/source/ray-observability/monitoring-debugging/troubleshoot-hangs.rst +++ b/doc/source/ray-observability/user-guides/troubleshoot-apps/troubleshoot-hangs.rst @@ -1,3 +1,5 @@ +.. _observability-troubleshoot-hangs: + Troubleshooting Hangs ===================== diff --git a/doc/source/ray-overview/eco-gallery.yml b/doc/source/ray-overview/eco-gallery.yml index 0cc423dd913b..05e2006ea660 100644 --- a/doc/source/ray-overview/eco-gallery.yml +++ b/doc/source/ray-overview/eco-gallery.yml @@ -1,11 +1,10 @@ meta: - section-titles: false - container: container pb-4 - column: col-md-4 px-1 py-1 - img-top-cls: p-2 w-75 d-block mx-auto fixed-height-img + grid: 1 2 2 3 + gutter: 1 + class-container: container pb-3 -buttons: - classes: btn-outline-info btn-block +classes: + class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img projects: - name: Classy Vision Integration diff --git a/doc/source/ray-overview/getting-started.md b/doc/source/ray-overview/getting-started.md index 61223c8f2563..58e510702e0e 100644 --- a/doc/source/ray-overview/getting-started.md +++ b/doc/source/ray-overview/getting-started.md @@ -9,25 +9,27 @@ This guide gives a quick tour of Ray's features. ## Starting a local Ray cluster To get started, install, import, and initialize Ray. Most of the examples in this guide are based on Python, and some examples use Ray Core in Java. 
-````{panels} -:container: text-center -:column: col-lg-6 px-2 py-2 -:card: - -Python -^^^ -To use Ray in Python, install it with -``` -pip install ray -``` - ---- - -Java -^^^ +````{eval-rst} +.. grid:: 1 2 2 2 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + + Python + ^^^ + To use Ray in Python, install it with + ``` + pip install ray + ``` + + .. grid-item-card:: -To use Ray in Java, first add the [ray-api](https://mvnrepository.com/artifact/io.ray/ray-api) and -[ray-runtime](https://mvnrepository.com/artifact/io.ray/ray-runtime) dependencies in your project. + Java + ^^^ + + To use Ray in Java, first add the [ray-api](https://mvnrepository.com/artifact/io.ray/ray-api) and + [ray-runtime](https://mvnrepository.com/artifact/io.ray/ray-runtime) dependencies in your project. ```` @@ -125,25 +127,27 @@ Use the trained model for batch prediction with a ``BatchPredictor``. :start-after: __air_xgb_batchpred_start__ :end-before: __air_xgb_batchpred_end__ ``` -````` +```{button-ref} air +:color: primary +:outline: +:expand: -```{link-button} air -:type: ref -:text: Learn more about Ray AIR -:classes: btn-outline-primary btn-block +Learn more about Ray AIR ``` +````` + ## Ray Libraries Quick Start Ray has a rich ecosystem of libraries and frameworks built on top of it. Simply click on the dropdowns below to see examples of our most popular libraries. -`````{dropdown} ray Data: Creating and Transforming Datasets +`````{dropdown} ray Data: Scalable Datasets for ML :animate: fade-in-slide-down -Ray Datasets are the standard way to load and exchange data in Ray libraries and applications. -Datasets provide basic distributed data transformations such as `map`, `filter`, and `repartition`. +Ray Data is the standard way to load and exchange data in Ray libraries and applications. +Ray Data provides basic distributed data transformations such as `map`, `filter`, and `repartition`. 
They are compatible with a variety of file formats, datasources, and distributed frameworks. ````{note} @@ -154,8 +158,8 @@ pip install "ray[data]" dask ``` ```` -Get started by creating Datasets from synthetic data using ``ray.data.range()`` and ``ray.data.from_items()``. -Datasets can hold either plain Python objects (schema is a Python type), or Arrow records (schema is Arrow). +Get started by creating a Dataset from synthetic data using ``ray.data.range()`` and ``ray.data.from_items()``. +A Dataset can hold either plain Python objects (schema is a Python type), or Arrow records (schema is Arrow). ```{literalinclude} ../data/doc_code/quick_start.py :language: python @@ -182,20 +186,24 @@ Datasets also supports ``.filter()`` and ``.flat_map()``. :end-before: __data_transform_end__ ``` -```{link-button} ../data/dataset -:type: ref -:text: Learn more about Ray Data -:classes: btn-outline-primary btn-block +```{button-ref} ../data/data +:color: primary +:outline: +:expand: + +Learn more about Ray Data ``` ````` -`````{dropdown} ray Train: Distributed Model Training +``````{dropdown} ray Train: Distributed Model Training :animate: fade-in-slide-down Ray Train abstracts away the complexity of setting up a distributed training system. Let's take following simple examples: -````{tabbed} PyTorch +`````{tab-set} + +````{tab-item} PyTorch This example shows how you can use Ray Train with PyTorch. @@ -249,7 +257,7 @@ with 4 workers, and use it to run the new training function! ``` ```` -````{tabbed} TensorFlow +````{tab-item} TensorFlow This example shows how you can use Ray Train to set up `Multi-worker training with Keras `_. @@ -302,15 +310,21 @@ with 4 workers, and use it to run the new training function! 
:end-before: __tf_trainer_end__ :dedent: 0 ``` -```` -```{link-button} ../train/train -:type: ref -:text: Learn more about Ray Train -:classes: btn-outline-primary btn-block +```{button-ref} ../train/train +:color: primary +:outline: +:expand: + +Learn more about Ray Train ``` + +```` + ````` +`````` + `````{dropdown} ray Tune: Hyperparameter Tuning at Scale :animate: fade-in-slide-down @@ -340,10 +354,12 @@ If TensorBoard is installed, automatically visualize all trial results: tensorboard --logdir ~/ray_results ``` -```{link-button} ../tune/index -:type: ref -:text: Learn more about Ray Tune -:classes: btn-outline-primary btn-block +```{button-ref} ../tune/index +:color: primary +:outline: +:expand: + +Learn more about Ray Tune ``` ````` @@ -371,11 +387,14 @@ This example runs serves a scikit-learn gradient boosting classifier. As a result you will see `{"result": "versicolor"}`. -```{link-button} ../serve/index -:type: ref -:text: Learn more about Ray Serve -:classes: btn-outline-primary btn-block +```{button-ref} ../serve/index +:color: primary +:outline: +:expand: + +Learn more about Ray Serve ``` + ````` @@ -398,10 +417,12 @@ pip install "ray[rllib]" tensorflow # or torch :start-after: __quick_start_begin__ ``` -```{link-button} ../rllib/index -:type: ref -:text: Learn more about Ray RLlib -:classes: btn-outline-primary btn-block +```{button-ref} ../rllib/index +:color: primary +:outline: +:expand: + +Learn more about Ray RLlib ``` ````` @@ -412,10 +433,12 @@ Ray Core provides simple primitives for building and running distributed applica Below you find examples that show you how to turn your functions and classes easily into Ray tasks and actors, for both Python and Java. -`````{dropdown} ray Core: Parallelizing Functions with Ray Tasks +``````{dropdown} ray Core: Parallelizing Functions with Ray Tasks :animate: fade-in-slide-down -````{tabbed} Python +`````{tab-set} + +````{tab-item} Python First, you import Ray and and initialize it with `ray.init()`. 
Then you decorate your function with ``@ray.remote`` to declare that you want to run this function remotely. @@ -433,11 +456,11 @@ def f(x): futures = [f.remote(i) for i in range(4)] print(ray.get(futures)) # [0, 1, 4, 9] - ``` + ```` -````{tabbed} Java +````{tab-item} Java First, use `Ray.init` to initialize Ray runtime. Then you can use `Ray.task(...).remote()` to convert any Java static method into a Ray task. @@ -470,20 +493,26 @@ public class RayDemo { System.out.println(Ray.get(objectRefList)); // [0, 1, 4, 9] } } -```` +``` In the above code block we defined some Ray Tasks. While these are great for stateless operations, sometimes you must maintain the state of your application. You can do that with Ray Actors. -```{link-button} ../ray-core/walkthrough -:type: ref -:text: Learn more about Ray Core -:classes: btn-outline-primary btn-block +```{button-ref} ../ray-core/walkthrough +:color: primary +:outline: +:expand: + +Learn more about Ray Core ``` +```` + ````` -`````{dropdown} ray Core: Parallelizing Classes with Ray Actors +`````` + +``````{dropdown} ray Core: Parallelizing Classes with Ray Actors :animate: fade-in-slide-down Ray provides actors to allow you to parallelize an instance of a class in Python or Java. @@ -491,7 +520,9 @@ When you instantiate a class that is a Ray actor, Ray will start a remote instan of that class in the cluster. This actor can then execute remote method calls and maintain its own internal state. 
-````{tabbed} Python +`````{tab-set} + +````{tab-item} Python ```{code-block} python @@ -516,7 +547,7 @@ print(ray.get(futures)) # [1, 1, 1, 1] ``` ```` -````{tabbed} Java +````{tab-item} Java ```{code-block} java import io.ray.api.ActorHandle; @@ -563,16 +594,22 @@ public class RayDemo { System.out.println(Ray.get(objectRefList)); // [1, 1, 1, 1] } } +``` -```` -```{link-button} ../ray-core/walkthrough -:type: ref -:text: Learn more about Ray Core -:classes: btn-outline-primary btn-block +```{button-ref} ../ray-core/walkthrough +:color: primary +:outline: +:expand: + +Learn more about Ray Core ``` +```` + ````` +`````` + ## Ray Cluster Quick Start You can deploy your applications on Ray clusters, often with minimal code changes to your existing code. @@ -611,10 +648,12 @@ Assuming you have stored this configuration in a file called `cluster.yaml`, you ray submit cluster.yaml example.py --start ``` -```{link-button} cluster-index -:type: ref -:text: Learn more about launching Ray Clusters -:classes: btn-outline-primary btn-block +```{button-ref} cluster-index +:color: primary +:outline: +:expand: + +Learn more about launching Ray Clusters ``` ````` @@ -640,10 +679,12 @@ pip install "ray[default]" ``` ```` -```{link-button} ../ray-core/ray-dashboard -:type: ref -:text: Learn more about Ray Dashboard. -:classes: btn-outline-primary btn-block +```{button-ref} ../ray-core/ray-dashboard +:color: primary +:outline: +:expand: + +Learn more about Ray Dashboard ``` ````` @@ -716,10 +757,12 @@ See the summarized statistics of Ray tasks using ``ray summary tasks``. 
``` -```{link-button} ../ray-observability/state/state-api -:type: ref -:text: Learn more about Ray State APIs -:classes: btn-outline-primary btn-block +```{button-ref} ../ray-observability/state/state-api +:color: primary +:outline: +:expand: + +Learn more about Ray State APIs ``` ````` diff --git a/doc/source/ray-overview/index.md b/doc/source/ray-overview/index.md index 2839ad014e28..865a2d63b540 100644 --- a/doc/source/ray-overview/index.md +++ b/doc/source/ray-overview/index.md @@ -38,66 +38,74 @@ These are some common ML workloads that individuals, organizations, and companie |:--:| |Stack of Ray libraries - unified toolkit for ML workloads.| -Ray's unified compute framework comprises of three layers: +Ray's unified compute framework consists of three layers: 1. **Ray AI Runtime**--An open-source, Python, domain-specific set of libraries that equip ML engineers, data scientists, and researchers with a scalable and unified toolkit for ML applications. -1. **Ray Core**--An open-source, Python, general purpose, distributed computing library that enables ML engineers and Python developers to scale Python applications and accelerate machine learning workloads. -1. **Ray cluster**--A set of worker nodes connected to a common Ray head node. Ray clusters can be fixed-size, or they can autoscale up and down according to the resources requested by applications running on the cluster. - -````{panels} -:container: text-left -:column: col-lg-4 px-2 py-2 -:card: - -**Scale machine learning workloads** -^^^ -Build ML applications with a toolkit of libraries for distributed -[data processing](../data/dataset.rst), -[model training](../train/train.rst), -[tuning](tune/../index.rst), -[reinforcement learning](../rllib/index.rst), -[model serving](../serve/index.rst), -and [more](../ray-more-libs/index.rst). -+++ -```{link-button} ../ray-air/getting-started -:type: ref -:text: Ray AIR -:classes: btn-outline-info btn-block +2. 
**Ray Core**--An open-source, Python, general purpose, distributed computing library that enables ML engineers and Python developers to scale Python applications and accelerate machine learning workloads. +3. **Ray cluster**--A set of worker nodes connected to a common Ray head node. Ray clusters can be fixed-size, or they can autoscale up and down according to the resources requested by applications running on the cluster. + +```{eval-rst} +.. grid:: 1 2 3 3 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + + **Scale machine learning workloads** + ^^^ + Build ML applications with a toolkit of libraries for distributed + :doc:`data processing <../data/data>`, + :doc:`model training <../train/train>`, + :doc:`tuning <../tune/index>`, + :doc:`reinforcement learning <../rllib/index>`, + :doc:`model serving <../serve/index>`, + and :doc:`more <../ray-more-libs/index>`. + +++ + .. button-ref:: ../ray-air/getting-started + :color: primary + :outline: + :expand: + + Ray AIR + + .. grid-item-card:: + + **Build distributed applications** + ^^^ + Build and run distributed applications with a + :doc:`simple and flexible API <../ray-core/walkthrough>`. + :doc:`Parallelize <../ray-core/walkthrough>` single machine code with + little to zero code changes. + + +++ + .. button-ref:: ../ray-core/walkthrough + :color: primary + :outline: + :expand: + + Ray Core + + .. grid-item-card:: + + **Deploy large-scale workloads** + ^^^ + Deploy workloads on :doc:`AWS, GCP, Azure <../cluster/getting-started>` or + :doc:`on premise <../cluster/vms/user-guides/launching-clusters/on-premises>`. + Use Ray cluster managers to run Ray on existing + :doc:`Kubernetes <../cluster/kubernetes/index>`, + :doc:`YARN <../cluster/vms/user-guides/community/yarn>`, + or :doc:`Slurm <../cluster/vms/user-guides/community/slurm>` clusters. + +++ + .. 
button-ref:: ../cluster/getting-started + :color: primary + :outline: + :expand: + + Ray Clusters ``` ---- - -**Build distributed applications** -^^^ -Build and run distributed applications with a [simple and flexible API](../ray-core/walkthrough.rst). -[Parallelize](../ray-core/walkthrough.rst) single machine code with little to zero code changes. - -+++ -```{link-button} ../ray-core/walkthrough -:type: ref -:text: Ray Core -:classes: btn-outline-info btn-block -``` ---- - -**Deploy large-scale workloads** -^^^ -Deploy workloads on [AWS, GCP, Azure](../cluster/getting-started) or -[on premise](../cluster/vms/user-guides/launching-clusters/on-premises). -Use Ray cluster managers to run Ray on existing -[Kubernetes](../cluster/kubernetes/index), -[YARN](../cluster/vms/user-guides/community/yarn), -or [Slurm](../cluster/vms/user-guides/community/slurm) clusters. -+++ - -```{link-button} ../cluster/getting-started -:type: ref -:text: Ray Clusters -:classes: btn-outline-info btn-block -``` -```` Each of [Ray AIR's](../ray-air/getting-started) five native libraries distributes a specific ML task: -- [Data](../data/dataset): Scalable, framework-agnostic data loading and transformation across training, tuning, and prediction. +- [Data](../data/data): Scalable, framework-agnostic data loading and transformation across training, tuning, and prediction. - [Train](../train/train): Distributed multi-node and multi-core model training with fault tolerance that integrates with popular training libraries. - [Tune](../tune/index): Scalable hyperparameter tuning to optimize model performance. - [Serve](../serve/index): Scalable and programmable serving to deploy models for online inference, with optional microbatching to improve performance. 
diff --git a/doc/source/ray-overview/installation.rst b/doc/source/ray-overview/installation.rst index 1d7132a2eeff..78f7881d6750 100644 --- a/doc/source/ray-overview/installation.rst +++ b/doc/source/ray-overview/installation.rst @@ -11,25 +11,66 @@ Official Releases From Wheels ~~~~~~~~~~~ -You can install the latest official version of Ray from PyPI on Linux, Windows -and macOS as follows: +You can install the latest official version of Ray from PyPI on Linux, Windows, +and macOS by choosing the option that best matches your use case. -.. code-block:: bash +.. tab-set:: - # Install Ray with support for the dashboard + cluster launcher - pip install -U "ray[default]" + .. tab-item:: Recommended - # Install Ray with minimal dependencies - # pip install -U ray + **For machine learning applications** -To install Ray libraries: + .. code-block:: shell -.. code-block:: bash + pip install -U "ray[air]" + + # For reinforcement learning support, install RLlib instead. + # pip install -U "ray[rllib]" + + **For general Python applications** + + .. code-block:: shell + + pip install -U "ray[default]" + + # If you don't want Ray Dashboard or Cluster Launcher, install Ray with minimal dependencies instead. + # pip install -U "ray" + + .. tab-item:: Advanced + + .. 
list-table:: + :widths: 2 3 + :header-rows: 1 + + * - Command + - Installed components + * - `pip install -U "ray"` + - Core + * - `pip install -U "ray[default]"` + - Core, Dashboard, Cluster Launcher + * - `pip install -U "ray[data]"` + - Core, Data + * - `pip install -U "ray[train]"` + - Core, Train + * - `pip install -U "ray[tune]"` + - Core, Tune + * - `pip install -U "ray[serve]"` + - Core, Dashboard, Cluster Launcher, Serve + * - `pip install -U "ray[rllib]"` + - Core, Tune, RLlib + * - `pip install -U "ray[air]"` + - Core, Dashboard, Cluster Launcher, Data, Train, Tune, Serve + * - `pip install -U "ray[all]"` + - Core, Dashboard, Cluster Launcher, Data, Train, Tune, Serve, RLlib - pip install -U "ray[air]" # installs Ray + dependencies for Ray AI Runtime - pip install -U "ray[tune]" # installs Ray + dependencies for Ray Tune - pip install -U "ray[rllib]" # installs Ray + dependencies for Ray RLlib - pip install -U "ray[serve]" # installs Ray + dependencies for Ray Serve + .. tip:: + + You can combine installation extras. + For example, to install Ray with Dashboard, Cluster Launcher, and Train support, you can run: + + .. code-block:: shell + + pip install -U "ray[default,train]" .. _install-nightlies: @@ -48,42 +89,41 @@ You can install the nightly Ray wheels via the following links. These daily rele # Install Ray with minimal dependencies # pip install -U LINK_TO_WHEEL.whl - - -.. 
tabbed:: Linux - - =============================================== ================================================ - Linux (x86_64) Linux (arm64/aarch64) - =============================================== ================================================ - `Linux Python 3.10 (x86_64)`_ `Linux Python 3.10 (aarch64)`_ - `Linux Python 3.9 (x86_64)`_ `Linux Python 3.9 (aarch64)`_ - `Linux Python 3.8 (x86_64)`_ `Linux Python 3.8 (aarch64)`_ - `Linux Python 3.7 (x86_64)`_ `Linux Python 3.7 (aarch64)`_ - `Linux Python 3.11 (x86_64) (EXPERIMENTAL)`_ `Linux Python 3.11 (aarch64) (EXPERIMENTAL)`_ - =============================================== ================================================ - -.. tabbed:: MacOS - - ================================ ================================ - MacOS (x86_64) MacOS (arm64) - ================================ ================================ - `MacOS Python 3.10 (x86_64)`_ `MacOS Python 3.10 (arm64)`_ - `MacOS Python 3.9 (x86_64)`_ `MacOS Python 3.9 (arm64)`_ - `MacOS Python 3.8 (x86_64)`_ `MacOS Python 3.8 (arm64)`_ - `MacOS Python 3.7 (x86_64)`_ - ================================ ================================ - -.. tabbed:: Windows (beta) - - .. list-table:: - :header-rows: 1 - - * - Windows (beta) - * - `Windows Python 3.10`_ - * - `Windows Python 3.9`_ - * - `Windows Python 3.8`_ - * - `Windows Python 3.7`_ - +.. tab-set:: + + .. 
tab-item:: Linux + + =============================================== ================================================ + Linux (x86_64) Linux (arm64/aarch64) + =============================================== ================================================ + `Linux Python 3.10 (x86_64)`_ `Linux Python 3.10 (aarch64)`_ + `Linux Python 3.9 (x86_64)`_ `Linux Python 3.9 (aarch64)`_ + `Linux Python 3.8 (x86_64)`_ `Linux Python 3.8 (aarch64)`_ + `Linux Python 3.7 (x86_64)`_ `Linux Python 3.7 (aarch64)`_ + `Linux Python 3.11 (x86_64) (EXPERIMENTAL)`_ `Linux Python 3.11 (aarch64) (EXPERIMENTAL)`_ + =============================================== ================================================ + + .. tab-item:: MacOS + + ================================ ================================ + MacOS (x86_64) MacOS (arm64) + ================================ ================================ + `MacOS Python 3.10 (x86_64)`_ `MacOS Python 3.10 (arm64)`_ + `MacOS Python 3.9 (x86_64)`_ `MacOS Python 3.9 (arm64)`_ + `MacOS Python 3.8 (x86_64)`_ `MacOS Python 3.8 (arm64)`_ + `MacOS Python 3.7 (x86_64)`_ + ================================ ================================ + + .. tab-item:: Windows (beta) + + .. list-table:: + :header-rows: 1 + + * - Windows (beta) + * - `Windows Python 3.10`_ + * - `Windows Python 3.9`_ + * - `Windows Python 3.8`_ + * - `Windows Python 3.7`_ .. 
note:: diff --git a/doc/source/ray-overview/learn-more.md b/doc/source/ray-overview/learn-more.md index 7861cb987814..6173a0eacbc7 100644 --- a/doc/source/ray-overview/learn-more.md +++ b/doc/source/ray-overview/learn-more.md @@ -25,7 +25,7 @@ Please raise an issue if any of the below links are broken, or if you'd like to ## Talks (Videos) -- [Unifying Large Scale Data Preprocessing and Machine Learning Pipelines with Ray Datasets \| PyData 2021](https://zoom.us/rec/share/0cjbk_YdCTbiTm7gNhzSeNxxTCCEy1pCDUkkjfBjtvOsKGA8XmDOx82jflHdQCUP.fsjQkj5PWSYplOTz?startTime=1635456658000) [(slides)](https://docs.google.com/presentation/d/19F_wxkpo1JAROPxULmJHYZd3sKryapkbMd0ib3ndMiU/edit?usp=sharing) +- [Unifying Large Scale Data Preprocessing and Machine Learning Pipelines with Ray Data \| PyData 2021](https://zoom.us/rec/share/0cjbk_YdCTbiTm7gNhzSeNxxTCCEy1pCDUkkjfBjtvOsKGA8XmDOx82jflHdQCUP.fsjQkj5PWSYplOTz?startTime=1635456658000) [(slides)](https://docs.google.com/presentation/d/19F_wxkpo1JAROPxULmJHYZd3sKryapkbMd0ib3ndMiU/edit?usp=sharing) - [Programming at any Scale with Ray \| SF Python Meetup Sept 2019](https://www.youtube.com/watch?v=LfpHyIXBhlE) - [Ray for Reinforcement Learning \| Data Council 2019](https://www.youtube.com/watch?v=Ayc0ca150HI) - [Scaling Interactive Pandas Workflows with Modin](https://www.youtube.com/watch?v=-HjLd_3ahCw) diff --git a/doc/source/ray-overview/use-cases.rst b/doc/source/ray-overview/use-cases.rst index 3d9d25e52085..fb581fb65ef8 100644 --- a/doc/source/ray-overview/use-cases.rst +++ b/doc/source/ray-overview/use-cases.rst @@ -3,64 +3,166 @@ Ray Use Cases ============= -This page indexes common Ray use cases for scaling ML. It contains highlighted references to blogs, examples, and tutorials also located elsewhere in the Ray documentation. +This page indexes common Ray use cases for scaling ML. +It contains highlighted references to blogs, examples, and tutorials also located +elsewhere in the Ray documentation. + +.. 
_ref-use-cases-llm: + +LLMs and Gen AI +--------------- + +Large language models (LLMs) and generative AI are rapidly changing industries, and demand compute at an astonishing pace. Ray provides a distributed compute framework for scaling these models, allowing developers to train and deploy models faster and more efficiently. With specialized libraries for data streaming, training, fine-tuning, hyperparameter tuning, and serving, Ray simplifies the process of developing and deploying large-scale AI models. + +.. figure:: /images/llm-stack.png + +Learn more about how Ray scales LLMs and generative AI with the following resources. + +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.anyscale.com/blog/ray-common-production-challenges-for-generative-ai-infrastructure + + [Blog] How Ray solves common production challenges for generative AI infrastructure + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.anyscale.com/blog/training-175b-parameter-language-models-at-1000-gpu-scale-with-alpa-and-ray + + [Blog] Training 175B Parameter Language Models at 1000 GPU scale with Alpa and Ray + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.anyscale.com/blog/faster-stable-diffusion-fine-tuning-with-ray-air + + [Blog] Faster stable diffusion fine-tuning with Ray AIR + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-link:: https://www.anyscale.com/blog/how-to-fine-tune-and-serve-llms-simply-quickly-and-cost-effectively-using + + [Blog] How to fine tune and serve LLMs simply, quickly and cost effectively using Ray + DeepSpeed + HuggingFace + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.businessinsider.com/openai-chatgpt-trained-on-anyscale-ray-generative-lifelike-ai-models-2022-12 + + [Blog] How OpenAI Uses Ray to Train Tools like ChatGPT + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/gptj_deepspeed_fine_tuning + + [Example] GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/dreambooth_finetuning + + [Example] Fine-tuning DreamBooth with Ray AIR + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/stablediffusion_batch_prediction + + [Example] Stable Diffusion Batch Prediction with Ray AIR + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/gptj_serving + + [Example] GPT-J-6B Serving with Ray AIR + .. _ref-use-cases-batch-infer: Batch Inference --------------- -Batch inference refers to generating model predictions over a set of input observations. The model could be a regression model, neural network, or simply a Python function. Ray can scale batch inference from single GPU machines to large clusters. +Batch inference is the process of generating model predictions on a large "batch" of input data. 
+Ray for batch inference works with any cloud provider and ML framework, +and is fast and cheap for modern deep learning applications. +It scales from single machines to large clusters with minimal code changes. +As a Python-first framework, you can easily express and interactively develop your inference workloads in Ray. +To learn more about running batch inference with Ray, see the :ref:`batch inference guide`. + +.. figure:: ../data/images/batch_inference.png + + +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 -Performing inference on incoming batches of data can be parallelized by exporting the architecture and weights of a trained model to the shared object store. Using these model replicas, Ray AIR's :ref:`Batch Predictor ` scales predictions on batches across workers. + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img -.. figure:: /images/batch_inference.png - - Using Ray AIR's ``BatchPredictor`` for batch inference. + .. button-link:: https://www.anyscale.com/blog/offline-batch-inference-comparing-ray-apache-spark-and-sagemaker -Learn more about batch inference with the following resources. + [Blog] Offline Batch Inference: Comparing Ray, Apache Spark, and SageMaker -.. panels:: - :container: container pb-3 - :column: col-md-3 px-1 py-1 - :img-top-cls: p-2 w-75 d-block mx-auto fixed-height-img + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/ray_logo.png + .. button-link:: https://www.anyscale.com/blog/streaming-distributed-execution-across-cpus-and-gpus - .. 
link-button:: https://github.com/ray-project/ray-educational-materials/blob/main/Computer_vision_workloads/Semantic_segmentation/Scaling_batch_inference.ipynb - :type: url - :text: [Tutorial] Architectures for Scalable Batch Inference with Ray - :classes: btn-link btn-block stretched-link scalableBatchInference - --- - :img-top: /images/ray_logo.png + [Blog] Streaming distributed execution across CPUs and GPUs - .. link-button:: https://www.anyscale.com/blog/model-batch-inference-in-ray-actors-actorpool-and-datasets - :type: url - :text: [Blog] Batch Inference in Ray: Actors, ActorPool, and Datasets - :classes: btn-link btn-block stretched-link batchActorPool - --- - :img-top: /images/ray_logo.png + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - .. link-button:: /ray-core/examples/batch_prediction - :type: ref - :text: [Example] Batch Prediction using Ray Core - :classes: btn-link btn-block stretched-link batchCore - --- - :img-top: /images/ray_logo.png + .. button-link:: https://www.anyscale.com/blog/turbocharge-langchain-now-guide-to-20x-faster-embedding - .. link-button:: /data/examples/nyc_taxi_basic_processing - :type: ref - :text: [Example] Batch Inference on NYC taxi data using Ray Data - :classes: btn-link btn-block stretched-link nycTaxiData + [Blog] Using Ray Data to parallelize LangChain inference + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/batch_inference + + [Guide] Batch Prediction using Ray Data + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/examples/nyc_taxi_basic_processing + + [Example] Batch Inference on NYC taxi data using Ray Data + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: /data/examples/ocr_example + + [Example] Batch OCR processing using Ray Data - --- - :img-top: /images/ray_logo.png - .. link-button:: /data/examples/ocr_example - :type: ref - :text: [Example] Batch OCR processing using Ray Data - :classes: btn-link btn-block stretched-link batchOcr .. _ref-use-cases-mmt: @@ -73,7 +175,7 @@ The focus is on training many models on subsets of a dataset. This is in contras When any given model you want to train can fit on a single GPU, Ray can assign each training run to a separate Ray Task. In this way, all available workers are utilized to run independent remote training rather than one worker running jobs sequentially. .. figure:: /images/training_small_models.png - + Data parallelism pattern for distributed training on large datasets. How do I do many model training on Ray? @@ -81,71 +183,77 @@ How do I do many model training on Ray? To train multiple independent models, use the Ray Tune (:ref:`Tutorial `) library. This is the recommended library for most cases. -You can use Tune with your current data preprocessing pipeline if your data source fits into the memory of a single machine (node). -If you need to scale your data, or you want to plan for future scaling, use the :ref:`Ray Data ` library. -Your data must be a :ref:`supported format `, to use Ray Data. +You can use Tune with your current data preprocessing pipeline if your data source fits into the memory of a single machine (node). +If you need to scale your data, or you want to plan for future scaling, use the :ref:`Ray Data ` library. +Your data must be a :ref:`supported format `, to use Ray Data. -Alternative solutions exist for less common cases: +Alternative solutions exist for less common cases: #. If your data is not in a supported format, use Ray Core (:ref:`Tutorial `) for custom applications. This is an advanced option and requires and understanding of :ref:`design patterns and anti-patterns `. -#. 
If you have a large preprocessing pipeline, you can use the Ray Data library to train multiple models (:ref:`Tutorial `). +#. If you have a large preprocessing pipeline, you can use the Ray Data library to train multiple models (:ref:`Tutorial `). Learn more about many model training with the following resources. -.. panels:: - :container: container pb-3 - :column: col-md-3 px-1 py-1 - :img-top-cls: p-2 w-75 d-block mx-auto fixed-height-img - - --- - :img-top: /images/ray_logo.png - - .. link-button:: https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray - :type: url - :text: [Blog] Training One Million ML Models in Record Time with Ray - :classes: btn-link btn-block stretched-link millionModels - --- - :img-top: /images/ray_logo.png - - .. link-button:: https://www.anyscale.com/blog/many-models-batch-training-at-scale-with-ray-core - :type: url - :text: [Blog] Many Models Batch Training at Scale with Ray Core - :classes: btn-link btn-block stretched-link manyModels - --- - :img-top: /images/ray_logo.png - - .. link-button:: /ray-core/examples/batch_training - :type: ref - :text: [Example] Batch Training with Ray Core - :classes: btn-link btn-block stretched-link batchTrainingCore - --- - :img-top: /images/ray_logo.png - - .. link-button:: /data/examples/batch_training - :type: ref - :text: [Example] Batch Training with Ray Datasets - :classes: btn-link btn-block stretched-link batchTrainingDatasets - --- - :img-top: /images/tune.png - - .. link-button:: /tune/tutorials/tune-run - :type: ref - :text: [Guide] Tune Basic Parallel Experiments - :classes: btn-link btn-block stretched-link tuneBasicParallel - --- - :img-top: /images/tune.png - - .. link-button:: /ray-air/examples/batch_tuning - :type: ref - :text: [Example] Batch Training and Tuning using Ray Tune - :classes: btn-link btn-block stretched-link tuneBatch - --- - :img-top: /images/carrot.png - - .. 
link-button:: https://www.youtube.com/watch?v=3t26ucTy0Rs - :type: url - :text: [Talk] Scaling Instacart fulfillment ML on Ray - :classes: btn-link btn-block stretched-link instacartFulfillment +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray + + [Blog] Training One Million ML Models in Record Time with Ray + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-link:: https://www.anyscale.com/blog/many-models-batch-training-at-scale-with-ray-core + + [Blog] Many Models Batch Training at Scale with Ray Core + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-core/examples/batch_training + + [Example] Batch Training with Ray Core + + .. grid-item-card:: + :img-top: /images/ray_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /data/examples/batch_training + + [Example] Batch Training with Ray Data + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /tune/tutorials/tune-run + + [Guide] Tune Basic Parallel Experiments + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: /ray-air/examples/batch_tuning + + [Example] Batch Training and Tuning using Ray Tune + + .. grid-item-card:: + :img-top: /images/carrot.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-link:: https://www.youtube.com/watch?v=3t26ucTy0Rs + + [Talk] Scaling Instacart fulfillment ML on Ray + Model Serving ------------- @@ -160,53 +268,58 @@ It supports complex `model deployment patterns `. +Ray Data specifically, please see the :ref:`Ray Data Glossary`. .. glossary:: @@ -379,7 +379,7 @@ Ray Data specifically, please see the :ref:`Ray Datasets Glossary` for + :ref:`An interface used to preprocess a Dataset` for training and inference (prediction) with other AIR components. Preprocessors can be stateful, as they can be fitted on the training dataset before being used to transform the training and evaluation datasets. diff --git a/doc/source/rllib/doc_code/catalog_guide.py b/doc/source/rllib/doc_code/catalog_guide.py index ca4714163231..e72c5317e43a 100644 --- a/doc/source/rllib/doc_code/catalog_guide.py +++ b/doc/source/rllib/doc_code/catalog_guide.py @@ -125,6 +125,7 @@ def __init__(self, *args, **kwargs): .environment("CartPole-v1") .framework("torch") .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) # Specify the catalog to use for the PPORLModule. diff --git a/doc/source/rllib/doc_code/rlmodule_guide.py b/doc/source/rllib/doc_code/rlmodule_guide.py index 98e153ca6d35..2cad78805811 100644 --- a/doc/source/rllib/doc_code/rlmodule_guide.py +++ b/doc/source/rllib/doc_code/rlmodule_guide.py @@ -1,13 +1,7 @@ # flake8: noqa from ray.rllib.utils.annotations import override - -# TODO (Kourosh): Remove this when the location of the import is fixed. 
-try: - from ray.rllib.models.specs.typing import SpecType - from ray.rllib.models.specs.specs_torch import TorchTensorSpec -except ImportError: - from ray.rllib.core.models.specs.typing import SpecType - from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec +from ray.rllib.core.models.specs.typing import SpecType +from ray.rllib.core.models.specs.specs_base import TensorSpec # __enabling-rlmodules-in-configs-begin__ @@ -21,6 +15,7 @@ .framework("torch") .environment("CartPole-v1") .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) algorithm = config.build() @@ -91,7 +86,10 @@ _enable_rl_module_api=True, rl_module_spec=SingleAgentRLModuleSpec(module_class=DiscreteBCTorchModule), ) - .training(model={"fcnet_hiddens": [32, 32]}) + .training( + model={"fcnet_hiddens": [32, 32]}, + _enable_learner_api=True, + ) ) algo = config.build() @@ -116,7 +114,10 @@ module_specs=SingleAgentRLModuleSpec(module_class=DiscreteBCTorchModule) ), ) - .training(model={"fcnet_hiddens": [32, 32]}) + .training( + model={"fcnet_hiddens": [32, 32]}, + _enable_learner_api=True, + ) ) # __pass-specs-to-configs-ma-end__ @@ -267,7 +268,7 @@ def input_specs_exploration(self) -> SpecType: # and its value is a torch.Tensor with shape (b, h) where b is the # batch size (determined at run-time) and h is the hidden size # (fixed at 10). - return {"obs": TorchTensorSpec("b, h", h=10)} + return {"obs": TensorSpec("b, h", h=10, framework="torch")} # __extend-spec-checking-torch-specs-end__ diff --git a/doc/source/rllib/index.rst b/doc/source/rllib/index.rst index 5fcbd85444bd..2b8c4cc80fae 100644 --- a/doc/source/rllib/index.rst +++ b/doc/source/rllib/index.rst @@ -171,42 +171,51 @@ click on the dropdowns below: Feature Overview ---------------- -.. panels:: - :container: text-center - :column: col-lg-4 px-2 py-2 - :card: - - **RLlib Key Concepts** - ^^^ - Learn more about the core concepts of RLlib, such as environments, algorithms and - policies. - +++ - .. 
link-button:: rllib-core-concepts - :type: ref - :text: Key Concepts - :classes: btn-outline-info btn-block - --- - - **RLlib Algorithms** - ^^^ - Check out the many available RL algorithms of RLlib for model-free and model-based - RL, on-policy and off-policy training, multi-agent RL, and more. - +++ - .. link-button:: rllib-algorithms-doc - :type: ref - :text: Algorithms - :classes: btn-outline-info btn-block - --- - - **RLlib Environments** - ^^^ - Get started with environments supported by RLlib, such as Farama foundation's Gymnasium, Petting Zoo, - and many custom formats for vectorized and multi-agent environments. - +++ - .. link-button:: rllib-environments-doc - :type: ref - :text: Environments - :classes: btn-outline-info btn-block +.. grid:: 1 2 3 3 + :gutter: 1 + :class-container: container pb-4 + + .. grid-item-card:: + + **RLlib Key Concepts** + ^^^ + Learn more about the core concepts of RLlib, such as environments, algorithms and + policies. + +++ + .. button-ref:: rllib-core-concepts + :color: primary + :outline: + :expand: + + Key Concepts + + .. grid-item-card:: + + **RLlib Algorithms** + ^^^ + Check out the many available RL algorithms of RLlib for model-free and model-based + RL, on-policy and off-policy training, multi-agent RL, and more. + +++ + .. button-ref:: rllib-algorithms-doc + :color: primary + :outline: + :expand: + + Algorithms + + .. grid-item-card:: + + **RLlib Environments** + ^^^ + Get started with environments supported by RLlib, such as Farama foundation's Gymnasium, Petting Zoo, + and many custom formats for vectorized and multi-agent environments. + +++ + .. button-ref:: rllib-environments-doc + :color: primary + :outline: + :expand: + + Environments The following is a summary of RLlib's most striking features. 
diff --git a/doc/source/rllib/key-concepts.rst b/doc/source/rllib/key-concepts.rst index a5743ab9fc30..a43ccbd1435b 100644 --- a/doc/source/rllib/key-concepts.rst +++ b/doc/source/rllib/key-concepts.rst @@ -63,41 +63,43 @@ can use Ray Tune to tune hyperparameters of your reinforcement learning algorith The following example shows three equivalent ways of interacting with ``PPO``, which implements the proximal policy optimization algorithm in RLlib. -.. tabbed:: Basic RLlib Algorithm +.. tab-set:: - .. code-block:: python + .. tab-item:: Basic RLlib Algorithm - # Configure. - from ray.rllib.algorithms.ppo import PPOConfig - config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000) + .. code-block:: python - # Build. - algo = config.build() + # Configure. + from ray.rllib.algorithms.ppo import PPOConfig + config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000) - # Train. - while True: - print(algo.train()) + # Build. + algo = config.build() + # Train. + while True: + print(algo.train()) -.. tabbed:: RLlib Algorithms and Tune - .. code-block:: python + .. tab-item:: RLlib Algorithms and Tune - from ray import tune + .. code-block:: python - # Configure. - from ray.rllib.algorithms.ppo import PPOConfig - config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000) + from ray import tune - # Train via Ray Tune. - tune.run("PPO", config=config) + # Configure. + from ray.rllib.algorithms.ppo import PPOConfig + config = PPOConfig().environment(env="CartPole-v1").training(train_batch_size=4000) + # Train via Ray Tune. + tune.run("PPO", config=config) -.. tabbed:: RLlib Command Line - .. code-block:: bash + .. tab-item:: RLlib Command Line - rllib train --run=PPO --env=CartPole-v1 --config='{"train_batch_size": 4000}' + .. 
code-block:: bash + + rllib train --run=PPO --env=CartPole-v1 --config='{"train_batch_size": 4000}' RLlib `Algorithm classes `__ coordinate the distributed workflow of running rollouts and optimizing policies. diff --git a/doc/source/rllib/rllib-connector.rst b/doc/source/rllib/rllib-connector.rst index f0a78e931689..24fe81eb89f9 100644 --- a/doc/source/rllib/rllib-connector.rst +++ b/doc/source/rllib/rllib-connector.rst @@ -152,62 +152,66 @@ Lambda Connector helps turn simple transformation functions into agent or action connectors without having users worry about the high-level list or non-list APIs. Lambda Connector has separate agent and action versions, for example: -.. tabbed:: Lambda Agent Connector +.. tab-set:: - .. code-block:: python + .. tab-item:: Lambda Agent Connector - # An example agent connector that filters INFOS column out of - # observation data. - def filter(d: ActionConnectorDataType): - del d.data[SampleBatch.INFOS] - return d + .. code-block:: python - FilterInfosColumnAgentConnector = register_lambda_agent_connector( - "FilterInfosColumnAgentConnector", filter - ) + # An example agent connector that filters INFOS column out of + # observation data. + def filter(d: ActionConnectorDataType): + del d.data[SampleBatch.INFOS] + return d -.. tabbed:: Lambda Action Connector + FilterInfosColumnAgentConnector = register_lambda_agent_connector( + "FilterInfosColumnAgentConnector", filter + ) - .. code-block:: python + .. tab-item:: Lambda Action Connector - # An example action connector that scales actions output by the - # policy by a factor of 2. - ScaleActionConnector = register_lambda_action_connector( - "ScaleActionConnector", - lambda actions, states, fetches: 2 * actions, states, fetches - ) + .. code-block:: python + + # An example action connector that scales actions output by the + # policy by a factor of 2. 
+ ScaleActionConnector = register_lambda_action_connector( + "ScaleActionConnector", + lambda actions, states, fetches: 2 * actions, states, fetches + ) Multiple connectors can be composed into a ``ConnectorPipeline``, which handles proper running of all children connectors in sequence and provides basic operations to modify and update the composition of connectors. ``ConnectorPipeline`` also has agent and action versions: -.. tabbed:: AgentConnectorPipeline +.. tab-set:: + + .. tab-item:: AgentConnectorPipeline - .. code-block:: python + .. code-block:: python - # Example construction of an AgentConnectorPipeline. - pipeline = ActionConnectorPipeline( - ctx, - [ClipRewardAgentConnector(), ViewRequirementAgentConnector()] - ) + # Example construction of an AgentConnectorPipeline. + pipeline = ActionConnectorPipeline( + ctx, + [ClipRewardAgentConnector(), ViewRequirementAgentConnector()] + ) - # For demonstration purpose, we will add an ObsPreprocessorConnector - # in front of the ViewRequirementAgentConnector. - pipeline.insert_before("ViewRequirementAgentConnector", ObsPreprocessorConnector()) + # For demonstration purpose, we will add an ObsPreprocessorConnector + # in front of the ViewRequirementAgentConnector. + pipeline.insert_before("ViewRequirementAgentConnector", ObsPreprocessorConnector()) -.. tabbed:: Action Lambda Connector + .. tab-item:: Action Lambda Connector - .. code-block:: python + .. code-block:: python - # Example construction of an ActionConnectorPipeline. - pipeline = ActionConnectorPipeline( - ctx, - [ConvertToNumpyConnector(), ClipActionsConnector(), ImmutableActionsConnector()] - ) + # Example construction of an ActionConnectorPipeline. + pipeline = ActionConnectorPipeline( + ctx, + [ConvertToNumpyConnector(), ClipActionsConnector(), ImmutableActionsConnector()] + ) - # For demonstration purpose, we will drop the last ImmutableActionsConnector here. 
- pipeline.remove("ImmutableActionsConnector") + # For demonstration purpose, we will drop the last ImmutableActionsConnector here. + pipeline.remove("ImmutableActionsConnector") diff --git a/doc/source/rllib/rllib-offline.rst b/doc/source/rllib/rllib-offline.rst index 3917e321bac2..5c06c543984e 100644 --- a/doc/source/rllib/rllib-offline.rst +++ b/doc/source/rllib/rllib-offline.rst @@ -220,12 +220,12 @@ Scaling I/O throughput Similar to scaling online training, you can scale offline I/O throughput by increasing the number of RLlib workers via the ``num_workers`` config. Each worker accesses offline storage independently in parallel, for linear scaling of I/O throughput. Within each read worker, files are chosen in random order for reads, but file contents are read sequentially. -Ray Dataset Integration --------------------------- +Ray Data Integration +-------------------- RLlib has experimental support for reading/writing training samples from/to large offline datasets using -`Ray Dataset `__. -We support JSON and Parquet files today. Other file formats supported by Dataset can also be easily added. +:ref:`Ray Data `. +We support JSON and Parquet files today. Other file formats supported by Ray Data can also be easily added. Unlike JSON input, a single dataset can be automatically sharded and replayed by multiple rollout workers by simply specifying the desired num_workers config. diff --git a/doc/source/rllib/rllib-replay-buffers.rst b/doc/source/rllib/rllib-replay-buffers.rst index 5ce3621d27a6..b3d709ab8204 100644 --- a/doc/source/rllib/rllib-replay-buffers.rst +++ b/doc/source/rllib/rllib-replay-buffers.rst @@ -36,7 +36,7 @@ Replay Buffers in RLlib RLlib comes with a set of extendable replay buffers built in. All the of them support the two basic methods ``add()`` and ``sample()``. We provide a base :py:class:`~ray.rllib.utils.replay_buffers.replay_buffer.ReplayBuffer` class from which you can build your own buffer. 
In most algorithms, we require :py:class:`~ray.rllib.utils.replay_buffers.multi_agent_replay_buffer.MultiAgentReplayBuffer`\s. -This is because we want them to generalize to the the multi-agent case. Therefore, these buffer's ``add()`` and ``sample()`` methods require a ``policy_id`` to handle experiences per policy. +This is because we want them to generalize to the multi-agent case. Therefore, these buffer's ``add()`` and ``sample()`` methods require a ``policy_id`` to handle experiences per policy. Have a look at the :py:class:`~ray.rllib.utils.replay_buffers.multi_agent_replay_buffer.MultiAgentReplayBuffer` to get a sense of how it extends our base class. You can find buffer types and arguments to modify their behaviour as part of RLlib's default parameters. They are part of the ``replay_buffer_config``. diff --git a/doc/source/rllib/rllib-rlmodule.rst b/doc/source/rllib/rllib-rlmodule.rst index 62bddda34053..cf7be524ea4a 100644 --- a/doc/source/rllib/rllib-rlmodule.rst +++ b/doc/source/rllib/rllib-rlmodule.rst @@ -73,43 +73,46 @@ The RLModule API provides a unified way to define custom reinforcement learning To maintain consistency and usability, RLlib offers a standardized approach for defining module objects for both single-agent and multi-agent reinforcement learning environments. This is achieved through the :py:class:`~ray.rllib.core.rl_module.rl_module.SingleAgentRLModuleSpec` and :py:class:`~ray.rllib.core.rl_module.marl_module.MultiAgentRLModuleSpec` classes. The built-in RLModules in RLlib follow this consistent design pattern, making it easier for you to understand and utilize these modules. -.. tabbed:: Single Agent +.. tab-set:: - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __constructing-rlmodules-sa-begin__ - :end-before: __constructing-rlmodules-sa-end__ + .. tab-item:: Single Agent + .. 
literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __constructing-rlmodules-sa-begin__ + :end-before: __constructing-rlmodules-sa-end__ -.. tabbed:: Multi Agent - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __constructing-rlmodules-ma-begin__ - :end-before: __constructing-rlmodules-ma-end__ + .. tab-item:: Multi Agent + + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __constructing-rlmodules-ma-begin__ + :end-before: __constructing-rlmodules-ma-end__ You can pass RL Module specs to the algorithm configuration to be used by the algorithm. -.. tabbed:: Single Agent +.. tab-set:: - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __pass-specs-to-configs-sa-begin__ - :end-before: __pass-specs-to-configs-sa-end__ + .. tab-item:: Single Agent + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __pass-specs-to-configs-sa-begin__ + :end-before: __pass-specs-to-configs-sa-end__ - .. note:: - For passing RL Module specs, all fields do not have to be filled as they are filled based on the described environment or other algorithm configuration parameters (i.e. ,``observation_space``, ``action_space``, ``model_config_dict`` are not required fields when passing a custom RL Module spec to the algorithm config.) + .. note:: + For passing RL Module specs, all fields do not have to be filled as they are filled based on the described environment or other algorithm configuration parameters (i.e. ,``observation_space``, ``action_space``, ``model_config_dict`` are not required fields when passing a custom RL Module spec to the algorithm config.) -.. tabbed:: Multi Agent - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __pass-specs-to-configs-ma-begin__ - :end-before: __pass-specs-to-configs-ma-end__ + .. tab-item:: Multi Agent + .. 
literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __pass-specs-to-configs-ma-begin__ + :end-before: __pass-specs-to-configs-ma-end__ Writing Custom Single Agent RL Modules @@ -146,20 +149,22 @@ Also the class's constrcutor requires a dataclass config object called `~ray.rll When writing RL Modules, you need to use these fields to construct your model. -.. tabbed:: Single Agent (torch) +.. tab-set:: + + .. tab-item:: Single Agent (torch) + + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __write-custom-sa-rlmodule-torch-begin__ + :end-before: __write-custom-sa-rlmodule-torch-end__ - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __write-custom-sa-rlmodule-torch-begin__ - :end-before: __write-custom-sa-rlmodule-torch-end__ + .. tab-item:: Single Agent (tensorflow) -.. tabbed:: Single Agent (tensorflow) - - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __write-custom-sa-rlmodule-tf-begin__ - :end-before: __write-custom-sa-rlmodule-tf-end__ + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __write-custom-sa-rlmodule-tf-begin__ + :end-before: __write-custom-sa-rlmodule-tf-end__ In :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` you can enforce the checking for the existence of certain input or output keys in the data that is communicated into and out of RL Modules. This serves multiple purposes: @@ -167,35 +172,37 @@ In :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` you can enforce the - For the I/O requirement of each method to be self-documenting. - For failures to happen quickly. If users extend the modules and implement something that does not match the assumptions of the I/O specs, the check reports missing keys and their expected format. For example, RLModule should always have an ``obs`` key in the input batch and an ``action_dist`` key in the output. -.. 
tabbed:: Single Level Keys - - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __extend-spec-checking-single-level-begin__ - :end-before: __extend-spec-checking-single-level-end__ +.. tab-set:: -.. tabbed:: Nested Keys + .. tab-item:: Single Level Keys - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __extend-spec-checking-nested-begin__ - :end-before: __extend-spec-checking-nested-end__ + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __extend-spec-checking-single-level-begin__ + :end-before: __extend-spec-checking-single-level-end__ + .. tab-item:: Nested Keys -.. tabbed:: TensorShape Spec + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __extend-spec-checking-nested-begin__ + :end-before: __extend-spec-checking-nested-end__ - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __extend-spec-checking-torch-specs-begin__ - :end-before: __extend-spec-checking-torch-specs-end__ + .. tab-item:: TensorShape Spec -.. tabbed:: Type Spec + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __extend-spec-checking-torch-specs-begin__ + :end-before: __extend-spec-checking-torch-specs-end__ - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __extend-spec-checking-type-specs-begin__ - :end-before: __extend-spec-checking-type-specs-end__ + + .. tab-item:: Type Spec + + .. 
literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __extend-spec-checking-type-specs-begin__ + :end-before: __extend-spec-checking-type-specs-end__ :py:class:`~ray.rllib.core.rl_module.rl_module.RLModule` has two methods for each forward method, totaling 6 methods that can be override to describe the specs of the input and output of each method: @@ -220,12 +227,14 @@ The :py:class:`~ray.rllib.core.rl_module.marl_module.MultiAgentRLModule` offers The following example creates a custom multi-agent RL module with underlying modules. The modules share an encoder, which gets applied to the global part of the observations space. The local part passes through a separate encoder, specific to each policy. -.. tabbed:: Multi agent with shared encoder (Torch) +.. tab-set:: + + .. tab-item:: Multi agent with shared encoder (Torch) - .. literalinclude:: doc_code/rlmodule_guide.py - :language: python - :start-after: __write-custom-marlmodule-shared-enc-begin__ - :end-before: __write-custom-marlmodule-shared-enc-end__ + .. literalinclude:: doc_code/rlmodule_guide.py + :language: python + :start-after: __write-custom-marlmodule-shared-enc-begin__ + :end-before: __write-custom-marlmodule-shared-enc-end__ To construct this custom multi-agent RL module, pass the class to the :py:class:`~ray.rllib.core.rl_module.marl_module.MultiAgentRLModuleSpec` constructor. Also, pass the :py:class:`~ray.rllib.core.rl_module.rl_module.SingleAgentRLModuleSpec` for each agent because RLlib requires the observation, action spaces, and model hyper-parameters for each agent. @@ -243,44 +252,46 @@ RLlib provides a number of RL Modules for different frameworks (e.g., PyTorch, T There are two possible ways to extend existing RL Modules: -.. tabbed:: Inheriting existing RL Modules +.. tab-set:: - One way to extend existing RL Modules is to inherit from them and override the methods you need to customize. 
For example, extend :py:class:`~ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module.PPOTorchRLModule` and augment it with your own customization. Then pass the new customized class into the algorithm configuration to use the PPO algorithm to optimize your custom RL Module. + .. tab-item:: Inheriting existing RL Modules - .. code-block:: python + One way to extend existing RL Modules is to inherit from them and override the methods you need to customize. For example, extend :py:class:`~ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module.PPOTorchRLModule` and augment it with your own customization. Then pass the new customized class into the algorithm configuration to use the PPO algorithm to optimize your custom RL Module. + + .. code-block:: python + + class MyPPORLModule(PPORLModule): + + def __init__(self, config: RLModuleConfig): + super().__init__(config) + ... + + # Pass in the custom RL Module class to the spec + algo_config = algo_config.rl_module( + rl_module_spec=SingleAgentRLModuleSpec(module_class=MyPPORLModule) + ) - class MyPPORLModule(PPORLModule): - - def __init__(self, config: RLModuleConfig): - super().__init__(config) - ... - # Pass in the custom RL Module class to the spec - algo_config = algo_config.rl_module( - rl_module_spec=SingleAgentRLModuleSpec(module_class=MyPPORLModule) - ) + .. tab-item:: Extending RL Module Catalog - -.. tabbed:: Extending RL Module Catalog + Another way to customize your module is by extending its :py:class:`~ray.rllib.core.models.catalog.Catalog`. The :py:class:`~ray.rllib.core.models.catalog.Catalog` is a component that defines the default architecture and behavior of a model based on factors such as ``observation_space``, ``action_space``, etc. To modify sub-components of an existing RL Module, extend the corresponding Catalog class. - Another way to customize your module is by extending its :py:class:`~ray.rllib.core.models.catalog.Catalog`. 
The :py:class:`~ray.rllib.core.models.catalog.Catalog` is a component that defines the default architecture and behavior of a model based on factors such as ``observation_space``, ``action_space``, etc. To modify sub-components of an existing RL Module, extend the corresponding Catalog class. + For instance, to adapt the existing ``PPORLModule`` for a custom graph observation space not supported by RLlib out-of-the-box, extend the :py:class:`~ray.rllib.core.models.catalog.Catalog` class used to create the ``PPORLModule`` and override the method responsible for returning the encoder component to ensure that your custom encoder replaces the default one initially provided by RLlib. For more information on the :py:class:`~ray.rllib.core.models.catalog.Catalog` class, refer to the `Catalog user guide `__. - For instance, to adapt the existing ``PPORLModule`` for a custom graph observation space not supported by RLlib out-of-the-box, extend the :py:class:`~ray.rllib.core.models.catalog.Catalog` class used to create the ``PPORLModule`` and override the method responsible for returning the encoder component to ensure that your custom encoder replaces the default one initially provided by RLlib. For more information on the :py:class:`~ray.rllib.core.models.catalog.Catalog` class, refer to the `Catalog user guide `__. + .. code-block:: python - .. 
code-block:: python + class MyAwesomeCatalog(PPOCatalog): - class MyAwesomeCatalog(PPOCatalog): + def get_actor_critic_encoder_config(): + # create your awesome graph encoder here and return it + pass - def get_actor_critic_encoder_config(): - # create your awesome graph encoder here and return it - pass - - # Pass in the custom catalog class to the spec - algo_config = algo_config.rl_module( - rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyAwesomeCatalog) - ) + # Pass in the custom catalog class to the spec + algo_config = algo_config.rl_module( + rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyAwesomeCatalog) + ) Migrating from Custom Policies and Models to RL Modules @@ -293,150 +304,154 @@ In the new `~ray.rllib.core.rl_module.rl_module.RLModule` API the construction o What your customization could have looked like before: -.. tabbed:: ModelV2 +.. tab-set:: - .. code-block:: python - - from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 - from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 + .. tab-item:: ModelV2 + .. code-block:: python - class MyCustomModel(TorchModelV2): - """Code for your previous custom model""" - ... + from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 + from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 - class CustomPolicy(TorchPolicyV2): + class MyCustomModel(TorchModelV2): + """Code for your previous custom model""" + ... - @DeveloperAPI - @OverrideToImplementCustomLogic - def make_model(self) -> ModelV2: - """Create model. - Note: only one of make_model or make_model_and_action_dist - can be overridden. + class CustomPolicy(TorchPolicyV2): - Returns: - ModelV2 model. - """ - return MyCustomModel(...) + @DeveloperAPI + @OverrideToImplementCustomLogic + def make_model(self) -> ModelV2: + """Create model. + Note: only one of make_model or make_model_and_action_dist + can be overridden. -.. tabbed:: ModelV2 + Distribution + Returns: + ModelV2 model. + """ + return MyCustomModel(...) - .. 
code-block:: python - - from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 - from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 + .. tab-item:: ModelV2 + Distribution - class MyCustomModel(TorchModelV2): - """Code for your previous custom model""" - ... + .. code-block:: python + from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 + from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 - class CustomPolicy(TorchPolicyV2): - @DeveloperAPI - @OverrideToImplementCustomLogic - def make_model_and_action_dist(self): - """Create model and action distribution function. + class MyCustomModel(TorchModelV2): + """Code for your previous custom model""" + ... - Returns: - ModelV2 model. - ActionDistribution class. - """ - my_model = MyCustomModel(...) # construct some ModelV2 instance here - dist_class = ... # Action distribution cls - return my_model, dist_class + class CustomPolicy(TorchPolicyV2): + @DeveloperAPI + @OverrideToImplementCustomLogic + def make_model_and_action_dist(self): + """Create model and action distribution function. -.. tabbed:: Sampler functions + Returns: + ModelV2 model. + ActionDistribution class. + """ + my_model = MyCustomModel(...) # construct some ModelV2 instance here + dist_class = ... # Action distribution cls - .. code-block:: python + return my_model, dist_class - from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 - from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 - class CustomPolicy(TorchPolicyV2): + .. tab-item:: Sampler functions - @DeveloperAPI - @OverrideToImplementCustomLogic - def action_sampler_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - **kwargs, - ) -> Tuple[TensorType, TensorType, TensorType, List[TensorType]]: - """Custom function for sampling new actions given policy. + .. code-block:: python - Args: - model: Underlying model. - obs_batch: Observation tensor batch. - state_batches: Action sampling state batch. 
+ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 + from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 - Returns: - Sampled action - Log-likelihood - Action distribution inputs - Updated state - """ - return None, None, None, None + class CustomPolicy(TorchPolicyV2): + @DeveloperAPI + @OverrideToImplementCustomLogic + def action_sampler_fn( + self, + model: ModelV2, + *, + obs_batch: TensorType, + state_batches: TensorType, + **kwargs, + ) -> Tuple[TensorType, TensorType, TensorType, List[TensorType]]: + """Custom function for sampling new actions given policy. - @DeveloperAPI - @OverrideToImplementCustomLogic - def action_distribution_fn( - self, - model: ModelV2, - *, - obs_batch: TensorType, - state_batches: TensorType, - **kwargs, - ) -> Tuple[TensorType, type, List[TensorType]]: - """Action distribution function for this Policy. + Args: + model: Underlying model. + obs_batch: Observation tensor batch. + state_batches: Action sampling state batch. - Args: - model: Underlying model. - obs_batch: Observation tensor batch. - state_batches: Action sampling state batch. + Returns: + Sampled action + Log-likelihood + Action distribution inputs + Updated state + """ + return None, None, None, None - Returns: - Distribution input. - ActionDistribution class. - State outs. - """ - return None, None, None + @DeveloperAPI + @OverrideToImplementCustomLogic + def action_distribution_fn( + self, + model: ModelV2, + *, + obs_batch: TensorType, + state_batches: TensorType, + **kwargs, + ) -> Tuple[TensorType, type, List[TensorType]]: + """Action distribution function for this Policy. -All of the ``Policy.compute_***`` functions expect that `~ray.rllib.core.rl_module.rl_module.RLModule.forward_exploration` and `~ray.rllib.core.rl_module.rl_module.RLModule.forward_inference` return a dictionary that contains the key "action_dist" mapping to a ``ray.rllib.models.distributions.Distribution`` instance. 
whose value contains the parameters (inputs) of a ``ray.rllib.models.distributions.Distribution`` class. Commonly used distribution implementations can be found under ``ray.rllib.models.tf.tf_distributions`` for tensorflow and ``ray.rllib.models.torch.torch_distributions`` for torch. You can choose to return deterministic actions, by creating a deterministic distribution instance.
Simply instead implement any custom logic in your custom RL Module + """ + from ray.rllib.models.torch.torch_distributions import YOUR_DIST_CLASS - def _forward_exploration(self, batch): - ... + + class MyRLModule(TorchRLModule): + + def __init__(self, config: RLConfig): + # construct any custom networks here using config + # specify an action distribution class here + ... + + def _forward_inference(self, batch): + ... + + def _forward_exploration(self, batch): + ... Notable TODOs diff --git a/doc/source/rllib/user-guides.rst b/doc/source/rllib/user-guides.rst index fed9cc8352f8..4b325deb2f27 100644 --- a/doc/source/rllib/user-guides.rst +++ b/doc/source/rllib/user-guides.rst @@ -13,97 +13,86 @@ User Guides RLlib Feature Guides -------------------- -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-advanced-api-doc - :type: ref - :text: Advanced Feautures of the RLlib Python API - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-models - :type: ref - :text: Working With Models, Preprocessors and Action Distributions - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-saving-and-loading-algos-and-policies - :type: ref - :text: Checkpointing your Algorithms and Policies, and Exporting your NN Models - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-concepts - :type: ref - :text: How To Customize Your Policies? - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-sample-collection - :type: ref - :text: How To Use Sample Collections and Trajectory Views? 
- :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-offline - :type: ref - :text: Working With Offline Data - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-replay-buffers - :type: ref - :text: Working with ReplayBuffers - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-dev - :type: ref - :text: How To Contribute To RLlib? - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-cli - :type: ref - :text: How To Work With the RLlib CLI? - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.svg - - +++ - .. link-button:: rllib-catalogs - :type: ref - :text: How To Use the RLlib Catalogs - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-advanced-api-doc + + Advanced Features of the RLlib Python API + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-models + + Working With Models, Preprocessors and Action Distributions + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-saving-and-loading-algos-and-policies + + Checkpointing your Algorithms and Policies, and Exporting your Models + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-concepts + + How To Customize Your Policies? + + .. 
grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-sample-collection + + How To Use Sample Collections and Trajectory Views? + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-offline + + Working With Offline Data + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-replay-buffers + + Working with ReplayBuffers + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-dev + + How To Contribute To RLlib? + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-cli + + How To Work With the RLlib CLI? + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: rllib-catalogs + + How To Use the RLlib Catalogs diff --git a/doc/source/serve/api/index.md b/doc/source/serve/api/index.md index c124fa6b2bf5..9b6a9dbc024c 100644 --- a/doc/source/serve/api/index.md +++ b/doc/source/serve/api/index.md @@ -1,12 +1,395 @@ (serve-api)= # Ray Serve API -```{toctree} -:maxdepth: '-1' +## Python API -python_api -rest_api -serve_cli +(core-apis)= + +```{eval-rst} +.. module:: ray +``` + +### Writing Applications + + + +#### Deployment Decorators + +```{eval-rst} +.. autosummary:: + :nosignatures: + :toctree: doc/ + + serve.deployment + :noindex: + serve.ingress + serve.batch +``` + +#### Object Types + +```{eval-rst} +.. 
autosummary:: + :nosignatures: + :toctree: doc/ + :template: autosummary/class_without_init_args.rst + + serve.Deployment + serve.Application + serve.handle.RayServeHandle + serve.handle.RayServeSyncHandle +``` + +#### Advanced APIs + +```{eval-rst} +.. autosummary:: + :nosignatures: + :toctree: doc/ + + serve.get_replica_context +``` + +### Running Applications + +```{eval-rst} +.. autosummary:: + :nosignatures: + :toctree: doc/ + + serve.run + serve.delete + serve.start + serve.shutdown +``` + +(serve-cli)= + +## Command Line Interface (CLI) + +```{eval-rst} +.. click:: ray.serve.scripts:cli + :prog: serve + :nested: full +``` + +(serve-rest-api)= + +## Serve REST API + +### V1 REST API (Single-application) + +#### `PUT "/api/serve/deployments/"` + +Declaratively deploys the Serve application. Starts Serve on the Ray cluster if it's not already running. See [single-app config schema](serve-rest-api-config-schema) for the request's JSON schema. + +**Example Request**: + +```http +PUT /api/serve/deployments/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +Content-Type: application/json + +{ + "import_path": "fruit.deployment_graph", + "runtime_env": { + "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" + }, + "deployments": [ + {"name": "MangoStand", "user_config": {"price": 1}}, + {"name": "OrangeStand", "user_config": {"price": 2}}, + {"name": "PearStand", "user_config": {"price": 3}} + ] +} +``` + +**Example Response** + + +```http +HTTP/1.1 200 OK +Content-Type: application/json +``` + +#### `GET "/api/serve/deployments/"` + +Gets the config for the application currently deployed on the Ray cluster. This config represents the current goal state for the Serve application. See [single-app config schema](serve-rest-api-config-schema) for the response's JSON schema. 
+ +**Example Request**: +```http +GET /api/serve/deployments/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +``` + +**Example Response**: + +```http +HTTP/1.1 200 OK +Content-Type: application/json + +{ + "import_path": "fruit.deployment_graph", + "runtime_env": { + "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" + }, + "deployments": [ + {"name": "MangoStand", "user_config": {"price": 1}}, + {"name": "OrangeStand", "user_config": {"price": 2}}, + {"name": "PearStand", "user_config": {"price": 3}} + ] +} +``` + + +#### `GET "/api/serve/deployments/status"` + +Gets the Serve application's current status, including all the deployment statuses. See [status schema](serve-rest-api-response-schema) for the response's JSON schema. + +**Example Request**: + +```http +GET /api/serve/deployments/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +``` + +**Example Response** + +```http +HTTP/1.1 200 OK +Content-Type: application/json + +{ + "app_status": { + "status": "RUNNING", + "message": "", + "deployment_timestamp": 1855994527.146304 + }, + "deployment_statuses": [ + { + "name": "MangoStand", + "status": "HEALTHY", + "message": "" + }, + { + "name": "OrangeStand", + "status": "HEALTHY", + "message": "" + }, + { + "name": "PearStand", + "status": "HEALTHY", + "message": "" + }, + { + "name": "FruitMarket", + "status": "HEALTHY", + "message": "" + }, + { + "name": "DAGDriver", + "status": "HEALTHY", + "message": "" + } + ] +} +``` + +#### `DELETE "/api/serve/deployments/"` + +Shuts down Serve and the Serve application running on the Ray cluster. Has no effect if Serve is not running on the Ray cluster. 
+ +**Example Request**: + +```http +DELETE /api/serve/deployments/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +``` + +**Example Response** + +```http +HTTP/1.1 200 OK +Content-Type: application/json +``` + +### V2 REST API (Multi-application) + +#### `PUT "/api/serve/applications/"` + +Declaratively deploys a list of Serve applications. If Serve is already running on the Ray cluster, removes all applications not listed in the new config. If Serve is not running on the Ray cluster, starts Serve. See [multi-app config schema](serve-rest-api-config-schema) for the request's JSON schema. + +**Example Request**: + +```http +PUT /api/serve/applications/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +Content-Type: application/json + +{ + "applications": [ + { + "name": "fruit_stand", + "route_prefix": "/fruit", + "import_path": "fruit.deployment_graph", + "runtime_env": { + "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" + }, + "deployments": [ + {"name": "MangoStand", "user_config": {"price": 1}}, + {"name": "OrangeStand", "user_config": {"price": 2}}, + {"name": "PearStand", "user_config": {"price": 3}} + ] + }, + { + "name": "calculator", + "route_prefix": "/math", + "import_path": "conditional_dag.serve_dag", + "runtime_env": { + "working_dir": "https://github.com/ray-project/test_dag/archive/HEAD.zip" + }, + "deployments": [ + {"name": "Multiplier", "ray_actor_options": {"num_cpus": 0.5}}, + { + "name": "Adder", + "ray_actor_options": {"env_vars": {"override_increment": "5"}} + }, + ] + } + ] +} ``` + +**Example Response** + + +```http +HTTP/1.1 200 OK +Content-Type: application/json +``` + +#### `GET "/api/serve/applications/"` + +Gets cluster-level info and comprehensive details on all Serve applications deployed on the Ray cluster. See [metadata schema](serve-rest-api-response-schema) for the response's JSON schema. 
+ +```http +GET /api/serve/applications/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +``` + +**Example Response (abridged JSON)**: + +```http +HTTP/1.1 200 OK +Content-Type: application/json + +{ + "proxy_location": "HeadOnly", + "http_options": { + "host": "127.0.0.1", + "port": 8000 + }, + "deploy_mode": "MULTI_APP", + "applications": { + "fruit_stand": { + "name": "fruit_stand", + "route_prefix": "/fruit", + "docs_path": null, + "status": "RUNNING", + "message": "", + "last_deployed_time_s": 1679952253.748111, + "deployed_app_config": "...", + "deployments": { + "fruit_app_MangoStand": { + "name": "fruit_app_MangoStand", + "status": "HEALTHY", + "message": "", + "deployment_config": "...", + "replicas": [ + { + "replica_id": "fruit_app_MangoStand#bSkrHK", + "state": "RUNNING", + "pid": 59350, + "actor_name": "...", + "actor_id": "...", + "node_id": "...", + "node_ip": "...", + "start_time_s": 1679952254.3458009 + } + ] + }, + } + }, + } +} +``` + +#### `DELETE "/api/serve/applications/"` + +Shuts down Serve and all applications running on the Ray cluster. Has no effect if Serve is not running on the Ray cluster. + +**Example Request**: + +```http +DELETE /api/serve/applications/ HTTP/1.1 +Host: http://localhost:52365/ +Accept: application/json +``` + +**Example Response** + +```http +HTTP/1.1 200 OK +Content-Type: application/json +``` + +(serve-rest-api-config-schema)= +## Config Schemas + +```{eval-rst} +.. currentmodule:: ray.serve +``` + + +```{eval-rst} +.. autosummary:: + :toctree: doc/ + + schema.ServeDeploySchema + schema.HTTPOptionsSchema + schema.ServeApplicationSchema + schema.DeploymentSchema + schema.RayActorOptionsSchema +``` + +(serve-rest-api-response-schema)= +## Response Schemas + +#### V1 REST API +```{eval-rst} +.. autosummary:: + :toctree: doc/ + + schema.ServeStatusSchema +``` + +#### V2 REST API +```{eval-rst} +.. 
autosummary:: + :toctree: doc/ + + schema.ServeInstanceDetails + schema.ApplicationDetails + schema.DeploymentDetails + schema.ReplicaDetails +``` diff --git a/doc/source/serve/api/python_api.md b/doc/source/serve/api/python_api.md deleted file mode 100644 index b3ee7719a7d3..000000000000 --- a/doc/source/serve/api/python_api.md +++ /dev/null @@ -1,53 +0,0 @@ -# Ray Serve Python API - -(core-apis)= - -```{eval-rst} -.. currentmodule:: ray -``` - -## Core APIs - -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - serve.run - serve.start - serve.shutdown - serve.delete -``` - -(servehandle-api)= -## ServeHandle API - -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - serve.handle.RayServeHandle - -.. autosummary:: - :toctree: doc/ - - serve.handle.RayServeHandle.remote - serve.handle.RayServeHandle.options -``` - -## Batching Requests - -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - serve.batch -``` - -## Deployment Graph APIs - -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - serve.api.build -``` \ No newline at end of file diff --git a/doc/source/serve/api/rest_api.md b/doc/source/serve/api/rest_api.md deleted file mode 100644 index d0de6fa069c9..000000000000 --- a/doc/source/serve/api/rest_api.md +++ /dev/null @@ -1,317 +0,0 @@ -(serve-rest-api)= - -# Serve REST API - -## V1 REST API (Single-application) - -### `PUT "/api/serve/deployments/"` - -Declaratively deploys the Serve application. Starts Serve on the Ray cluster if it's not already running. See [single-app config schema](serve-rest-api-config-schema) for the request's JSON schema. 
- -**Example Request**: - -```http -PUT /api/serve/deployments/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -Content-Type: application/json - -{ - "import_path": "fruit.deployment_graph", - "runtime_env": { - "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" - }, - "deployments": [ - {"name": "MangoStand", "user_config": {"price": 1}}, - {"name": "OrangeStand", "user_config": {"price": 2}}, - {"name": "PearStand", "user_config": {"price": 3}} - ] -} -``` - -**Example Response** - - -```http -HTTP/1.1 200 OK -Content-Type: application/json -``` - -### `GET "/api/serve/deployments/"` - -Gets the config for the application currently deployed on the Ray cluster. This config represents the current goal state for the Serve application. See [single-app config schema](serve-rest-api-config-schema) for the response's JSON schema. - -**Example Request**: -```http -GET /api/serve/deployments/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -``` - -**Example Response**: - -```http -HTTP/1.1 200 OK -Content-Type: application/json - -{ - "import_path": "fruit.deployment_graph", - "runtime_env": { - "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" - }, - "deployments": [ - {"name": "MangoStand", "user_config": {"price": 1}}, - {"name": "OrangeStand", "user_config": {"price": 2}}, - {"name": "PearStand", "user_config": {"price": 3}} - ] -} -``` - - -### `GET "/api/serve/deployments/status"` - -Gets the Serve application's current status, including all the deployment statuses. See [status schema](serve-rest-api-response-schema) for the response's JSON schema. 
- -**Example Request**: - -```http -GET /api/serve/deployments/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -``` - -**Example Response** - -```http -HTTP/1.1 200 OK -Content-Type: application/json - -{ - "app_status": { - "status": "RUNNING", - "message": "", - "deployment_timestamp": 1855994527.146304 - }, - "deployment_statuses": [ - { - "name": "MangoStand", - "status": "HEALTHY", - "message": "" - }, - { - "name": "OrangeStand", - "status": "HEALTHY", - "message": "" - }, - { - "name": "PearStand", - "status": "HEALTHY", - "message": "" - }, - { - "name": "FruitMarket", - "status": "HEALTHY", - "message": "" - }, - { - "name": "DAGDriver", - "status": "HEALTHY", - "message": "" - } - ] -} -``` - -### `DELETE "/api/serve/deployments/"` - -Shuts down Serve and the Serve application running on the Ray cluster. Has no effect if Serve is not running on the Ray cluster. - -**Example Request**: - -```http -DELETE /api/serve/deployments/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -``` - -**Example Response** - -```http -HTTP/1.1 200 OK -Content-Type: application/json -``` - -## V2 REST API (Multi-application) - -### `PUT "/api/serve/applications/"` - -Declaratively deploys a list of Serve applications. If Serve is already running on the Ray cluster, removes all applications not listed in the new config. If Serve is not running on the Ray cluster, starts Serve. See [multi-app config schema](serve-rest-api-config-schema) for the request's JSON schema. 
- -**Example Request**: - -```http -PUT /api/serve/applications/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -Content-Type: application/json - -{ - "applications": [ - { - "name": "fruit_stand", - "route_prefix": "/fruit", - "import_path": "fruit.deployment_graph", - "runtime_env": { - "working_dir": "https://github.com/ray-project/serve_config_examples/archive/HEAD.zip" - }, - "deployments": [ - {"name": "MangoStand", "user_config": {"price": 1}}, - {"name": "OrangeStand", "user_config": {"price": 2}}, - {"name": "PearStand", "user_config": {"price": 3}} - ] - }, - { - "name": "calculator", - "route_prefix": "/math", - "import_path": "conditional_dag.serve_dag", - "runtime_env": { - "working_dir": "https://github.com/ray-project/test_dag/archive/HEAD.zip" - }, - "deployments": [ - {"name": "Multiplier", "ray_actor_options": {"num_cpus": 0.5}}, - { - "name": "Adder", - "ray_actor_options": {"env_vars": {"override_increment": "5"}} - }, - ] - } - ] -} -``` - - - -**Example Response** - - -```http -HTTP/1.1 200 OK -Content-Type: application/json -``` - -### `GET "/api/serve/applications/"` - -Gets cluster-level info and comprehensive details on all Serve applications deployed on the Ray cluster. See [metadata schema](serve-rest-api-response-schema) for the response's JSON schema. 
- -```http -GET /api/serve/applications/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -``` - -**Example Response (abridged JSON)**: - -```http -HTTP/1.1 200 OK -Content-Type: application/json - -{ - "proxy_location": "HeadOnly", - "http_options": { - "host": "127.0.0.1", - "port": 8000 - }, - "deploy_mode": "MULTI_APP", - "applications": { - "fruit_stand": { - "name": "fruit_stand", - "route_prefix": "/fruit", - "docs_path": null, - "status": "RUNNING", - "message": "", - "last_deployed_time_s": 1679952253.748111, - "deployed_app_config": "...", - "deployments": { - "fruit_app_MangoStand": { - "name": "fruit_app_MangoStand", - "status": "HEALTHY", - "message": "", - "deployment_config": "...", - "replicas": [ - { - "replica_id": "fruit_app_MangoStand#bSkrHK", - "state": "RUNNING", - "pid": 59350, - "actor_name": "...", - "actor_id": "...", - "node_id": "...", - "node_ip": "...", - "start_time_s": 1679952254.3458009 - } - ] - }, - } - }, - } -} -``` - -### `DELETE "/api/serve/applications/"` - -Shuts down Serve and all applications running on the Ray cluster. Has no effect if Serve is not running on the Ray cluster. - -**Example Request**: - -```http -DELETE /api/serve/applications/ HTTP/1.1 -Host: http://localhost:52365/ -Accept: application/json -``` - -**Example Response** - -```http -HTTP/1.1 200 OK -Content-Type: application/json -``` - -(serve-rest-api-config-schema)= -## Serve Config Schema - -```{eval-rst} -.. currentmodule:: ray.serve -``` - - -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - schema.ServeDeploySchema - schema.HTTPOptionsSchema - schema.ServeApplicationSchema - schema.DeploymentSchema - schema.RayActorOptionsSchema -``` - -(serve-rest-api-response-schema)= -## Serve Response Schemas - -### V1 REST API -```{eval-rst} -.. autosummary:: - :toctree: doc/ - - schema.ServeStatusSchema -``` - -### V2 REST API -```{eval-rst} -.. 
This pattern allows you to configure deployments using ordinary Python code, but it requires modifying the code anytime one of the parameters needs to change.
+To avoid writing code to handle type conversions and missing arguments, use a [Pydantic object](typed-app-builders) instead. + +### Passing arguments via `serve run` + +Pass arguments to the application builder from `serve run` using the following syntax: + +```bash +$ serve run hello:app_builder key1=val1 key2=val2 +``` + +The arguments are passed to the application builder as a dictionary, in this case `{"key1": "val1", "key2": "val2"}`. +For example, to pass a new message to the `HelloWorld` app defined above (with the code saved in `hello.py`): + +```bash +% serve run hello:app_builder message="Hello from CLI" +2023-05-16 10:47:31,641 INFO scripts.py:404 -- Running import path: 'hello:app_builder'. +2023-05-16 10:47:33,344 INFO worker.py:1615 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 +(ServeController pid=56826) INFO 2023-05-16 10:47:35,115 controller 56826 deployment_state.py:1244 - Deploying new version of deployment default_HelloWorld. +(ServeController pid=56826) INFO 2023-05-16 10:47:35,141 controller 56826 deployment_state.py:1483 - Adding 1 replica to deployment default_HelloWorld. +(HTTPProxyActor pid=56828) INFO: Started server process [56828] +(ServeReplica:default_HelloWorld pid=56830) Message: Hello from CLI +2023-05-16 10:47:36,131 SUCC scripts.py:424 -- Deployed Serve app successfully. +``` + +Notice that the "Hello from CLI" message is printed from within the deployment constructor. + +### Passing arguments via config file + +Pass arguments to the application builder in the config file's `args` field: + +```yaml +applications: + - name: MyApp + import_path: hello:app_builder + args: + message: "Hello from config" +``` + +For example, to pass a new message to the `HelloWorld` app defined above (with the code saved in `hello.py` and the config saved in `config.yaml`): + +```bash +% serve run config.yaml +2023-05-16 10:49:25,247 INFO scripts.py:351 -- Running config file: 'config.yaml'. 
+2023-05-16 10:49:26,949 INFO worker.py:1615 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 +2023-05-16 10:49:28,678 SUCC scripts.py:419 -- Submitted deploy config successfully. +(ServeController pid=57109) INFO 2023-05-16 10:49:28,676 controller 57109 controller.py:559 - Starting deploy_serve_application task for application MyApp. +(HTTPProxyActor pid=57111) INFO: Started server process [57111] +(ServeController pid=57109) INFO 2023-05-16 10:49:28,942 controller 57109 deployment_state.py:1244 - Deploying new version of deployment MyApp_HelloWorld. +(ServeController pid=57109) INFO 2023-05-16 10:49:29,016 controller 57109 deployment_state.py:1483 - Adding 1 replica to deployment MyApp_HelloWorld. +(ServeReplica:MyApp_HelloWorld pid=57113) Message: Hello from config +(ServeController pid=57109) INFO 2023-05-16 10:49:30,046 controller 57109 application_state.py:202 - Deploy task for app 'MyApp' ran successfully. +``` + +Notice that the "Hello from config" message is printed from within the deployment constructor. + +(typed-app-builders)= +### Typing arguments with Pydantic + +To avoid writing logic to parse and validate the arguments by hand, define a [Pydantic model](https://pydantic-docs.helpmanual.io/usage/models/) as the single input parameter's type to your application builder function (the parameter must be type annotated). +Arguments are passed the same way, but the resulting dictionary is used to construct the Pydantic model using `model.parse_obj(args_dict)`. + +```{literalinclude} ../serve/doc_code/app_builder.py +:start-after: __begin_typed_builder__ +:end-before: __end_typed_builder__ +:language: python +``` + +```bash +% serve run hello:app_builder message="Hello from CLI" +2023-05-16 10:47:31,641 INFO scripts.py:404 -- Running import path: 'hello:app_builder'. +2023-05-16 10:47:33,344 INFO worker.py:1615 -- Started a local Ray instance. 
View the dashboard at http://127.0.0.1:8265 +(ServeController pid=56826) INFO 2023-05-16 10:47:35,115 controller 56826 deployment_state.py:1244 - Deploying new version of deployment default_HelloWorld. +(ServeController pid=56826) INFO 2023-05-16 10:47:35,141 controller 56826 deployment_state.py:1483 - Adding 1 replica to deployment default_HelloWorld. +(HTTPProxyActor pid=56828) INFO: Started server process [56828] +(ServeReplica:default_HelloWorld pid=56830) Message: Hello from CLI +2023-05-16 10:47:36,131 SUCC scripts.py:424 -- Deployed Serve app successfully. +``` + +## Common patterns + +### Multiple parametrized applications using the same builder + +You can use application builders to run multiple applications with the same code but different parameters. +For example, multiple applications may share preprocessing and HTTP handling logic but use many different trained model weights. +The same application builder `import_path` can take different arguments to define multiple applications as follows: + +```yaml +applications: + - name: Model1 + import_path: my_module:my_model_code + args: + model_uri: s3://my_bucket/model_1 + - name: Model2 + import_path: my_module:my_model_code + args: + model_uri: s3://my_bucket/model_2 + - name: Model3 + import_path: my_module:my_model_code + args: + model_uri: s3://my_bucket/model_3 +``` + +### Configuring multiple composed deployments + +You can use the arguments passed to an application builder to configure multiple deployments in a single application. 
+For example, a model composition application might take weights for two different models as follows: + +```{literalinclude} ../serve/doc_code/app_builder.py +:start-after: __begin_composed_builder__ +:end-before: __end_composed_builder__ +:language: python +``` diff --git a/doc/source/serve/dev-workflow.md b/doc/source/serve/dev-workflow.md index d32626d049a7..8a40215feeb9 100644 --- a/doc/source/serve/dev-workflow.md +++ b/doc/source/serve/dev-workflow.md @@ -88,7 +88,7 @@ When making the transition from your local machine to a remote cluster, you'll n Let's see a simple example that just packages the code. Run the following command on your local machine, with your remote cluster head node IP address substituted for `` in the command: ```bash -serve run --address=ray://:10001 --working_dir="./project/src" local_dev:graph +serve run --address=ray://:10001 --working-dir="./project/src" local_dev:graph ``` This will connect to the remote cluster via Ray Client, upload the `working_dir` directory, and run your serve application. Here, the local directory specified by `working_dir` must contain `local_dev.py` so that it can be uploaded to the cluster and imported by Ray Serve. 
diff --git a/doc/source/serve/doc_code/app_builder.py b/doc/source/serve/doc_code/app_builder.py new file mode 100644 index 000000000000..99fc28c66508 --- /dev/null +++ b/doc/source/serve/doc_code/app_builder.py @@ -0,0 +1,79 @@ +# flake8: noqa + +# __begin_untyped_builder__ +from typing import Dict + +from ray import serve +from ray.serve import Application + + +@serve.deployment +class HelloWorld: + def __init__(self, message: str): + self._message = message + print("Message:", self._message) + + def __call__(self, request): + return self._message + + +def app_builder(args: Dict[str, str]) -> Application: + return HelloWorld.bind(args["message"]) + + +# __end_untyped_builder__ + +serve.run(app_builder({"message": "Hello bar"})) +resp = requests.get("http://localhost:8000") +assert resp.text == "Hello bar" + +# __begin_typed_builder__ +from pydantic import BaseModel + +from ray import serve +from ray.serve import Application + + +class HelloWorldArgs(BaseModel): + message: str + + +@serve.deployment +class HelloWorld: + def __init__(self, message: str): + self._message = message + print("Message:", self._message) + + def __call__(self, request): + return self._message + + +def typed_app_builder(args: HelloWorldArgs) -> Application: + return HelloWorld.bind(args.message) + + +# __end_typed_builder__ + +serve.run(typed_app_builder(HelloWorldArgs(message="Hello baz"))) +resp = requests.get("http://localhost:8000") +assert resp.text == "Hello baz" + +# __begin_composed_builder__ +from pydantic import BaseModel + +from ray.serve import Application + + +class ComposedArgs(BaseModel): + model1_uri: str + model2_uri: str + + +def composed_app_builder(args: ComposedArgs) -> Application: + return IngressDeployment.bind( + Model1.bind(args.model1_uri), + Model2.bind(args.model2_uri), + ) + + +# __end_composed_builder__ diff --git a/doc/source/serve/doc_code/getting_started/model_deployment.py b/doc/source/serve/doc_code/getting_started/model_deployment.py index 
571de9dda515..482d12ffe4ad 100644 --- a/doc/source/serve/doc_code/getting_started/model_deployment.py +++ b/doc/source/serve/doc_code/getting_started/model_deployment.py @@ -35,11 +35,11 @@ async def __call__(self, http_request: Request) -> str: # __model_end__ # __model_deploy_start__ -translator = Translator.bind() +translator_app = Translator.bind() # __model_deploy_end__ -translator = Translator.options(ray_actor_options={}).bind() -serve.run(translator) +translator_app = Translator.options(ray_actor_options={}).bind() +serve.run(translator_app) # __client_function_start__ # File name: model_client.py diff --git a/doc/source/serve/doc_code/getting_started/model_deployment_full.py b/doc/source/serve/doc_code/getting_started/model_deployment_full.py index f6166779749e..16a4f65d5ade 100644 --- a/doc/source/serve/doc_code/getting_started/model_deployment_full.py +++ b/doc/source/serve/doc_code/getting_started/model_deployment_full.py @@ -30,11 +30,11 @@ async def __call__(self, http_request: Request) -> str: return self.translate(english_text) -translator = Translator.bind() +translator_app = Translator.bind() # __deployment_full_end__ -translator = Translator.options(ray_actor_options={}).bind() -serve.run(translator) +translator_app = Translator.options(ray_actor_options={}).bind() +serve.run(translator_app) import requests diff --git a/doc/source/serve/doc_code/getting_started/model_graph.py b/doc/source/serve/doc_code/getting_started/model_graph.py index b20a91ee3a69..7db64b9ddc83 100644 --- a/doc/source/serve/doc_code/getting_started/model_graph.py +++ b/doc/source/serve/doc_code/getting_started/model_graph.py @@ -6,6 +6,7 @@ import ray from ray import serve +from ray.serve.handle import RayServeHandle from transformers import pipeline @@ -28,7 +29,7 @@ def translate(self, text: str) -> str: @serve.deployment class Summarizer: - def __init__(self, translator): + def __init__(self, translator: RayServeHandle): # Load model self.model = pipeline("summarization", 
model="t5-small") self.translator = translator @@ -52,13 +53,13 @@ async def __call__(self, http_request: Request) -> str: return translation -deployment_graph = Summarizer.bind(Translator.bind()) +app = Summarizer.bind(Translator.bind()) # __end_graph__ -serve.run(deployment_graph) +serve.run(app) # __start_client__ -# File name: graph_client.py +# File name: composed_client.py import requests english_text = ( diff --git a/doc/source/serve/doc_code/quickstart.py b/doc/source/serve/doc_code/quickstart.py index adaf042e3b76..13aaddfca819 100644 --- a/doc/source/serve/doc_code/quickstart.py +++ b/doc/source/serve/doc_code/quickstart.py @@ -5,7 +5,7 @@ from ray import serve -# 1: Define a Ray Serve deployment. +# 1: Define a Ray Serve application. @serve.deployment(route_prefix="/") class MyModelDeployment: def __init__(self, msg: str): @@ -16,9 +16,11 @@ def __call__(self, request: Request) -> Dict: return {"result": self._msg} -# 2: Deploy the model. -serve.run(MyModelDeployment.bind(msg="Hello world!")) +app = MyModelDeployment.bind(msg="Hello world!") -# 3: Query the deployment and print the result. +# 2: Deploy the application locally. +serve.run(app) + +# 3: Query the application and print the result. print(requests.get("http://localhost:8000/").json()) # {'result': 'Hello world!'} diff --git a/doc/source/serve/doc_code/quickstart_composed.py b/doc/source/serve/doc_code/quickstart_composed.py new file mode 100644 index 000000000000..f88250d8116c --- /dev/null +++ b/doc/source/serve/doc_code/quickstart_composed.py @@ -0,0 +1,47 @@ +import requests +import starlette +from typing import Dict +from ray import serve +from ray.serve.handle import RayServeHandle + + +# 1. Define the models in our composition graph and an ingress that calls them. 
+@serve.deployment +class Adder: + def __init__(self, increment: int): + self.increment = increment + + def add(self, inp: int): + return self.increment + inp + + +@serve.deployment +class Combiner: + def average(self, *inputs) -> float: + return sum(inputs) / len(inputs) + + +@serve.deployment +class Ingress: + def __init__( + self, adder1: RayServeHandle, adder2: RayServeHandle, combiner: RayServeHandle + ): + self._adder1, self._adder2, self._combiner = adder1, adder2, combiner + + async def __call__(self, request: starlette.requests.Request) -> Dict[str, float]: + input_json = await request.json() + + adder1_result = await self._adder1.add.remote(input_json["val"]) + adder2_result = await self._adder2.add.remote(input_json["val"]) + final_result = await self._combiner.average.remote(adder1_result, adder2_result) + + return {"result": await final_result} + + +# 2. Build the application consisting of the models and ingress. +app = Ingress.bind(Adder.bind(increment=1), Adder.bind(increment=2), Combiner.bind()) +serve.run(app) + +# 3: Query the application and print the result. +print(requests.post("http://localhost:8000/", json={"val": 100.0}).json()) +# {"result": 101.5} diff --git a/doc/source/serve/doc_code/quickstart_graph.py b/doc/source/serve/doc_code/quickstart_graph.py deleted file mode 100644 index a98e6aaf8d4a..000000000000 --- a/doc/source/serve/doc_code/quickstart_graph.py +++ /dev/null @@ -1,35 +0,0 @@ -import requests -from ray import serve -from ray.serve.drivers import DAGDriver -from ray.serve.dag import InputNode -from ray.serve.http_adapters import json_request - - -# 1. 
Define the models in our composition graph -@serve.deployment -class Adder: - def __init__(self, increment: int): - self.increment = increment - - def predict(self, inp: int): - return self.increment + inp - - -@serve.deployment -def combine_average(*input_values) -> float: - return {"result": sum(input_values) / len(input_values)} - - -# 2: Define the model composition graph and call it. -with InputNode() as input_node: - adder_1 = Adder.bind(increment=1) - adder_2 = Adder.bind(increment=2) - dag = combine_average.bind( - adder_1.predict.bind(input_node), adder_2.predict.bind(input_node) - ) - -serve.run(DAGDriver.bind(dag, http_adapter=json_request)) - -# 3: Query the deployment and print the result. -print(requests.post("http://localhost:8000/", json=100).json()) -# {"result": 101.5} diff --git a/doc/source/serve/doc_code/stable_diffusion.py b/doc/source/serve/doc_code/stable_diffusion.py index d60db3e75005..8b9a86cf0a49 100644 --- a/doc/source/serve/doc_code/stable_diffusion.py +++ b/doc/source/serve/doc_code/stable_diffusion.py @@ -59,7 +59,7 @@ def generate(self, prompt: str, img_size: int = 512): return image -my_first_deployment = APIIngress.bind(StableDiffusionV2.bind()) +entrypoint = APIIngress.bind(StableDiffusionV2.bind()) # __example_code_end__ @@ -88,7 +88,7 @@ def serve_session(deployment): } ) - with serve_session(my_first_deployment) as handle: + with serve_session(entrypoint) as handle: ray.get(handle.generate.remote("hi")) prompt = "a cute cat is dancing on the grass." diff --git a/doc/source/serve/getting_started.md b/doc/source/serve/getting_started.md index b639a3916744..a900ffd1d39f 100644 --- a/doc/source/serve/getting_started.md +++ b/doc/source/serve/getting_started.md @@ -2,10 +2,10 @@ # Getting Started -This tutorial will walk you through the process of deploying models with Ray Serve. It will show you how to +This tutorial will walk you through the process of writing and testing a Ray Serve application. 
It will show you how to -* expose your models over HTTP using deployments -* test your deployments over HTTP +* convert a machine learning model to a Ray Serve deployment +* test a Ray Serve application locally over HTTP * compose multiple-model machine learning models together into a single application We'll use two models in this tutorial: @@ -29,7 +29,7 @@ pip install "ray[serve]" transformers requests torch ``` -## Model Example: Before Ray Serve +## Text Translation Model (before Ray Serve) First, let's take a look at our text-translation model. Here's its code: @@ -37,18 +37,16 @@ First, let's take a look at our text-translation model. Here's its code: :start-after: __start_translation_model__ :end-before: __end_translation_model__ :language: python -:linenos: true ``` The Python file, called `model.py`, uses the `Translator` class to translate English text to French. -- The `self.model` variable on line 8 inside `Translator`'s `__init__` method +- The `self.model` variable inside `Translator`'s `__init__` method stores a function that uses the [t5-small](https://huggingface.co/t5-small) model to translate text. - When `self.model` is called on English text, it returns translated French text inside a dictionary formatted as `[{"translation_text": "..."}]`. -- The `Translator`'s `translate` method extracts the translated text on - line 15 by indexing into the dictionary. +- The `Translator`'s `translate` method extracts the translated text by indexing into the dictionary. You can copy-paste this script and run it locally. It translates `"Hello world!"` into `"Bonjour Monde!"`. @@ -66,14 +64,14 @@ PyTorch, and Tensorflow for more info and examples: - {ref}`serve-ml-models-tutorial` -(converting-to-ray-serve-deployment)= -## Converting to a Ray Serve Deployment +(converting-to-ray-serve-application)= +## Converting to a Ray Serve Application In this section, we'll deploy the text translation model using Ray Serve, so it can be scaled up and queried over HTTP. 
We'll start by converting -`Translator` into a Ray Serve deployment that runs locally on your computer. +`Translator` into a Ray Serve deployment. -First, we open a new Python file and import `ray` and `ray serve`: +First, we open a new Python file and import `ray` and `ray.serve`: ```{literalinclude} ../serve/doc_code/getting_started/model_deployment.py :start-after: __import_start__ @@ -93,8 +91,7 @@ The `Translator` class has two modifications: 1. It has a decorator, `@serve.deployment`. 2. It has a new method, `__call__`. -The decorator converts `Translator` from a Python class into a Ray Serve -`Deployment` object. +The decorator converts `Translator` from a Python class into a Ray Serve `Deployment` object. Each deployment stores a single Python function or class that you write and uses it to serve requests. You can scale and configure each of your deployments independently using @@ -114,11 +111,11 @@ class Translator: ... ``` -Deployments receive Starlette HTTP `request` objects [^f1]. If your deployment stores a Python function, the function is called on this `request` object. If your deployment stores a class, the class's `__call__` method is called on this `request` object. The return value is sent back in the HTTP response body. +Deployments receive Starlette HTTP `request` objects [^f1]. By default, the deployment class's `__call__` method is called on this `request` object. The return value is sent back in the HTTP response body. This is why `Translator` needs a new `__call__` method. The method processes the incoming HTTP request by reading its JSON data and forwarding it to the `translate` method. The translated text is returned and sent back through the HTTP response. You can also use Ray Serve's FastAPI integration to avoid working with raw HTTP requests. Check out {ref}`serve-fastapi-http` for more info about FastAPI with Serve. -Next, we need to `bind` our `Translator` deployment to arguments that Ray Serve can pass into its constructor. 
This will let Ray Serve initialize a `Translator` object that can serve requests. Since `Translator`'s constructor doesn't take in any arguments, we can call the deployment's `bind` method without passing anything in: +Next, we need to `bind` our `Translator` deployment to arguments that will be passed into its constructor. This defines a Ray Serve application that we can run locally or deploy to production (you'll see later that applications can consist of multiple deployments). Since `Translator`'s constructor doesn't take in any arguments, we can call the deployment's `bind` method without passing anything in: ```{literalinclude} ../serve/doc_code/getting_started/model_deployment.py :start-after: __model_deploy_start__ @@ -126,37 +123,33 @@ Next, we need to `bind` our `Translator` deployment to arguments that Ray Serve :language: python ``` -With that, we can run our model on Ray Serve! -Here's the full Ray Serve script that we built: +With that, we are ready to test the application locally. + +## Running a Ray Serve Application + +Here's the full Ray Serve script that we built above: ```{literalinclude} ../serve/doc_code/getting_started/model_deployment_full.py :start-after: __deployment_full_start__ :end-before: __deployment_full_end__ :language: python -:linenos: true ``` -We can run our script with the `serve run` CLI command. This command takes in an import path -to our deployment formatted as `module:bound_deployment`. Make sure to run the command from a directory containing a local copy of this script, so it can find the bound deployment: +To test locally, we run the script with the `serve run` CLI command. This command takes in an import path +to our deployment formatted as `module:application`. 
Make sure to run the command from a directory containing a local copy of this script saved as `serve_quickstart.py`, so it can import the application: ```console -$ serve run serve_deployment:translator +$ serve run serve_quickstart:translator_app ``` -This command will start running `Translator` and then block. It can be killed with `ctrl-C` in the terminal. - -## Testing Ray Serve Deployments +This command will run the `translator_app` application and then block, streaming logs to the console. It can be killed with `Ctrl-C`, which will tear down the application. -We can now test our model over HTTP. It can be reached at the following URL: +We can now test our model over HTTP. It can be reached at the following URL by default: ``` http://127.0.0.1:8000/ ``` -Since the cluster is deployed locally in this tutorial, the `127.0.0.1:8000` -refers to a localhost with port 8000 (the default port where you can reach -Serve deployments). - We'll send a POST request with JSON data containing our English text. `Translator`'s `__call__` method will unpack this text and forward it to the `translate` method. Here's a client script that requests a translation for "Hello world!": @@ -170,7 +163,7 @@ We'll send a POST request with JSON data containing our English text. To test our deployment, first make sure `Translator` is running: ``` -$ serve run serve_deployment:translator +$ serve run serve_quickstart:translator_app ``` While `Translator` is running, we can open a separate terminal window and run the client script. This will get a response over HTTP: @@ -181,9 +174,10 @@ $ python model_client.py Bonjour monde! ``` -## Composing Machine Learning Models with Deployment Graphs +## Composing Multiple Models -Ray Serve's Deployment Graph API allows us to compose multiple machine learning models together into a single Ray Serve application. We can use parameters like `num_replicas`, `num_cpus`, and `num_gpus` to independently configure and scale each deployment in the graph. 
+Ray Serve allows you to compose multiple deployments into a single Ray Serve application. This makes it easy to combine multiple machine learning models along with business logic to serve a single request. +We can use parameters like `autoscaling_config`, `num_replicas`, `num_cpus`, and `num_gpus` to independently configure and scale each deployment in the application. For example, let's deploy a machine learning pipeline with two steps: @@ -201,39 +195,38 @@ For example, let's deploy a machine learning pipeline with two steps: You can copy-paste this script and run it locally. It summarizes the snippet from _A Tale of Two Cities_ to `it was the best of times, it was worst of times .` ```console -$ python model.py +$ python summary_model.py it was the best of times, it was worst of times . ``` -Here's a Ray Serve deployment graph that chains the two models together. The graph takes English text, summarizes it, and then translates it: +Here's an application that chains the two models together. The graph takes English text, summarizes it, and then translates it: ```{literalinclude} ../serve/doc_code/getting_started/model_graph.py :start-after: __start_graph__ :end-before: __end_graph__ :language: python -:linenos: true ``` -This script contains our `Summarizer` class converted to a deployment and our `Translator` class with some modifications. In this script, the `Summarizer` class contains the `__call__` method since requests are sent to it first. It also takes in the `Translator` as one of its constructor arguments, so it can forward summarized texts to the `Translator` deployment. The `__call__` method also contains some new code on lines 44 and 45: +This script contains our `Summarizer` class converted to a deployment and our `Translator` class with some modifications. In this script, the `Summarizer` class contains the `__call__` method since requests are sent to it first. 
It also takes in the `Translator` as one of its constructor arguments, so it can forward summarized texts to the `Translator` deployment. The `__call__` method also contains some new code: ```python translation_ref = await self.translator.translate.remote(summary) translation = await translation_ref ``` -`self.translator.translate.remote(summary)` issues an asynchronous call to the `Translator`'s `translate` method. Essentially, this line tells Ray to schedule a request to the `Translator` deployment's `translate` method, which can be fulfilled asynchronously. The line immediately returns a reference to the method's output. The next line `await translation_ref` waits for `translate` to execute and returns the value of that execution. +`self.translator.translate.remote(summary)` issues an asynchronous call to the `Translator`'s `translate` method. The line immediately returns a reference to the method's output, then the next line `await translation_ref` waits for `translate` to execute and returns the value of that execution. -We compose our graph in line 52: +We define the full application as follows: ```python deployment_graph = Summarizer.bind(Translator.bind()) ``` -Here, we bind `Translator` to its (empty) constructor arguments, and then we pass in the bound `Translator` as the constructor argument for the `Summarizer`. We can run this deployment graph using the `serve run` CLI command. Make sure to run this command from a directory containing a local copy of the `graph.py` code: +Here, we bind `Translator` to its (empty) constructor arguments, and then we pass in the bound `Translator` as the constructor argument for the `Summarizer`. We can run this deployment graph using the `serve run` CLI command. 
Make sure to run this command from a directory containing a local copy of the `serve_quickstart_composed.py` code: ```console -$ serve run graph:deployment_graph +$ serve run serve_quickstart_composed:app ``` We can use this client script to make requests to the graph: @@ -244,15 +237,15 @@ We can use this client script to make requests to the graph: :language: python ``` -While the graph is running, we can open a separate terminal window and run the client script: +While the application is running, we can open a separate terminal window and query it: ```console -$ python graph_client.py +$ python composed_client.py c'était le meilleur des temps, c'était le pire des temps . ``` -Deployment graphs are useful since they let you deploy each part of your machine learning pipeline, such as inference and business logic steps, in separate deployments. Each of these deployments can be individually configured and scaled, ensuring you get maximal performance from your resources. See the guide on [model composition](serve-model-composition) to learn more. +Composed Ray Serve applications let you deploy each part of your machine learning pipeline, such as inference and business logic steps, in separate deployments. Each of these deployments can be individually configured and scaled, ensuring you get maximal performance from your resources. See the guide on [model composition](serve-model-composition) to learn more. ## Next Steps @@ -264,6 +257,4 @@ Deployment graphs are useful since they let you deploy each part of your machine ```{rubric} Footnotes ``` -[^f1]: [Starlette](https://www.starlette.io/) is a web server framework - used by Ray Serve. Its [Request](https://www.starlette.io/requests/) class - provides a nice interface for incoming HTTP requests. +[^f1]: [Starlette](https://www.starlette.io/) is a web server framework used by Ray Serve. 
diff --git a/doc/source/serve/index.md b/doc/source/serve/index.md index 7d5fe2fa3011..3c81829c6d03 100644 --- a/doc/source/serve/index.md +++ b/doc/source/serve/index.md @@ -19,7 +19,7 @@ (rayserve-overview)= Ray Serve is a scalable model serving library for building online inference APIs. -Serve is framework agnostic, so you can use a single toolkit to serve everything from deep learning models built with frameworks like PyTorch, Tensorflow, and Keras, to Scikit-Learn models, to arbitrary Python business logic. +Serve is framework-agnostic, so you can use a single toolkit to serve everything from deep learning models built with frameworks like PyTorch, Tensorflow, and Keras, to Scikit-Learn models, to arbitrary Python business logic. Serve is particularly well suited for [model composition](serve-model-composition), enabling you to build a complex inference service consisting of multiple ML models and business logic all in Python code. @@ -32,47 +32,59 @@ Install Ray Serve and its dependencies: ```bash pip install "ray[serve]" ``` - -In this quick-start example we will define a simple "hello world" deployment, deploy it behind HTTP locally, and query it. +Define a simple "hello world" application, run it locally, and query it over HTTP. ```{literalinclude} doc_code/quickstart.py :language: python ``` -:::{tabbed} More examples +::::{tab-set} + +:::{tab-item} More examples + For more examples, select from the tabs. + ::: -:::{tabbed} Model composition +:::: + +::::{tab-set} -In this example, we demonstrate how you can use Serve's model composition API to express a complex computation graph and deploy it as a Serve application. +:::{tab-item} Model composition -```{literalinclude} doc_code/quickstart_graph.py +Use Serve's model composition API to combine multiple deployments into a single application. 
+ +```{literalinclude} doc_code/quickstart_composed.py :language: python ``` + ::: -:::{tabbed} FastAPI integration +:::{tab-item} FastAPI integration -In this example we will use Serve's [FastAPI](https://fastapi.tiangolo.com/) integration to make use of more advanced HTTP functionality. +Use Serve's [FastAPI](https://fastapi.tiangolo.com/) integration to elegantly handle HTTP parsing and validation. ```{literalinclude} doc_code/fastapi_example.py :language: python ``` + ::: -:::{tabbed} Hugging Face Transformers model +:::{tab-item} Hugging Face Transformers model To run this example, install the following: ``pip install transformers`` -In this example we will serve a pre-trained [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) model using Ray Serve. +Serve a pre-trained [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) model using Ray Serve. The model we'll use is a sentiment analysis model: it will take a text string as input and return if the text was "POSITIVE" or "NEGATIVE." ```{literalinclude} doc_code/transformers_example.py :language: python ``` + ::: +:::: + ## Why choose Serve? :::{dropdown} Build end-to-end ML-powered applications @@ -154,7 +166,7 @@ Serve supports arbitrary Python code and therefore integrates well with the MLOp :::{dropdown} TFServing, TorchServe, ONNXRuntime :animate: fade-in-slide-down -Ray Serve is *framework agnostic*, so you can use it alongside any other Python framework or library. +Ray Serve is *framework-agnostic*, so you can use it alongside any other Python framework or library. We believe data scientists should not be bound to a particular machine learning framework. They should be empowered to use the best tool available for the job. @@ -210,83 +222,106 @@ or head over to the {doc}`tutorials/index` to get started building your Ray Serv ```{eval-rst} -.. 
panels:: - :container: text-center - :column: col-lg-6 px-2 py-2 - :card: - - **Getting Started** - ^^^ - - Start with our quick start tutorials for :ref:`deploying a single model locally ` and how to :ref:`convert an existing model into a Ray Serve deployment ` . - - +++ - .. link-button:: getting-started - :type: ref - :text: Get Started with Ray Serve - :classes: btn-outline-info btn-block - --- - - **Key Concepts** - ^^^ - - Understand the key concepts behind Ray Serve. - Learn about :ref:`Deployments `, :ref:`how to query them `, and the :ref:`Deployment Graph ` API for composing models into a graph structure. - - +++ - .. link-button:: serve-key-concepts - :type: ref - :text: Learn Key Concepts - :classes: btn-outline-info btn-block - --- - - **User Guides** - ^^^ - Learn best practices for common patterns like :ref:`scaling and resource allocation ` and :ref:`model composition `. - Learn how to :ref:`develop Serve applications locally ` and :ref:`go to production `. - - +++ - .. link-button:: serve-user-guides - :type: ref - :text: Start Using Ray Serve - :classes: btn-outline-info btn-block - --- - - **Examples** - ^^^ - - Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow `, :ref:`Scikit-Learn `, and :ref:`RLlib `. - - +++ - .. link-button:: serve-examples - :type: ref - :text: Serve Examples - :classes: btn-outline-info btn-block - --- - - **API Reference** - ^^^ - - Get more in-depth information about the Ray Serve API. - - +++ - .. link-button:: serve-api - :type: ref - :text: Read the API Reference - :classes: btn-outline-info btn-block - - --- - - **Serve Architecture** - ^^^ - - Understand how each component in Ray Serve works. - - +++ - .. link-button:: serve-architecture - :type: ref - :text: Understand Serve Architecture - :classes: btn-outline-info btn-block +.. grid:: 1 2 2 2 + :gutter: 1 + :class-container: container pb-3 + + .. 
grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **Getting Started** + ^^^ + + Start with our quick start tutorials for :ref:`deploying a single model locally ` and how to :ref:`convert an existing model into a Ray Serve deployment ` . + + +++ + .. button-ref:: getting-started + :color: primary + :outline: + :expand: + + Get Started with Ray Serve + + .. grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **Key Concepts** + ^^^ + + Understand the key concepts behind Ray Serve. + Learn about :ref:`Deployments `, :ref:`how to query them `, and the :ref:`Deployment Graph ` API for composing models into a graph structure. + + +++ + .. button-ref:: serve-key-concepts + :color: primary + :outline: + :expand: + + Learn Key Concepts + + .. grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **User Guides** + ^^^ + Learn best practices for common patterns like :ref:`scaling and resource allocation ` and :ref:`model composition `. + Learn how to :ref:`develop Serve applications locally ` and :ref:`go to production `. + + +++ + .. button-ref:: serve-user-guides + :color: primary + :outline: + :expand: + + Start Using Ray Serve + + .. grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **Examples** + ^^^ + + Follow the tutorials to learn how to integrate Ray Serve with :ref:`TensorFlow `, :ref:`Scikit-Learn `, and :ref:`RLlib `. + + +++ + .. button-ref:: serve-examples + :color: primary + :outline: + :expand: + + Serve Examples + + .. grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **API Reference** + ^^^ + + Get more in-depth information about the Ray Serve API. + + +++ + .. button-ref:: serve-api + :color: primary + :outline: + :expand: + + Read the API Reference + + .. 
grid-item-card:: + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + **Serve Architecture** + ^^^ + + Understand how each component in Ray Serve works. + + +++ + .. button-ref:: serve-architecture + :color: primary + :outline: + :expand: + + Understand Serve Architecture ``` For more, see the following blog posts about Ray Serve: diff --git a/doc/source/serve/migration.md b/doc/source/serve/migration.md index f19bac4cae7c..1d0501695914 100644 --- a/doc/source/serve/migration.md +++ b/doc/source/serve/migration.md @@ -6,7 +6,7 @@ This section covers what to consider or change in your application when migratin ## What has been changed? -In Ray Serve 2.0, we released a [new deployment API](converting-to-ray-serve-deployment). The 1.x deployment API can still be used, but it will be deprecated in the future version. +In Ray Serve 2.0, we released a [new deployment API](converting-to-ray-serve-application). The 1.x deployment API can still be used, but it will be deprecated in the future version. 
## Migrating the 1.x Deployment diff --git a/doc/source/serve/model_composition.md b/doc/source/serve/model_composition.md index d07dcc68211e..f7d9c542b936 100644 --- a/doc/source/serve/model_composition.md +++ b/doc/source/serve/model_composition.md @@ -331,22 +331,32 @@ You can render an illustration of your deployment graph to see its nodes and the Make sure you have `pydot` and `graphviz` to follow this section: -::::{tabbed} MacOS +::::{tab-set} + +:::{tab-item} MacOS + ``` pip install -U pydot && brew install graphviz ``` -:::: -::::{tabbed} Windows +::: + +:::{tab-item} Windows + ``` pip install -U pydot && winget install graphviz ``` -:::: -::::{tabbed} Linux +::: + +:::{tab-item} Linux + ``` pip install -U pydot && sudo apt-get install -y graphviz ``` + +::: + :::: Here's an example graph: diff --git a/doc/source/serve/production-guide/fault-tolerance.md b/doc/source/serve/production-guide/fault-tolerance.md index eb4fbd837f41..ed28518240f8 100644 --- a/doc/source/serve/production-guide/fault-tolerance.md +++ b/doc/source/serve/production-guide/fault-tolerance.md @@ -60,7 +60,7 @@ See Serve's [Kubernetes production guide](serve-in-production-kubernetes) to lea In this section, you'll learn how to add fault tolerance to Ray's Global Control Store (GCS), which allows your Serve application to serve traffic even when the head node crashes. -By default the Ray head node is a single point of failure: if it crashes, the entire Ray cluster crashes and must be restarted. When running on Kubernetes, the `RayService` controller health-checks the Ray cluster and restarts it if this occurs, but this introduces some downtime. +By default, the Ray head node is a single point of failure: if it crashes, the entire Ray cluster crashes and must be restarted. When running on Kubernetes, the `RayService` controller health-checks the Ray cluster and restarts it if this occurs, but this introduces some downtime. 
In Ray 2.0, KubeRay added **experimental support** for [Global Control Store (GCS) fault tolerance](https://ray-project.github.io/kuberay/guidance/gcs-ft/#ray-gcs-fault-tolerancegcs-ft-experimental), preventing the Ray cluster from crashing if the head node goes down. While the head node is recovering, Serve applications can still handle traffic via worker nodes but cannot be updated or recover from other failures (e.g. actors or worker nodes crashing). @@ -149,7 +149,9 @@ After adding the Redis objects, you also need to modify the `RayService` configu First, you need to update your `RayService` metadata's annotations: -::::{tabbed} Vanilla Config +::::{tab-set} + +:::{tab-item} Vanilla Config ```yaml ... apiVersion: ray.io/v1alpha1 @@ -159,9 +161,9 @@ metadata: spec: ... ``` -:::: +::: -::::{tabbed} Fault Tolerant Config +:::{tab-item} Fault Tolerant Config :selected: ```yaml ... @@ -175,6 +177,8 @@ metadata: spec: ... ``` +::: + :::: The annotations are: @@ -183,7 +187,10 @@ The annotations are: Next, you need to add the `RAY_REDIS_ADDRESS` environment variable to the `headGroupSpec`: -::::{tabbed} Vanilla Config +::::{tab-set} + +:::{tab-item} Vanilla Config + ```yaml apiVersion: ray.io/v1alpha1 kind: RayService @@ -201,10 +208,12 @@ spec: env: ... ``` -:::: -::::{tabbed} Fault Tolerant Config +::: + +:::{tab-item} Fault Tolerant Config :selected: + ```yaml apiVersion: ray.io/v1alpha1 kind: RayService @@ -224,6 +233,8 @@ spec: - name: RAY_REDIS_ADDRESS value: redis:6379 ``` +::: + :::: `RAY_REDIS_ADDRESS`'s value should be your Redis database's `redis://` address. It should contain your Redis database's host and port. An [example Redis address](https://www.iana.org/assignments/uri-schemes/prov/rediss) is `redis://user:secret@localhost:6379/0?foo=bar&qux=baz`. @@ -241,18 +252,22 @@ Check out the KubeRay guide on [GCS fault tolerance](https://ray-project.github. This section explains how Serve recovers from system failures. 
It uses the following Serve application and config as a working example. -::::{tabbed} Python Code +::::{tab-set} + +:::{tab-item} Python Code ```{literalinclude} ../doc_code/fault_tolerance/sleepy_pid.py :start-after: __start__ :end-before: __end__ :language: python ``` -:::: +::: -::::{tabbed} Kubernetes Config +:::{tab-item} Kubernetes Config ```{literalinclude} ../doc_code/fault_tolerance/k8s_config.yaml :language: yaml ``` +::: + :::: Follow the [KubeRay quickstart guide](kuberay-quickstart) to: diff --git a/doc/source/serve/production-guide/monitoring.md b/doc/source/serve/production-guide/monitoring.md index 03f8e61e1237..dc3aba959c33 100644 --- a/doc/source/serve/production-guide/monitoring.md +++ b/doc/source/serve/production-guide/monitoring.md @@ -243,32 +243,39 @@ The following metrics are exposed by Ray Serve: - * deployment * replica * route + * application - The number of queries that have been processed in this replica. * - ``serve_deployment_error_counter`` [**] - * deployment * replica * route + * application - The number of exceptions that have occurred in the deployment. * - ``serve_deployment_replica_starts`` [**] - * deployment * replica + * application - The number of times this replica has been restarted due to failure. * - ``serve_deployment_replica_healthy`` - * deployment * replica + * application - Whether this deployment replica is healthy. 1 means healthy, 0 unhealthy. * - ``serve_deployment_processing_latency_ms`` [**] - * deployment * replica * route + * application - The latency for queries to be processed. * - ``serve_replica_processing_queries`` [**] - * deployment * replica + * application - The current number of queries being processed. * - ``serve_num_http_requests`` [*] - * route * method + * application - The number of HTTP requests processed. 
* - ``serve_num_http_error_requests`` [*] - * route @@ -278,11 +285,13 @@ The following metrics are exposed by Ray Serve: * - ``serve_num_router_requests`` [*] - * deployment * route + * application - The number of requests processed by the router. * - ``serve_handle_request_counter`` [**] - * handle * deployment * route + * application - The number of requests processed by this ServeHandle. * - ``serve_deployment_queued_queries`` [*] - * deployment @@ -293,9 +302,12 @@ The following metrics are exposed by Ray Serve: * error_code * method * route + * application - The number of non-200 HTTP responses returned by each deployment. * - ``serve_http_request_latency_ms`` [*] - * route + * application + - The end-to-end latency of HTTP requests (measured from the Serve HTTP proxy). ``` [*] - only available when using HTTP calls diff --git a/doc/source/serve/tutorials/gradio-dag-visualization.md b/doc/source/serve/tutorials/gradio-dag-visualization.md index 5e34ad62ddb7..516581cfaba8 100644 --- a/doc/source/serve/tutorials/gradio-dag-visualization.md +++ b/doc/source/serve/tutorials/gradio-dag-visualization.md @@ -12,24 +12,34 @@ pip install gradio ``` Additionally, you can optionally install `pydot` and `graphviz`. This will allow this tool to incorporate the complementary [graphical illustration](pydot-visualize-dag) of the nodes and edges. -::::{tabbed} MacOS + +::::{tab-set} + +:::{tab-item} MacOS + ``` pip install -U pydot && brew install graphviz ``` -:::: -::::{tabbed} Windows +::: + +:::{tab-item} Windows + ``` pip install -U pydot && winget install graphviz ``` -:::: -::::{tabbed} Linux +::: + +:::{tab-item} Linux + ``` pip install -U pydot && sudo apt-get install -y graphviz ``` -:::: +::: + +:::: Also, for the [quickstart example](gradio-vis-quickstart), install the `transformers` module to pull models through [HuggingFace's Pipelines](https://huggingface.co/docs/transformers/main_classes/pipelines). 
```console diff --git a/doc/source/serve/tutorials/gradio-integration.md b/doc/source/serve/tutorials/gradio-integration.md index 3f865f185f86..3a8e852a888b 100644 --- a/doc/source/serve/tutorials/gradio-integration.md +++ b/doc/source/serve/tutorials/gradio-integration.md @@ -44,6 +44,10 @@ Currently, there is no support for routing requests properly to multiple replica `GradioServer` is simply `GradioIngress` but wrapped in a Serve deployment. You can use `GradioServer` for the simple wrap-and-deploy use case, but as you will see in the next section, you can use `GradioIngress` to define your own Gradio Server for more customized use cases. ::: +:::{note} +Ray can’t pickle Gradio. Instead, pass a builder function that constructs the Gradio interface. +::: + Using either Gradio app `io` constructed by the builder function above or providing your own application (of type `Interface`, `Block`, `Parallel`, etc.), wrap it in your Gradio Server. Pass the builder function as input to your Gradio Server. It will be used to construct your Gradio app on the Ray cluster. ```{literalinclude} ../doc_code/gradio-integration.py diff --git a/doc/source/serve/tutorials/serve-ml-models.md b/doc/source/serve/tutorials/serve-ml-models.md index 5e63c4779c2d..93432e46f6f8 100644 --- a/doc/source/serve/tutorials/serve-ml-models.md +++ b/doc/source/serve/tutorials/serve-ml-models.md @@ -7,9 +7,9 @@ In this guide, we will show you how to train models from various machine learnin Please see the [Key Concepts](serve-key-concepts) to learn more general information about Ray Serve. +:::::{tab-set} -::::{tabbed} Keras and Tensorflow - +::::{tab-item} Keras and Tensorflow Let's train and deploy a simple Tensorflow neural net. In particular, we will show: @@ -17,7 +17,7 @@ In particular, we will show: - How to train a Tensorflow model and load the model from your file system in your Ray Serve deployment. - How to parse the JSON request and make a prediction. 
-Ray Serve is framework agnostic -- you can use any version of Tensorflow. +Ray Serve is framework-agnostic -- you can use any version of Tensorflow. However, for this tutorial, we will use Tensorflow 2 and Keras. We will also need `requests` to send HTTP requests to your model deployment. If you haven't already, please install Tensorflow 2 and requests by running: ```console @@ -38,14 +38,14 @@ Next, let's train a simple MNIST model using Keras. :end-before: __doc_train_model_end__ ``` -Next, we define a class `TFMnistModel` that will accept HTTP requests and run the MNIST model that we trained. It is decorated with `@serve.deployment` to make it a deployment object so it can be deployed onto Ray Serve. Note that the Serve deployment is exposed over an HTTP route, and by default the `__call__` method is invoked when a request is sent to your deployment over HTTP. +Next, we define a class `TFMnistModel` that will accept HTTP requests and run the MNIST model that we trained. It is decorated with `@serve.deployment` to make it a deployment object, so it can be deployed onto Ray Serve. Note that the Serve deployment is exposed over an HTTP route, and by default the `__call__` method is invoked when a request is sent to your deployment over HTTP. ```{literalinclude} ../doc_code/tutorial_tensorflow.py :start-after: __doc_define_servable_begin__ :end-before: __doc_define_servable_end__ ``` -:::{note} +:::{note} When `TFMnistModel` is deployed and instantiated, it will load the Tensorflow model from your file system so that it can be ready to run inference on the model and serve requests later. 
::: @@ -56,7 +56,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be :end-before: __doc_deploy_end__ ``` -:::{note} +:::{note} `TFMnistModel.bind(TRAINED_MODEL_PATH)` binds the argument `TRAINED_MODEL_PATH` to our deployment and returns a `DeploymentNode` object (wrapping an `TFMnistModel` deployment object) that can then be used to connect with other `DeploymentNodes` to form a more complex [deployment graph](serve-model-composition-deployment-graph). ::: @@ -72,7 +72,7 @@ If you see the following error: ```console TypeError: Descriptors cannot not be created directly. If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0. - If you cannot immediately regenerate your protos, some other possible workarounds are: + If you cannot immediately regenerate your protos, some other possible workarounds are: 1. Downgrade the protobuf package to 3.20.x or lower. 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower). ``` @@ -116,7 +116,7 @@ You should get an output like the following (the exact prediction may vary): ``` :::: -::::{tabbed} Pytorch +::::{tab-item} Pytorch Let's load and deploy a PyTorch Resnet Model. In particular, we will show: @@ -144,7 +144,7 @@ We define a class `ImageModel` that parses the input data, transforms the images :end-before: __doc_define_servable_end__ ``` -:::{note} +:::{note} When `ImageModel` is deployed and instantiated, it will load the resnet18 model from `torchvision` so that it can be ready to run inference on the model and serve requests later. 
::: @@ -155,7 +155,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be :end-before: __doc_deploy_end__ ``` -:::{note} +:::{note} `ImageModel.bind()` returns a `DeploymentNode` object (wrapping an `ImageModel` deployment object) that can then be used to connect with other `DeploymentNodes` to form a more complex [deployment graph](serve-model-composition-deployment-graph). ::: @@ -185,7 +185,7 @@ You should get an output like the following (the exact number may vary): ``` :::: -::::{tabbed} Scikit-Learn +::::{tab-item} Scikit-Learn Let's train and deploy a simple Scikit-Learn classifier. In particular, we will show: @@ -193,7 +193,7 @@ In particular, we will show: - How to load the Scikit-Learn model from file system in your Ray Serve definition. - How to parse the JSON request and make a prediction. -Ray Serve is framework agnostic. You can use any version of sklearn. We will also need `requests` to send HTTP requests to your model deployment. If you haven't already, please install scikit-learn and requests by running: +Ray Serve is framework-agnostic. You can use any version of sklearn. We will also need `requests` to send HTTP requests to your model deployment. If you haven't already, please install scikit-learn and requests by running: ```console $ pip install scikit-learn requests @@ -243,7 +243,7 @@ We define a class `BoostingModel` that runs inference on the `GradientBoosingCla :end-before: __doc_define_servable_end__ ``` -:::{note} +:::{note} When `BoostingModel` is deployed and instantiated, it will load the classifier model that we trained from your file system so that it can be ready to run inference on the model and serve requests later. 
::: @@ -254,7 +254,7 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be :end-before: __doc_deploy_end__ ``` -:::{note} +:::{note} `BoostingModel.bind(MODEL_PATH, LABEL_PATH)` binds the arguments `MODEL_PATH` and `LABEL_PATH` to our deployment and returns a `DeploymentNode` object (wrapping an `BoostingModel` deployment object) that can then be used to connect with other `DeploymentNodes` to form a more complex [deployment graph](serve-model-composition-deployment-graph). ::: @@ -282,4 +282,7 @@ You should get an output like the following (the exact prediction may vary): ```python {"result": "versicolor"} ``` -:::: \ No newline at end of file + +:::: + +::::: diff --git a/doc/source/serve/user-guide.md b/doc/source/serve/user-guide.md index 3ab3ae86889f..cfb3e26a1904 100644 --- a/doc/source/serve/user-guide.md +++ b/doc/source/serve/user-guide.md @@ -8,6 +8,7 @@ This user guide will help you navigate the Ray Serve project and show you how to - [Scaling and Resource Allocation](scaling-and-resource-allocation) - [Model Composition](serve-model-composition) - [Development Workflow](dev-workflow) +- [Passing Arguments to Applications](app-builder-guide) - [Ray Serve Dashboard](dash-serve-view) - [Production Guide](serve-in-production) - [Performance Tuning](performance) diff --git a/doc/source/templates/01_batch_inference/README.md b/doc/source/templates/01_batch_inference/README.md index e87d6a5a92f4..4ed2de358d19 100644 --- a/doc/source/templates/01_batch_inference/README.md +++ b/doc/source/templates/01_batch_inference/README.md @@ -1,7 +1,7 @@ # Scaling Batch Inference with Ray Data This template is a quickstart to using [Ray -Data](https://docs.ray.io/en/latest/data/dataset.html) for batch +Data](https://docs.ray.io/en/latest/data/data.html) for batch inference. Ray Data is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). 
See [this blog @@ -16,12 +16,12 @@ to help you build your own application! At a high level, this template will: 1. [Load your dataset using Ray - Data.](https://docs.ray.io/en/latest/data/creating-datasets.html) + Data.](https://docs.ray.io/en/latest/data/loading-data.html) 2. [Preprocess your dataset before feeding it to your - model.](https://docs.ray.io/en/latest/data/transforming-datasets.html) + model.](https://docs.ray.io/en/latest/data/transforming-data.html) 3. [Initialize your model and perform inference on a shard of your dataset with a remote - actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#callable-class-udfs) + actor.](https://docs.ray.io/en/latest/data/transforming-data.html#callable-class-udfs) 4. [Save your prediction results.](https://docs.ray.io/en/latest/data/api/input_output.html) diff --git a/doc/source/templates/01_batch_inference/batch_inference.ipynb b/doc/source/templates/01_batch_inference/batch_inference.ipynb index a4bf225d399b..a6b8f7222a1a 100644 --- a/doc/source/templates/01_batch_inference/batch_inference.ipynb +++ b/doc/source/templates/01_batch_inference/batch_inference.ipynb @@ -13,9 +13,9 @@ "This template walks through GPU batch prediction on an image dataset using a PyTorch model, but the framework and data format are there just to help you build your own application!\n", "\n", "At a high level, this template will:\n", - "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/creating-datasets.html)\n", - "2. [Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-datasets.html)\n", - "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#callable-class-udfs)\n", + "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/loading-data.html)\n", + "2. 
[Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-data.html)\n", + "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-data.html#reduce-setup-overheads-using-actors)\n", "4. [Save your prediction results.](https://docs.ray.io/en/latest/data/api/input_output.html)\n", "\n", "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" @@ -37,67 +37,61 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c99f142a", "metadata": {}, "source": [ - ">✂️ Replace these values depending on the template size you picked!\n", + ">✂️ Play around with these values!\n", ">\n", - ">For example, for the larger scale template with 4 GPU nodes, you may want to use 4 workers, each using 1 GPU." + ">For example, for a cluster with 4 GPU nodes, you may want 4 workers, each using 1 GPU.\n", + ">Be sure to stay within the resource constraints of your Ray Cluster if autoscaling is not enabled.\n", + ">You can check the available resources in your Ray Cluster with: `ray status`" ] }, { "cell_type": "code", "execution_count": null, - "id": "9aa792fc", + "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", "metadata": { - "tags": [ - "small" - ] + "tags": [] }, "outputs": [], "source": [ - "# Default values for the small-scale template\n", - "NUM_WORKERS: int = 1\n", - "\n", - "USE_GPU: bool = True\n", + "NUM_WORKERS: int = 4\n", "NUM_GPUS_PER_WORKER: float = 1\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "770bbdc7", + "metadata": {}, "outputs": [], "source": [ - "# Default values for the large-scale template\n", - "NUM_WORKERS: int = 4\n", - "\n", - "USE_GPU: bool = True\n", - "NUM_GPUS_PER_WORKER: float = 1\n" + "!ray status" ] }, { + "attachments": {}, 
"cell_type": "markdown", "id": "23321ba8", "metadata": {}, "source": [ "```{tip}\n", - "Try setting `NUM_GPUS_PER_WORKER` to a fractional amount! This will leverage Ray's fractional resource allocation, which means you can schedule multiple batch inference workers to happen on the same GPU.\n", + "Try setting `NUM_GPUS_PER_WORKER` to a fractional amount! This will leverage Ray's fractional resource allocation, which means you can schedule multiple batch inference workers to use the same GPU.\n", "```" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3b6f2352", "metadata": {}, "source": [ - "> ✂️ Replace this function with logic to load your own data with Ray Data." + "> ✂️ Replace this function with logic to load your own data with Ray Data.\n", + ">\n", + "> See [the Ray Data guide on creating datasets](https://docs.ray.io/en/latest/data/creating-datasets.html) to learn how to create a dataset based on the data type and how file storage format." ] }, { @@ -107,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "def load_ray_dataset() -> ray.data.Dataset:\n", + "def load_ray_dataset():\n", " from ray.data.datasource.partitioning import Partitioning\n", "\n", " s3_uri = \"s3://anonymous@air-example-data-2/imagenette2/val/\"\n", @@ -173,7 +167,9 @@ "outputs": [], "source": [ "ds = ds.map_batches(preprocess, batch_format=\"numpy\")\n", - "ds.schema()\n" + "\n", + "print(\"Dataset schema:\\n\", ds.schema())\n", + "print(\"Number of images:\", ds.count())\n" ] }, { @@ -204,9 +200,9 @@ " def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", " # \n", " input_data = torch.as_tensor(batch[\"image\"], device=self.device)\n", - " with torch.no_grad():\n", - " result = self.model(input_data)\n", - " return {\"predictions\": result.cpu().numpy()}\n" + " with torch.inference_mode():\n", + " pred = self.model(input_data)\n", + " return {\"predicted_class_index\": pred.argmax(dim=1).detach().cpu().numpy()}\n" ] }, { @@ -228,10 +224,11 @@ " 
PredictCallable,\n", " batch_size=128,\n", " compute=ray.data.ActorPoolStrategy(\n", - " # Fix the number of batch inference workers to a specified value.\n", - " size=NUM_WORKERS,\n", + " # Fix the number of batch inference workers to `NUM_WORKERS`.\n", + " min_size=NUM_WORKERS,\n", + " max_size=NUM_WORKERS,\n", " ),\n", - " num_gpus=NUM_GPUS_PER_WORKER if USE_GPU else 0,\n", + " num_gpus=NUM_GPUS_PER_WORKER,\n", " batch_format=\"numpy\",\n", ")\n" ] @@ -247,6 +244,15 @@ "preds.schema()\n" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2565ba08", + "metadata": {}, + "source": [ + "Show the first few predictions!" + ] + }, { "cell_type": "code", "execution_count": null, @@ -254,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "preds.take(1)\n" + "preds.take(5)\n" ] }, { @@ -293,14 +299,6 @@ " predictions.repartition(num_shards).write_parquet(f\"local://{temp_dir}\")\n", " print(f\"Predictions saved to `{temp_dir}`!\")\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e88a268", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/doc/source/templates/02_many_model_training/item_ids.csv b/doc/source/templates/02_many_model_training/item_ids.csv deleted file mode 100644 index cf7f79dd1de2..000000000000 --- a/doc/source/templates/02_many_model_training/item_ids.csv +++ /dev/null @@ -1,5001 +0,0 @@ -Unnamed: 0,item_id -0,FOODS_1_001_CA_1 -1,FOODS_1_001_CA_2 -2,FOODS_1_001_CA_3 -3,FOODS_1_001_CA_4 -4,FOODS_1_001_TX_1 -5,FOODS_1_001_TX_2 -6,FOODS_1_001_TX_3 -7,FOODS_1_001_WI_1 -8,FOODS_1_001_WI_2 -9,FOODS_1_001_WI_3 -10,FOODS_1_002_CA_1 -11,FOODS_1_002_CA_2 -12,FOODS_1_002_CA_3 -13,FOODS_1_002_CA_4 -14,FOODS_1_002_TX_1 -15,FOODS_1_002_TX_2 -16,FOODS_1_002_TX_3 -17,FOODS_1_002_WI_1 -18,FOODS_1_002_WI_2 -19,FOODS_1_002_WI_3 -20,FOODS_1_003_CA_1 -21,FOODS_1_003_CA_2 -22,FOODS_1_003_CA_3 -23,FOODS_1_003_CA_4 -24,FOODS_1_003_TX_1 -25,FOODS_1_003_TX_2 -26,FOODS_1_003_TX_3 -27,FOODS_1_003_WI_1 
-28,FOODS_1_003_WI_2 -29,FOODS_1_003_WI_3 -30,FOODS_1_004_CA_1 -31,FOODS_1_004_CA_2 -32,FOODS_1_004_CA_3 -33,FOODS_1_004_CA_4 -34,FOODS_1_004_TX_1 -35,FOODS_1_004_TX_2 -36,FOODS_1_004_TX_3 -37,FOODS_1_004_WI_1 -38,FOODS_1_004_WI_2 -39,FOODS_1_004_WI_3 -40,FOODS_1_005_CA_1 -41,FOODS_1_005_CA_2 -42,FOODS_1_005_CA_3 -43,FOODS_1_005_CA_4 -44,FOODS_1_005_TX_1 -45,FOODS_1_005_TX_2 -46,FOODS_1_005_TX_3 -47,FOODS_1_005_WI_1 -48,FOODS_1_005_WI_2 -49,FOODS_1_005_WI_3 -50,FOODS_1_006_CA_1 -51,FOODS_1_006_CA_2 -52,FOODS_1_006_CA_3 -53,FOODS_1_006_CA_4 -54,FOODS_1_006_TX_1 -55,FOODS_1_006_TX_2 -56,FOODS_1_006_TX_3 -57,FOODS_1_006_WI_1 -58,FOODS_1_006_WI_2 -59,FOODS_1_006_WI_3 -60,FOODS_1_008_CA_1 -61,FOODS_1_008_CA_2 -62,FOODS_1_008_CA_3 -63,FOODS_1_008_CA_4 -64,FOODS_1_008_TX_1 -65,FOODS_1_008_TX_2 -66,FOODS_1_008_TX_3 -67,FOODS_1_008_WI_1 -68,FOODS_1_008_WI_2 -69,FOODS_1_008_WI_3 -70,FOODS_1_009_CA_1 -71,FOODS_1_009_CA_2 -72,FOODS_1_009_CA_3 -73,FOODS_1_009_CA_4 -74,FOODS_1_009_TX_1 -75,FOODS_1_009_TX_2 -76,FOODS_1_009_TX_3 -77,FOODS_1_009_WI_1 -78,FOODS_1_009_WI_2 -79,FOODS_1_009_WI_3 -80,FOODS_1_010_CA_1 -81,FOODS_1_010_CA_2 -82,FOODS_1_010_CA_3 -83,FOODS_1_010_CA_4 -84,FOODS_1_010_TX_1 -85,FOODS_1_010_TX_2 -86,FOODS_1_010_TX_3 -87,FOODS_1_010_WI_1 -88,FOODS_1_010_WI_2 -89,FOODS_1_010_WI_3 -90,FOODS_1_011_CA_1 -91,FOODS_1_011_CA_2 -92,FOODS_1_011_CA_3 -93,FOODS_1_011_CA_4 -94,FOODS_1_011_TX_1 -95,FOODS_1_011_TX_2 -96,FOODS_1_011_TX_3 -97,FOODS_1_011_WI_1 -98,FOODS_1_011_WI_2 -99,FOODS_1_011_WI_3 -100,FOODS_1_012_CA_1 -101,FOODS_1_012_CA_2 -102,FOODS_1_012_CA_3 -103,FOODS_1_012_CA_4 -104,FOODS_1_012_TX_1 -105,FOODS_1_012_TX_2 -106,FOODS_1_012_TX_3 -107,FOODS_1_012_WI_1 -108,FOODS_1_012_WI_2 -109,FOODS_1_012_WI_3 -110,FOODS_1_013_CA_1 -111,FOODS_1_013_CA_2 -112,FOODS_1_013_CA_3 -113,FOODS_1_013_CA_4 -114,FOODS_1_013_TX_1 -115,FOODS_1_013_TX_2 -116,FOODS_1_013_TX_3 -117,FOODS_1_013_WI_1 -118,FOODS_1_013_WI_2 -119,FOODS_1_013_WI_3 -120,FOODS_1_014_CA_1 -121,FOODS_1_014_CA_2 
-122,FOODS_1_014_CA_3 -123,FOODS_1_014_CA_4 -124,FOODS_1_014_TX_1 -125,FOODS_1_014_TX_2 -126,FOODS_1_014_TX_3 -127,FOODS_1_014_WI_1 -128,FOODS_1_014_WI_2 -129,FOODS_1_014_WI_3 -130,FOODS_1_015_CA_1 -131,FOODS_1_015_CA_2 -132,FOODS_1_015_CA_3 -133,FOODS_1_015_CA_4 -134,FOODS_1_015_TX_1 -135,FOODS_1_015_TX_2 -136,FOODS_1_015_TX_3 -137,FOODS_1_015_WI_1 -138,FOODS_1_015_WI_2 -139,FOODS_1_015_WI_3 -140,FOODS_1_016_CA_1 -141,FOODS_1_016_CA_2 -142,FOODS_1_016_CA_3 -143,FOODS_1_016_CA_4 -144,FOODS_1_016_TX_1 -145,FOODS_1_016_TX_2 -146,FOODS_1_016_TX_3 -147,FOODS_1_016_WI_1 -148,FOODS_1_016_WI_2 -149,FOODS_1_016_WI_3 -150,FOODS_1_017_CA_1 -151,FOODS_1_017_CA_2 -152,FOODS_1_017_CA_3 -153,FOODS_1_017_CA_4 -154,FOODS_1_017_TX_1 -155,FOODS_1_017_TX_2 -156,FOODS_1_017_TX_3 -157,FOODS_1_017_WI_1 -158,FOODS_1_017_WI_2 -159,FOODS_1_017_WI_3 -160,FOODS_1_018_CA_1 -161,FOODS_1_018_CA_2 -162,FOODS_1_018_CA_3 -163,FOODS_1_018_CA_4 -164,FOODS_1_018_TX_1 -165,FOODS_1_018_TX_2 -166,FOODS_1_018_TX_3 -167,FOODS_1_018_WI_1 -168,FOODS_1_018_WI_2 -169,FOODS_1_018_WI_3 -170,FOODS_1_019_CA_1 -171,FOODS_1_019_CA_2 -172,FOODS_1_019_CA_3 -173,FOODS_1_019_CA_4 -174,FOODS_1_019_TX_1 -175,FOODS_1_019_TX_2 -176,FOODS_1_019_TX_3 -177,FOODS_1_019_WI_1 -178,FOODS_1_019_WI_2 -179,FOODS_1_019_WI_3 -180,FOODS_1_020_CA_1 -181,FOODS_1_020_CA_2 -182,FOODS_1_020_CA_3 -183,FOODS_1_020_CA_4 -184,FOODS_1_020_TX_1 -185,FOODS_1_020_TX_2 -186,FOODS_1_020_TX_3 -187,FOODS_1_020_WI_1 -188,FOODS_1_020_WI_2 -189,FOODS_1_020_WI_3 -190,FOODS_1_021_CA_1 -191,FOODS_1_021_CA_2 -192,FOODS_1_021_CA_3 -193,FOODS_1_021_CA_4 -194,FOODS_1_021_TX_1 -195,FOODS_1_021_TX_2 -196,FOODS_1_021_TX_3 -197,FOODS_1_021_WI_1 -198,FOODS_1_021_WI_2 -199,FOODS_1_021_WI_3 -200,FOODS_1_022_CA_1 -201,FOODS_1_022_CA_2 -202,FOODS_1_022_CA_3 -203,FOODS_1_022_CA_4 -204,FOODS_1_022_TX_1 -205,FOODS_1_022_TX_2 -206,FOODS_1_022_TX_3 -207,FOODS_1_022_WI_1 -208,FOODS_1_022_WI_2 -209,FOODS_1_022_WI_3 -210,FOODS_1_023_CA_1 -211,FOODS_1_023_CA_2 
-212,FOODS_1_023_CA_3 -213,FOODS_1_023_CA_4 -214,FOODS_1_023_TX_1 -215,FOODS_1_023_TX_2 -216,FOODS_1_023_TX_3 -217,FOODS_1_023_WI_1 -218,FOODS_1_023_WI_2 -219,FOODS_1_023_WI_3 -220,FOODS_1_024_CA_1 -221,FOODS_1_024_CA_2 -222,FOODS_1_024_CA_3 -223,FOODS_1_024_CA_4 -224,FOODS_1_024_TX_1 -225,FOODS_1_024_TX_2 -226,FOODS_1_024_TX_3 -227,FOODS_1_024_WI_1 -228,FOODS_1_024_WI_2 -229,FOODS_1_024_WI_3 -230,FOODS_1_025_CA_1 -231,FOODS_1_025_CA_2 -232,FOODS_1_025_CA_3 -233,FOODS_1_025_CA_4 -234,FOODS_1_025_TX_1 -235,FOODS_1_025_TX_2 -236,FOODS_1_025_TX_3 -237,FOODS_1_025_WI_1 -238,FOODS_1_025_WI_2 -239,FOODS_1_025_WI_3 -240,FOODS_1_026_CA_1 -241,FOODS_1_026_CA_2 -242,FOODS_1_026_CA_3 -243,FOODS_1_026_CA_4 -244,FOODS_1_026_TX_1 -245,FOODS_1_026_TX_2 -246,FOODS_1_026_TX_3 -247,FOODS_1_026_WI_1 -248,FOODS_1_026_WI_2 -249,FOODS_1_026_WI_3 -250,FOODS_1_027_CA_1 -251,FOODS_1_027_CA_2 -252,FOODS_1_027_CA_3 -253,FOODS_1_027_CA_4 -254,FOODS_1_027_TX_1 -255,FOODS_1_027_TX_2 -256,FOODS_1_027_TX_3 -257,FOODS_1_027_WI_1 -258,FOODS_1_027_WI_2 -259,FOODS_1_027_WI_3 -260,FOODS_1_028_CA_1 -261,FOODS_1_028_CA_2 -262,FOODS_1_028_CA_3 -263,FOODS_1_028_CA_4 -264,FOODS_1_028_TX_1 -265,FOODS_1_028_TX_2 -266,FOODS_1_028_TX_3 -267,FOODS_1_028_WI_1 -268,FOODS_1_028_WI_2 -269,FOODS_1_028_WI_3 -270,FOODS_1_029_CA_1 -271,FOODS_1_029_CA_2 -272,FOODS_1_029_CA_3 -273,FOODS_1_029_CA_4 -274,FOODS_1_029_TX_1 -275,FOODS_1_029_TX_2 -276,FOODS_1_029_TX_3 -277,FOODS_1_029_WI_1 -278,FOODS_1_029_WI_2 -279,FOODS_1_029_WI_3 -280,FOODS_1_030_CA_1 -281,FOODS_1_030_CA_2 -282,FOODS_1_030_CA_3 -283,FOODS_1_030_CA_4 -284,FOODS_1_030_TX_1 -285,FOODS_1_030_TX_2 -286,FOODS_1_030_TX_3 -287,FOODS_1_030_WI_1 -288,FOODS_1_030_WI_2 -289,FOODS_1_030_WI_3 -290,FOODS_1_031_CA_1 -291,FOODS_1_031_CA_2 -292,FOODS_1_031_CA_3 -293,FOODS_1_031_CA_4 -294,FOODS_1_031_TX_1 -295,FOODS_1_031_TX_2 -296,FOODS_1_031_TX_3 -297,FOODS_1_031_WI_1 -298,FOODS_1_031_WI_2 -299,FOODS_1_031_WI_3 -300,FOODS_1_032_CA_1 -301,FOODS_1_032_CA_2 
-302,FOODS_1_032_CA_3 -303,FOODS_1_032_CA_4 -304,FOODS_1_032_TX_1 -305,FOODS_1_032_TX_2 -306,FOODS_1_032_TX_3 -307,FOODS_1_032_WI_1 -308,FOODS_1_032_WI_2 -309,FOODS_1_032_WI_3 -310,FOODS_1_033_CA_1 -311,FOODS_1_033_CA_2 -312,FOODS_1_033_CA_3 -313,FOODS_1_033_CA_4 -314,FOODS_1_033_TX_1 -315,FOODS_1_033_TX_2 -316,FOODS_1_033_TX_3 -317,FOODS_1_033_WI_1 -318,FOODS_1_033_WI_2 -319,FOODS_1_033_WI_3 -320,FOODS_1_034_CA_1 -321,FOODS_1_034_CA_2 -322,FOODS_1_034_CA_3 -323,FOODS_1_034_CA_4 -324,FOODS_1_034_TX_1 -325,FOODS_1_034_TX_2 -326,FOODS_1_034_TX_3 -327,FOODS_1_034_WI_1 -328,FOODS_1_034_WI_2 -329,FOODS_1_034_WI_3 -330,FOODS_1_035_CA_1 -331,FOODS_1_035_CA_2 -332,FOODS_1_035_CA_3 -333,FOODS_1_035_CA_4 -334,FOODS_1_035_TX_1 -335,FOODS_1_035_TX_2 -336,FOODS_1_035_TX_3 -337,FOODS_1_035_WI_1 -338,FOODS_1_035_WI_2 -339,FOODS_1_035_WI_3 -340,FOODS_1_036_CA_1 -341,FOODS_1_036_CA_2 -342,FOODS_1_036_CA_3 -343,FOODS_1_036_CA_4 -344,FOODS_1_036_TX_1 -345,FOODS_1_036_TX_2 -346,FOODS_1_036_TX_3 -347,FOODS_1_036_WI_1 -348,FOODS_1_036_WI_2 -349,FOODS_1_036_WI_3 -350,FOODS_1_037_CA_1 -351,FOODS_1_037_CA_2 -352,FOODS_1_037_CA_3 -353,FOODS_1_037_CA_4 -354,FOODS_1_037_TX_1 -355,FOODS_1_037_TX_2 -356,FOODS_1_037_TX_3 -357,FOODS_1_037_WI_1 -358,FOODS_1_037_WI_2 -359,FOODS_1_037_WI_3 -360,FOODS_1_038_CA_1 -361,FOODS_1_038_CA_2 -362,FOODS_1_038_CA_3 -363,FOODS_1_038_CA_4 -364,FOODS_1_038_TX_1 -365,FOODS_1_038_TX_2 -366,FOODS_1_038_TX_3 -367,FOODS_1_038_WI_1 -368,FOODS_1_038_WI_2 -369,FOODS_1_038_WI_3 -370,FOODS_1_039_CA_1 -371,FOODS_1_039_CA_2 -372,FOODS_1_039_CA_3 -373,FOODS_1_039_CA_4 -374,FOODS_1_039_TX_1 -375,FOODS_1_039_TX_2 -376,FOODS_1_039_TX_3 -377,FOODS_1_039_WI_1 -378,FOODS_1_039_WI_2 -379,FOODS_1_039_WI_3 -380,FOODS_1_040_CA_1 -381,FOODS_1_040_CA_2 -382,FOODS_1_040_CA_3 -383,FOODS_1_040_CA_4 -384,FOODS_1_040_TX_1 -385,FOODS_1_040_TX_2 -386,FOODS_1_040_TX_3 -387,FOODS_1_040_WI_1 -388,FOODS_1_040_WI_2 -389,FOODS_1_040_WI_3 -390,FOODS_1_041_CA_1 -391,FOODS_1_041_CA_2 
-392,FOODS_1_041_CA_3 -393,FOODS_1_041_CA_4 -394,FOODS_1_041_TX_1 -395,FOODS_1_041_TX_2 -396,FOODS_1_041_TX_3 -397,FOODS_1_041_WI_1 -398,FOODS_1_041_WI_2 -399,FOODS_1_041_WI_3 -400,FOODS_1_042_CA_1 -401,FOODS_1_042_CA_2 -402,FOODS_1_042_CA_3 -403,FOODS_1_042_CA_4 -404,FOODS_1_042_TX_1 -405,FOODS_1_042_TX_2 -406,FOODS_1_042_TX_3 -407,FOODS_1_042_WI_1 -408,FOODS_1_042_WI_2 -409,FOODS_1_042_WI_3 -410,FOODS_1_043_CA_1 -411,FOODS_1_043_CA_2 -412,FOODS_1_043_CA_3 -413,FOODS_1_043_CA_4 -414,FOODS_1_043_TX_1 -415,FOODS_1_043_TX_2 -416,FOODS_1_043_TX_3 -417,FOODS_1_043_WI_1 -418,FOODS_1_043_WI_2 -419,FOODS_1_043_WI_3 -420,FOODS_1_044_CA_1 -421,FOODS_1_044_CA_2 -422,FOODS_1_044_CA_3 -423,FOODS_1_044_CA_4 -424,FOODS_1_044_TX_1 -425,FOODS_1_044_TX_2 -426,FOODS_1_044_TX_3 -427,FOODS_1_044_WI_1 -428,FOODS_1_044_WI_2 -429,FOODS_1_044_WI_3 -430,FOODS_1_045_CA_1 -431,FOODS_1_045_CA_2 -432,FOODS_1_045_CA_3 -433,FOODS_1_045_CA_4 -434,FOODS_1_045_TX_1 -435,FOODS_1_045_TX_2 -436,FOODS_1_045_TX_3 -437,FOODS_1_045_WI_1 -438,FOODS_1_045_WI_2 -439,FOODS_1_045_WI_3 -440,FOODS_1_046_CA_1 -441,FOODS_1_046_CA_2 -442,FOODS_1_046_CA_3 -443,FOODS_1_046_CA_4 -444,FOODS_1_046_TX_1 -445,FOODS_1_046_TX_2 -446,FOODS_1_046_TX_3 -447,FOODS_1_046_WI_1 -448,FOODS_1_046_WI_2 -449,FOODS_1_046_WI_3 -450,FOODS_1_047_CA_1 -451,FOODS_1_047_CA_2 -452,FOODS_1_047_CA_3 -453,FOODS_1_047_CA_4 -454,FOODS_1_047_TX_1 -455,FOODS_1_047_TX_2 -456,FOODS_1_047_TX_3 -457,FOODS_1_047_WI_1 -458,FOODS_1_047_WI_2 -459,FOODS_1_047_WI_3 -460,FOODS_1_048_CA_1 -461,FOODS_1_048_CA_2 -462,FOODS_1_048_CA_3 -463,FOODS_1_048_CA_4 -464,FOODS_1_048_TX_1 -465,FOODS_1_048_TX_2 -466,FOODS_1_048_TX_3 -467,FOODS_1_048_WI_1 -468,FOODS_1_048_WI_2 -469,FOODS_1_048_WI_3 -470,FOODS_1_049_CA_1 -471,FOODS_1_049_CA_2 -472,FOODS_1_049_CA_3 -473,FOODS_1_049_CA_4 -474,FOODS_1_049_TX_1 -475,FOODS_1_049_TX_2 -476,FOODS_1_049_TX_3 -477,FOODS_1_049_WI_1 -478,FOODS_1_049_WI_2 -479,FOODS_1_049_WI_3 -480,FOODS_1_050_CA_1 -481,FOODS_1_050_CA_2 
-482,FOODS_1_050_CA_3 -483,FOODS_1_050_CA_4 -484,FOODS_1_050_TX_1 -485,FOODS_1_050_TX_2 -486,FOODS_1_050_TX_3 -487,FOODS_1_050_WI_1 -488,FOODS_1_050_WI_2 -489,FOODS_1_050_WI_3 -490,FOODS_1_051_CA_1 -491,FOODS_1_051_CA_2 -492,FOODS_1_051_CA_3 -493,FOODS_1_051_CA_4 -494,FOODS_1_051_TX_1 -495,FOODS_1_051_TX_2 -496,FOODS_1_051_TX_3 -497,FOODS_1_051_WI_1 -498,FOODS_1_051_WI_2 -499,FOODS_1_051_WI_3 -500,FOODS_1_052_CA_1 -501,FOODS_1_052_CA_2 -502,FOODS_1_052_CA_3 -503,FOODS_1_052_CA_4 -504,FOODS_1_052_TX_1 -505,FOODS_1_052_TX_2 -506,FOODS_1_052_TX_3 -507,FOODS_1_052_WI_1 -508,FOODS_1_052_WI_2 -509,FOODS_1_052_WI_3 -510,FOODS_1_053_CA_1 -511,FOODS_1_053_CA_2 -512,FOODS_1_053_CA_3 -513,FOODS_1_053_CA_4 -514,FOODS_1_053_TX_1 -515,FOODS_1_053_TX_2 -516,FOODS_1_053_TX_3 -517,FOODS_1_053_WI_1 -518,FOODS_1_053_WI_2 -519,FOODS_1_053_WI_3 -520,FOODS_1_054_CA_1 -521,FOODS_1_054_CA_2 -522,FOODS_1_054_CA_3 -523,FOODS_1_054_CA_4 -524,FOODS_1_054_TX_1 -525,FOODS_1_054_TX_2 -526,FOODS_1_054_TX_3 -527,FOODS_1_054_WI_1 -528,FOODS_1_054_WI_2 -529,FOODS_1_054_WI_3 -530,FOODS_1_055_CA_1 -531,FOODS_1_055_CA_2 -532,FOODS_1_055_CA_3 -533,FOODS_1_055_CA_4 -534,FOODS_1_055_TX_1 -535,FOODS_1_055_TX_2 -536,FOODS_1_055_TX_3 -537,FOODS_1_055_WI_1 -538,FOODS_1_055_WI_2 -539,FOODS_1_055_WI_3 -540,FOODS_1_056_CA_1 -541,FOODS_1_056_CA_2 -542,FOODS_1_056_CA_3 -543,FOODS_1_056_CA_4 -544,FOODS_1_056_TX_1 -545,FOODS_1_056_TX_2 -546,FOODS_1_056_TX_3 -547,FOODS_1_056_WI_1 -548,FOODS_1_056_WI_2 -549,FOODS_1_056_WI_3 -550,FOODS_1_057_CA_1 -551,FOODS_1_057_CA_2 -552,FOODS_1_057_CA_3 -553,FOODS_1_057_CA_4 -554,FOODS_1_057_TX_1 -555,FOODS_1_057_TX_2 -556,FOODS_1_057_TX_3 -557,FOODS_1_057_WI_1 -558,FOODS_1_057_WI_2 -559,FOODS_1_057_WI_3 -560,FOODS_1_058_CA_1 -561,FOODS_1_058_CA_2 -562,FOODS_1_058_CA_3 -563,FOODS_1_058_CA_4 -564,FOODS_1_058_TX_1 -565,FOODS_1_058_TX_2 -566,FOODS_1_058_TX_3 -567,FOODS_1_058_WI_1 -568,FOODS_1_058_WI_2 -569,FOODS_1_058_WI_3 -570,FOODS_1_059_CA_1 -571,FOODS_1_059_CA_2 
-572,FOODS_1_059_CA_3 -573,FOODS_1_059_CA_4 -574,FOODS_1_059_TX_1 -575,FOODS_1_059_TX_2 -576,FOODS_1_059_TX_3 -577,FOODS_1_059_WI_1 -578,FOODS_1_059_WI_2 -579,FOODS_1_059_WI_3 -580,FOODS_1_060_CA_1 -581,FOODS_1_060_CA_2 -582,FOODS_1_060_CA_3 -583,FOODS_1_060_CA_4 -584,FOODS_1_060_TX_1 -585,FOODS_1_060_TX_2 -586,FOODS_1_060_TX_3 -587,FOODS_1_060_WI_1 -588,FOODS_1_060_WI_2 -589,FOODS_1_060_WI_3 -590,FOODS_1_061_CA_1 -591,FOODS_1_061_CA_2 -592,FOODS_1_061_CA_3 -593,FOODS_1_061_CA_4 -594,FOODS_1_061_TX_1 -595,FOODS_1_061_TX_2 -596,FOODS_1_061_TX_3 -597,FOODS_1_061_WI_1 -598,FOODS_1_061_WI_2 -599,FOODS_1_061_WI_3 -600,FOODS_1_062_CA_1 -601,FOODS_1_062_CA_2 -602,FOODS_1_062_CA_3 -603,FOODS_1_062_CA_4 -604,FOODS_1_062_TX_1 -605,FOODS_1_062_TX_2 -606,FOODS_1_062_TX_3 -607,FOODS_1_062_WI_1 -608,FOODS_1_062_WI_2 -609,FOODS_1_062_WI_3 -610,FOODS_1_063_CA_1 -611,FOODS_1_063_CA_2 -612,FOODS_1_063_CA_3 -613,FOODS_1_063_CA_4 -614,FOODS_1_063_TX_1 -615,FOODS_1_063_TX_2 -616,FOODS_1_063_TX_3 -617,FOODS_1_063_WI_1 -618,FOODS_1_063_WI_2 -619,FOODS_1_063_WI_3 -620,FOODS_1_064_CA_1 -621,FOODS_1_064_CA_2 -622,FOODS_1_064_CA_3 -623,FOODS_1_064_CA_4 -624,FOODS_1_064_TX_1 -625,FOODS_1_064_TX_2 -626,FOODS_1_064_TX_3 -627,FOODS_1_064_WI_1 -628,FOODS_1_064_WI_2 -629,FOODS_1_064_WI_3 -630,FOODS_1_065_CA_1 -631,FOODS_1_065_CA_2 -632,FOODS_1_065_CA_3 -633,FOODS_1_065_CA_4 -634,FOODS_1_065_TX_1 -635,FOODS_1_065_TX_2 -636,FOODS_1_065_TX_3 -637,FOODS_1_065_WI_1 -638,FOODS_1_065_WI_2 -639,FOODS_1_065_WI_3 -640,FOODS_1_066_CA_1 -641,FOODS_1_066_CA_2 -642,FOODS_1_066_CA_3 -643,FOODS_1_066_CA_4 -644,FOODS_1_066_TX_1 -645,FOODS_1_066_TX_2 -646,FOODS_1_066_TX_3 -647,FOODS_1_066_WI_1 -648,FOODS_1_066_WI_2 -649,FOODS_1_066_WI_3 -650,FOODS_1_067_CA_1 -651,FOODS_1_067_CA_2 -652,FOODS_1_067_CA_3 -653,FOODS_1_067_CA_4 -654,FOODS_1_067_TX_1 -655,FOODS_1_067_TX_2 -656,FOODS_1_067_TX_3 -657,FOODS_1_067_WI_1 -658,FOODS_1_067_WI_2 -659,FOODS_1_067_WI_3 -660,FOODS_1_068_CA_1 -661,FOODS_1_068_CA_2 
-662,FOODS_1_068_CA_3 -663,FOODS_1_068_CA_4 -664,FOODS_1_068_TX_1 -665,FOODS_1_068_TX_2 -666,FOODS_1_068_TX_3 -667,FOODS_1_068_WI_1 -668,FOODS_1_068_WI_2 -669,FOODS_1_068_WI_3 -670,FOODS_1_069_CA_1 -671,FOODS_1_069_CA_2 -672,FOODS_1_069_CA_3 -673,FOODS_1_069_CA_4 -674,FOODS_1_069_TX_1 -675,FOODS_1_069_TX_2 -676,FOODS_1_069_TX_3 -677,FOODS_1_069_WI_1 -678,FOODS_1_069_WI_2 -679,FOODS_1_069_WI_3 -680,FOODS_1_070_CA_1 -681,FOODS_1_070_CA_2 -682,FOODS_1_070_CA_3 -683,FOODS_1_070_CA_4 -684,FOODS_1_070_TX_1 -685,FOODS_1_070_TX_2 -686,FOODS_1_070_TX_3 -687,FOODS_1_070_WI_1 -688,FOODS_1_070_WI_2 -689,FOODS_1_070_WI_3 -690,FOODS_1_071_CA_1 -691,FOODS_1_071_CA_2 -692,FOODS_1_071_CA_3 -693,FOODS_1_071_CA_4 -694,FOODS_1_071_TX_1 -695,FOODS_1_071_TX_2 -696,FOODS_1_071_TX_3 -697,FOODS_1_071_WI_1 -698,FOODS_1_071_WI_2 -699,FOODS_1_071_WI_3 -700,FOODS_1_072_CA_1 -701,FOODS_1_072_CA_2 -702,FOODS_1_072_CA_3 -703,FOODS_1_072_CA_4 -704,FOODS_1_072_TX_1 -705,FOODS_1_072_TX_2 -706,FOODS_1_072_TX_3 -707,FOODS_1_072_WI_1 -708,FOODS_1_072_WI_2 -709,FOODS_1_072_WI_3 -710,FOODS_1_073_CA_1 -711,FOODS_1_073_CA_2 -712,FOODS_1_073_CA_3 -713,FOODS_1_073_CA_4 -714,FOODS_1_073_TX_1 -715,FOODS_1_073_TX_2 -716,FOODS_1_073_TX_3 -717,FOODS_1_073_WI_1 -718,FOODS_1_073_WI_2 -719,FOODS_1_073_WI_3 -720,FOODS_1_074_CA_1 -721,FOODS_1_074_CA_2 -722,FOODS_1_074_CA_3 -723,FOODS_1_074_CA_4 -724,FOODS_1_074_TX_1 -725,FOODS_1_074_TX_2 -726,FOODS_1_074_TX_3 -727,FOODS_1_074_WI_1 -728,FOODS_1_074_WI_2 -729,FOODS_1_074_WI_3 -730,FOODS_1_075_CA_1 -731,FOODS_1_075_CA_2 -732,FOODS_1_075_CA_3 -733,FOODS_1_075_CA_4 -734,FOODS_1_075_TX_1 -735,FOODS_1_075_TX_2 -736,FOODS_1_075_TX_3 -737,FOODS_1_075_WI_1 -738,FOODS_1_075_WI_2 -739,FOODS_1_075_WI_3 -740,FOODS_1_076_CA_1 -741,FOODS_1_076_CA_2 -742,FOODS_1_076_CA_3 -743,FOODS_1_076_CA_4 -744,FOODS_1_076_TX_1 -745,FOODS_1_076_TX_2 -746,FOODS_1_076_TX_3 -747,FOODS_1_076_WI_1 -748,FOODS_1_076_WI_2 -749,FOODS_1_076_WI_3 -750,FOODS_1_077_CA_1 -751,FOODS_1_077_CA_2 
-752,FOODS_1_077_CA_3 -753,FOODS_1_077_CA_4 -754,FOODS_1_077_TX_1 -755,FOODS_1_077_TX_2 -756,FOODS_1_077_TX_3 -757,FOODS_1_077_WI_1 -758,FOODS_1_077_WI_2 -759,FOODS_1_077_WI_3 -760,FOODS_1_078_CA_1 -761,FOODS_1_078_CA_2 -762,FOODS_1_078_CA_3 -763,FOODS_1_078_CA_4 -764,FOODS_1_078_TX_1 -765,FOODS_1_078_TX_2 -766,FOODS_1_078_TX_3 -767,FOODS_1_078_WI_1 -768,FOODS_1_078_WI_2 -769,FOODS_1_078_WI_3 -770,FOODS_1_079_CA_1 -771,FOODS_1_079_CA_2 -772,FOODS_1_079_CA_3 -773,FOODS_1_079_CA_4 -774,FOODS_1_079_TX_1 -775,FOODS_1_079_TX_2 -776,FOODS_1_079_TX_3 -777,FOODS_1_079_WI_1 -778,FOODS_1_079_WI_2 -779,FOODS_1_079_WI_3 -780,FOODS_1_080_CA_1 -781,FOODS_1_080_CA_2 -782,FOODS_1_080_CA_3 -783,FOODS_1_080_CA_4 -784,FOODS_1_080_TX_1 -785,FOODS_1_080_TX_2 -786,FOODS_1_080_TX_3 -787,FOODS_1_080_WI_1 -788,FOODS_1_080_WI_2 -789,FOODS_1_080_WI_3 -790,FOODS_1_081_CA_1 -791,FOODS_1_081_CA_2 -792,FOODS_1_081_CA_3 -793,FOODS_1_081_CA_4 -794,FOODS_1_081_TX_1 -795,FOODS_1_081_TX_2 -796,FOODS_1_081_TX_3 -797,FOODS_1_081_WI_1 -798,FOODS_1_081_WI_2 -799,FOODS_1_081_WI_3 -800,FOODS_1_082_CA_1 -801,FOODS_1_082_CA_2 -802,FOODS_1_082_CA_3 -803,FOODS_1_082_CA_4 -804,FOODS_1_082_TX_1 -805,FOODS_1_082_TX_2 -806,FOODS_1_082_TX_3 -807,FOODS_1_082_WI_1 -808,FOODS_1_082_WI_2 -809,FOODS_1_082_WI_3 -810,FOODS_1_083_CA_1 -811,FOODS_1_083_CA_2 -812,FOODS_1_083_CA_3 -813,FOODS_1_083_CA_4 -814,FOODS_1_083_TX_1 -815,FOODS_1_083_TX_2 -816,FOODS_1_083_TX_3 -817,FOODS_1_083_WI_1 -818,FOODS_1_083_WI_2 -819,FOODS_1_083_WI_3 -820,FOODS_1_084_CA_1 -821,FOODS_1_084_CA_2 -822,FOODS_1_084_CA_3 -823,FOODS_1_084_CA_4 -824,FOODS_1_084_TX_1 -825,FOODS_1_084_TX_2 -826,FOODS_1_084_TX_3 -827,FOODS_1_084_WI_1 -828,FOODS_1_084_WI_2 -829,FOODS_1_084_WI_3 -830,FOODS_1_085_CA_1 -831,FOODS_1_085_CA_2 -832,FOODS_1_085_CA_3 -833,FOODS_1_085_CA_4 -834,FOODS_1_085_TX_1 -835,FOODS_1_085_TX_2 -836,FOODS_1_085_TX_3 -837,FOODS_1_085_WI_1 -838,FOODS_1_085_WI_2 -839,FOODS_1_085_WI_3 -840,FOODS_1_086_CA_1 -841,FOODS_1_086_CA_2 
-842,FOODS_1_086_CA_3 -843,FOODS_1_086_CA_4 -844,FOODS_1_086_TX_1 -845,FOODS_1_086_TX_2 -846,FOODS_1_086_TX_3 -847,FOODS_1_086_WI_1 -848,FOODS_1_086_WI_2 -849,FOODS_1_086_WI_3 -850,FOODS_1_087_CA_1 -851,FOODS_1_087_CA_2 -852,FOODS_1_087_CA_3 -853,FOODS_1_087_CA_4 -854,FOODS_1_087_TX_1 -855,FOODS_1_087_TX_2 -856,FOODS_1_087_TX_3 -857,FOODS_1_087_WI_1 -858,FOODS_1_087_WI_2 -859,FOODS_1_087_WI_3 -860,FOODS_1_088_CA_1 -861,FOODS_1_088_CA_2 -862,FOODS_1_088_CA_3 -863,FOODS_1_088_CA_4 -864,FOODS_1_088_TX_1 -865,FOODS_1_088_TX_2 -866,FOODS_1_088_TX_3 -867,FOODS_1_088_WI_1 -868,FOODS_1_088_WI_2 -869,FOODS_1_088_WI_3 -870,FOODS_1_089_CA_1 -871,FOODS_1_089_CA_2 -872,FOODS_1_089_CA_3 -873,FOODS_1_089_CA_4 -874,FOODS_1_089_TX_1 -875,FOODS_1_089_TX_2 -876,FOODS_1_089_TX_3 -877,FOODS_1_089_WI_1 -878,FOODS_1_089_WI_2 -879,FOODS_1_089_WI_3 -880,FOODS_1_090_CA_1 -881,FOODS_1_090_CA_2 -882,FOODS_1_090_CA_3 -883,FOODS_1_090_CA_4 -884,FOODS_1_090_TX_1 -885,FOODS_1_090_TX_2 -886,FOODS_1_090_TX_3 -887,FOODS_1_090_WI_1 -888,FOODS_1_090_WI_2 -889,FOODS_1_090_WI_3 -890,FOODS_1_091_CA_1 -891,FOODS_1_091_CA_2 -892,FOODS_1_091_CA_3 -893,FOODS_1_091_CA_4 -894,FOODS_1_091_TX_1 -895,FOODS_1_091_TX_2 -896,FOODS_1_091_TX_3 -897,FOODS_1_091_WI_1 -898,FOODS_1_091_WI_2 -899,FOODS_1_091_WI_3 -900,FOODS_1_092_CA_1 -901,FOODS_1_092_CA_2 -902,FOODS_1_092_CA_3 -903,FOODS_1_092_CA_4 -904,FOODS_1_092_TX_1 -905,FOODS_1_092_TX_2 -906,FOODS_1_092_TX_3 -907,FOODS_1_092_WI_1 -908,FOODS_1_092_WI_2 -909,FOODS_1_092_WI_3 -910,FOODS_1_093_CA_1 -911,FOODS_1_093_CA_2 -912,FOODS_1_093_CA_3 -913,FOODS_1_093_CA_4 -914,FOODS_1_093_TX_1 -915,FOODS_1_093_TX_2 -916,FOODS_1_093_TX_3 -917,FOODS_1_093_WI_1 -918,FOODS_1_093_WI_2 -919,FOODS_1_093_WI_3 -920,FOODS_1_094_CA_1 -921,FOODS_1_094_CA_2 -922,FOODS_1_094_CA_3 -923,FOODS_1_094_CA_4 -924,FOODS_1_094_TX_1 -925,FOODS_1_094_TX_2 -926,FOODS_1_094_TX_3 -927,FOODS_1_094_WI_1 -928,FOODS_1_094_WI_2 -929,FOODS_1_094_WI_3 -930,FOODS_1_095_CA_1 -931,FOODS_1_095_CA_2 
-932,FOODS_1_095_CA_3 -933,FOODS_1_095_CA_4 -934,FOODS_1_095_TX_1 -935,FOODS_1_095_TX_2 -936,FOODS_1_095_TX_3 -937,FOODS_1_095_WI_1 -938,FOODS_1_095_WI_2 -939,FOODS_1_095_WI_3 -940,FOODS_1_096_CA_1 -941,FOODS_1_096_CA_2 -942,FOODS_1_096_CA_3 -943,FOODS_1_096_CA_4 -944,FOODS_1_096_TX_1 -945,FOODS_1_096_TX_2 -946,FOODS_1_096_TX_3 -947,FOODS_1_096_WI_1 -948,FOODS_1_096_WI_2 -949,FOODS_1_096_WI_3 -950,FOODS_1_097_CA_1 -951,FOODS_1_097_CA_2 -952,FOODS_1_097_CA_3 -953,FOODS_1_097_CA_4 -954,FOODS_1_097_TX_1 -955,FOODS_1_097_TX_2 -956,FOODS_1_097_TX_3 -957,FOODS_1_097_WI_1 -958,FOODS_1_097_WI_2 -959,FOODS_1_097_WI_3 -960,FOODS_1_098_CA_1 -961,FOODS_1_098_CA_2 -962,FOODS_1_098_CA_3 -963,FOODS_1_098_CA_4 -964,FOODS_1_098_TX_1 -965,FOODS_1_098_TX_2 -966,FOODS_1_098_TX_3 -967,FOODS_1_098_WI_1 -968,FOODS_1_098_WI_2 -969,FOODS_1_098_WI_3 -970,FOODS_1_099_CA_1 -971,FOODS_1_099_CA_2 -972,FOODS_1_099_CA_3 -973,FOODS_1_099_CA_4 -974,FOODS_1_099_TX_1 -975,FOODS_1_099_TX_2 -976,FOODS_1_099_TX_3 -977,FOODS_1_099_WI_1 -978,FOODS_1_099_WI_2 -979,FOODS_1_099_WI_3 -980,FOODS_1_101_CA_1 -981,FOODS_1_101_CA_2 -982,FOODS_1_101_CA_3 -983,FOODS_1_101_CA_4 -984,FOODS_1_101_TX_1 -985,FOODS_1_101_TX_2 -986,FOODS_1_101_TX_3 -987,FOODS_1_101_WI_1 -988,FOODS_1_101_WI_2 -989,FOODS_1_101_WI_3 -990,FOODS_1_102_CA_1 -991,FOODS_1_102_CA_2 -992,FOODS_1_102_CA_3 -993,FOODS_1_102_CA_4 -994,FOODS_1_102_TX_1 -995,FOODS_1_102_TX_2 -996,FOODS_1_102_TX_3 -997,FOODS_1_102_WI_1 -998,FOODS_1_102_WI_2 -999,FOODS_1_102_WI_3 -1000,FOODS_1_103_CA_1 -1001,FOODS_1_103_CA_2 -1002,FOODS_1_103_CA_3 -1003,FOODS_1_103_CA_4 -1004,FOODS_1_103_TX_1 -1005,FOODS_1_103_TX_2 -1006,FOODS_1_103_TX_3 -1007,FOODS_1_103_WI_1 -1008,FOODS_1_103_WI_2 -1009,FOODS_1_103_WI_3 -1010,FOODS_1_104_CA_1 -1011,FOODS_1_104_CA_2 -1012,FOODS_1_104_CA_3 -1013,FOODS_1_104_CA_4 -1014,FOODS_1_104_TX_1 -1015,FOODS_1_104_TX_2 -1016,FOODS_1_104_TX_3 -1017,FOODS_1_104_WI_1 -1018,FOODS_1_104_WI_2 -1019,FOODS_1_104_WI_3 -1020,FOODS_1_105_CA_1 
-1021,FOODS_1_105_CA_2 -1022,FOODS_1_105_CA_3 -1023,FOODS_1_105_CA_4 -1024,FOODS_1_105_TX_1 -1025,FOODS_1_105_TX_2 -1026,FOODS_1_105_TX_3 -1027,FOODS_1_105_WI_1 -1028,FOODS_1_105_WI_2 -1029,FOODS_1_105_WI_3 -1030,FOODS_1_106_CA_1 -1031,FOODS_1_106_CA_2 -1032,FOODS_1_106_CA_3 -1033,FOODS_1_106_CA_4 -1034,FOODS_1_106_TX_1 -1035,FOODS_1_106_TX_2 -1036,FOODS_1_106_TX_3 -1037,FOODS_1_106_WI_1 -1038,FOODS_1_106_WI_2 -1039,FOODS_1_106_WI_3 -1040,FOODS_1_107_CA_1 -1041,FOODS_1_107_CA_2 -1042,FOODS_1_107_CA_3 -1043,FOODS_1_107_CA_4 -1044,FOODS_1_107_TX_1 -1045,FOODS_1_107_TX_2 -1046,FOODS_1_107_TX_3 -1047,FOODS_1_107_WI_1 -1048,FOODS_1_107_WI_2 -1049,FOODS_1_107_WI_3 -1050,FOODS_1_108_CA_1 -1051,FOODS_1_108_CA_2 -1052,FOODS_1_108_CA_3 -1053,FOODS_1_108_CA_4 -1054,FOODS_1_108_TX_1 -1055,FOODS_1_108_TX_2 -1056,FOODS_1_108_TX_3 -1057,FOODS_1_108_WI_1 -1058,FOODS_1_108_WI_2 -1059,FOODS_1_108_WI_3 -1060,FOODS_1_109_CA_1 -1061,FOODS_1_109_CA_2 -1062,FOODS_1_109_CA_3 -1063,FOODS_1_109_CA_4 -1064,FOODS_1_109_TX_1 -1065,FOODS_1_109_TX_2 -1066,FOODS_1_109_TX_3 -1067,FOODS_1_109_WI_1 -1068,FOODS_1_109_WI_2 -1069,FOODS_1_109_WI_3 -1070,FOODS_1_110_CA_1 -1071,FOODS_1_110_CA_2 -1072,FOODS_1_110_CA_3 -1073,FOODS_1_110_CA_4 -1074,FOODS_1_110_TX_1 -1075,FOODS_1_110_TX_2 -1076,FOODS_1_110_TX_3 -1077,FOODS_1_110_WI_1 -1078,FOODS_1_110_WI_2 -1079,FOODS_1_110_WI_3 -1080,FOODS_1_111_CA_1 -1081,FOODS_1_111_CA_2 -1082,FOODS_1_111_CA_3 -1083,FOODS_1_111_CA_4 -1084,FOODS_1_111_TX_1 -1085,FOODS_1_111_TX_2 -1086,FOODS_1_111_TX_3 -1087,FOODS_1_111_WI_1 -1088,FOODS_1_111_WI_2 -1089,FOODS_1_111_WI_3 -1090,FOODS_1_112_CA_1 -1091,FOODS_1_112_CA_2 -1092,FOODS_1_112_CA_3 -1093,FOODS_1_112_CA_4 -1094,FOODS_1_112_TX_1 -1095,FOODS_1_112_TX_2 -1096,FOODS_1_112_TX_3 -1097,FOODS_1_112_WI_1 -1098,FOODS_1_112_WI_2 -1099,FOODS_1_112_WI_3 -1100,FOODS_1_113_CA_1 -1101,FOODS_1_113_CA_2 -1102,FOODS_1_113_CA_3 -1103,FOODS_1_113_CA_4 -1104,FOODS_1_113_TX_1 -1105,FOODS_1_113_TX_2 -1106,FOODS_1_113_TX_3 
-1107,FOODS_1_113_WI_1 -1108,FOODS_1_113_WI_2 -1109,FOODS_1_113_WI_3 -1110,FOODS_1_114_CA_1 -1111,FOODS_1_114_CA_2 -1112,FOODS_1_114_CA_3 -1113,FOODS_1_114_CA_4 -1114,FOODS_1_114_TX_1 -1115,FOODS_1_114_TX_2 -1116,FOODS_1_114_TX_3 -1117,FOODS_1_114_WI_1 -1118,FOODS_1_114_WI_2 -1119,FOODS_1_114_WI_3 -1120,FOODS_1_115_CA_1 -1121,FOODS_1_115_CA_2 -1122,FOODS_1_115_CA_3 -1123,FOODS_1_115_CA_4 -1124,FOODS_1_115_TX_1 -1125,FOODS_1_115_TX_2 -1126,FOODS_1_115_TX_3 -1127,FOODS_1_115_WI_1 -1128,FOODS_1_115_WI_2 -1129,FOODS_1_115_WI_3 -1130,FOODS_1_116_CA_1 -1131,FOODS_1_116_CA_2 -1132,FOODS_1_116_CA_3 -1133,FOODS_1_116_CA_4 -1134,FOODS_1_116_TX_1 -1135,FOODS_1_116_TX_2 -1136,FOODS_1_116_TX_3 -1137,FOODS_1_116_WI_1 -1138,FOODS_1_116_WI_2 -1139,FOODS_1_116_WI_3 -1140,FOODS_1_117_CA_1 -1141,FOODS_1_117_CA_2 -1142,FOODS_1_117_CA_3 -1143,FOODS_1_117_CA_4 -1144,FOODS_1_117_TX_1 -1145,FOODS_1_117_TX_2 -1146,FOODS_1_117_TX_3 -1147,FOODS_1_117_WI_1 -1148,FOODS_1_117_WI_2 -1149,FOODS_1_117_WI_3 -1150,FOODS_1_118_CA_1 -1151,FOODS_1_118_CA_2 -1152,FOODS_1_118_CA_3 -1153,FOODS_1_118_CA_4 -1154,FOODS_1_118_TX_1 -1155,FOODS_1_118_TX_2 -1156,FOODS_1_118_TX_3 -1157,FOODS_1_118_WI_1 -1158,FOODS_1_118_WI_2 -1159,FOODS_1_118_WI_3 -1160,FOODS_1_119_CA_1 -1161,FOODS_1_119_CA_2 -1162,FOODS_1_119_CA_3 -1163,FOODS_1_119_CA_4 -1164,FOODS_1_119_TX_1 -1165,FOODS_1_119_TX_2 -1166,FOODS_1_119_TX_3 -1167,FOODS_1_119_WI_1 -1168,FOODS_1_119_WI_2 -1169,FOODS_1_119_WI_3 -1170,FOODS_1_120_CA_1 -1171,FOODS_1_120_CA_2 -1172,FOODS_1_120_CA_3 -1173,FOODS_1_120_CA_4 -1174,FOODS_1_120_TX_1 -1175,FOODS_1_120_TX_2 -1176,FOODS_1_120_TX_3 -1177,FOODS_1_120_WI_1 -1178,FOODS_1_120_WI_2 -1179,FOODS_1_120_WI_3 -1180,FOODS_1_121_CA_1 -1181,FOODS_1_121_CA_2 -1182,FOODS_1_121_CA_3 -1183,FOODS_1_121_CA_4 -1184,FOODS_1_121_TX_1 -1185,FOODS_1_121_TX_2 -1186,FOODS_1_121_TX_3 -1187,FOODS_1_121_WI_1 -1188,FOODS_1_121_WI_2 -1189,FOODS_1_121_WI_3 -1190,FOODS_1_122_CA_1 -1191,FOODS_1_122_CA_2 -1192,FOODS_1_122_CA_3 
-1193,FOODS_1_122_CA_4 -1194,FOODS_1_122_TX_1 -1195,FOODS_1_122_TX_2 -1196,FOODS_1_122_TX_3 -1197,FOODS_1_122_WI_1 -1198,FOODS_1_122_WI_2 -1199,FOODS_1_122_WI_3 -1200,FOODS_1_123_CA_1 -1201,FOODS_1_123_CA_2 -1202,FOODS_1_123_CA_3 -1203,FOODS_1_123_CA_4 -1204,FOODS_1_123_TX_1 -1205,FOODS_1_123_TX_2 -1206,FOODS_1_123_TX_3 -1207,FOODS_1_123_WI_1 -1208,FOODS_1_123_WI_2 -1209,FOODS_1_123_WI_3 -1210,FOODS_1_124_CA_1 -1211,FOODS_1_124_CA_2 -1212,FOODS_1_124_CA_3 -1213,FOODS_1_124_CA_4 -1214,FOODS_1_124_TX_1 -1215,FOODS_1_124_TX_2 -1216,FOODS_1_124_TX_3 -1217,FOODS_1_124_WI_1 -1218,FOODS_1_124_WI_2 -1219,FOODS_1_124_WI_3 -1220,FOODS_1_125_CA_1 -1221,FOODS_1_125_CA_2 -1222,FOODS_1_125_CA_3 -1223,FOODS_1_125_CA_4 -1224,FOODS_1_125_TX_1 -1225,FOODS_1_125_TX_2 -1226,FOODS_1_125_TX_3 -1227,FOODS_1_125_WI_1 -1228,FOODS_1_125_WI_2 -1229,FOODS_1_125_WI_3 -1230,FOODS_1_126_CA_1 -1231,FOODS_1_126_CA_2 -1232,FOODS_1_126_CA_3 -1233,FOODS_1_126_CA_4 -1234,FOODS_1_126_TX_1 -1235,FOODS_1_126_TX_2 -1236,FOODS_1_126_TX_3 -1237,FOODS_1_126_WI_1 -1238,FOODS_1_126_WI_2 -1239,FOODS_1_126_WI_3 -1240,FOODS_1_127_CA_1 -1241,FOODS_1_127_CA_2 -1242,FOODS_1_127_CA_3 -1243,FOODS_1_127_CA_4 -1244,FOODS_1_127_TX_1 -1245,FOODS_1_127_TX_2 -1246,FOODS_1_127_TX_3 -1247,FOODS_1_127_WI_1 -1248,FOODS_1_127_WI_2 -1249,FOODS_1_127_WI_3 -1250,FOODS_1_128_CA_1 -1251,FOODS_1_128_CA_2 -1252,FOODS_1_128_CA_3 -1253,FOODS_1_128_CA_4 -1254,FOODS_1_128_TX_1 -1255,FOODS_1_128_TX_2 -1256,FOODS_1_128_TX_3 -1257,FOODS_1_128_WI_1 -1258,FOODS_1_128_WI_2 -1259,FOODS_1_128_WI_3 -1260,FOODS_1_129_CA_1 -1261,FOODS_1_129_CA_2 -1262,FOODS_1_129_CA_3 -1263,FOODS_1_129_CA_4 -1264,FOODS_1_129_TX_1 -1265,FOODS_1_129_TX_2 -1266,FOODS_1_129_TX_3 -1267,FOODS_1_129_WI_1 -1268,FOODS_1_129_WI_2 -1269,FOODS_1_129_WI_3 -1270,FOODS_1_130_CA_1 -1271,FOODS_1_130_CA_2 -1272,FOODS_1_130_CA_3 -1273,FOODS_1_130_CA_4 -1274,FOODS_1_130_TX_1 -1275,FOODS_1_130_TX_2 -1276,FOODS_1_130_TX_3 -1277,FOODS_1_130_WI_1 -1278,FOODS_1_130_WI_2 
-1279,FOODS_1_130_WI_3 -1280,FOODS_1_131_CA_1 -1281,FOODS_1_131_CA_2 -1282,FOODS_1_131_CA_3 -1283,FOODS_1_131_CA_4 -1284,FOODS_1_131_TX_1 -1285,FOODS_1_131_TX_2 -1286,FOODS_1_131_TX_3 -1287,FOODS_1_131_WI_1 -1288,FOODS_1_131_WI_2 -1289,FOODS_1_131_WI_3 -1290,FOODS_1_132_CA_1 -1291,FOODS_1_132_CA_2 -1292,FOODS_1_132_CA_3 -1293,FOODS_1_132_CA_4 -1294,FOODS_1_132_TX_1 -1295,FOODS_1_132_TX_2 -1296,FOODS_1_132_TX_3 -1297,FOODS_1_132_WI_1 -1298,FOODS_1_132_WI_2 -1299,FOODS_1_132_WI_3 -1300,FOODS_1_133_CA_1 -1301,FOODS_1_133_CA_2 -1302,FOODS_1_133_CA_3 -1303,FOODS_1_133_CA_4 -1304,FOODS_1_133_TX_1 -1305,FOODS_1_133_TX_2 -1306,FOODS_1_133_TX_3 -1307,FOODS_1_133_WI_1 -1308,FOODS_1_133_WI_2 -1309,FOODS_1_133_WI_3 -1310,FOODS_1_134_CA_1 -1311,FOODS_1_134_CA_2 -1312,FOODS_1_134_CA_3 -1313,FOODS_1_134_CA_4 -1314,FOODS_1_134_TX_1 -1315,FOODS_1_134_TX_2 -1316,FOODS_1_134_TX_3 -1317,FOODS_1_134_WI_1 -1318,FOODS_1_134_WI_2 -1319,FOODS_1_134_WI_3 -1320,FOODS_1_135_CA_1 -1321,FOODS_1_135_CA_2 -1322,FOODS_1_135_CA_3 -1323,FOODS_1_135_CA_4 -1324,FOODS_1_135_TX_1 -1325,FOODS_1_135_TX_2 -1326,FOODS_1_135_TX_3 -1327,FOODS_1_135_WI_1 -1328,FOODS_1_135_WI_2 -1329,FOODS_1_135_WI_3 -1330,FOODS_1_136_CA_1 -1331,FOODS_1_136_CA_2 -1332,FOODS_1_136_CA_3 -1333,FOODS_1_136_CA_4 -1334,FOODS_1_136_TX_1 -1335,FOODS_1_136_TX_2 -1336,FOODS_1_136_TX_3 -1337,FOODS_1_136_WI_1 -1338,FOODS_1_136_WI_2 -1339,FOODS_1_136_WI_3 -1340,FOODS_1_137_CA_1 -1341,FOODS_1_137_CA_2 -1342,FOODS_1_137_CA_3 -1343,FOODS_1_137_CA_4 -1344,FOODS_1_137_TX_1 -1345,FOODS_1_137_TX_2 -1346,FOODS_1_137_TX_3 -1347,FOODS_1_137_WI_1 -1348,FOODS_1_137_WI_2 -1349,FOODS_1_137_WI_3 -1350,FOODS_1_138_CA_1 -1351,FOODS_1_138_CA_2 -1352,FOODS_1_138_CA_3 -1353,FOODS_1_138_CA_4 -1354,FOODS_1_138_TX_1 -1355,FOODS_1_138_TX_2 -1356,FOODS_1_138_TX_3 -1357,FOODS_1_138_WI_1 -1358,FOODS_1_138_WI_2 -1359,FOODS_1_138_WI_3 -1360,FOODS_1_139_CA_1 -1361,FOODS_1_139_CA_2 -1362,FOODS_1_139_CA_3 -1363,FOODS_1_139_CA_4 -1364,FOODS_1_139_TX_1 
-1365,FOODS_1_139_TX_2 -1366,FOODS_1_139_TX_3 -1367,FOODS_1_139_WI_1 -1368,FOODS_1_139_WI_2 -1369,FOODS_1_139_WI_3 -1370,FOODS_1_140_CA_1 -1371,FOODS_1_140_CA_2 -1372,FOODS_1_140_CA_3 -1373,FOODS_1_140_CA_4 -1374,FOODS_1_140_TX_1 -1375,FOODS_1_140_TX_2 -1376,FOODS_1_140_TX_3 -1377,FOODS_1_140_WI_1 -1378,FOODS_1_140_WI_2 -1379,FOODS_1_140_WI_3 -1380,FOODS_1_141_CA_1 -1381,FOODS_1_141_CA_2 -1382,FOODS_1_141_CA_3 -1383,FOODS_1_141_CA_4 -1384,FOODS_1_141_TX_1 -1385,FOODS_1_141_TX_2 -1386,FOODS_1_141_TX_3 -1387,FOODS_1_141_WI_1 -1388,FOODS_1_141_WI_2 -1389,FOODS_1_141_WI_3 -1390,FOODS_1_142_CA_1 -1391,FOODS_1_142_CA_2 -1392,FOODS_1_142_CA_3 -1393,FOODS_1_142_CA_4 -1394,FOODS_1_142_TX_1 -1395,FOODS_1_142_TX_2 -1396,FOODS_1_142_TX_3 -1397,FOODS_1_142_WI_1 -1398,FOODS_1_142_WI_2 -1399,FOODS_1_142_WI_3 -1400,FOODS_1_143_CA_1 -1401,FOODS_1_143_CA_2 -1402,FOODS_1_143_CA_3 -1403,FOODS_1_143_CA_4 -1404,FOODS_1_143_TX_1 -1405,FOODS_1_143_TX_2 -1406,FOODS_1_143_TX_3 -1407,FOODS_1_143_WI_1 -1408,FOODS_1_143_WI_2 -1409,FOODS_1_143_WI_3 -1410,FOODS_1_144_CA_1 -1411,FOODS_1_144_CA_2 -1412,FOODS_1_144_CA_3 -1413,FOODS_1_144_CA_4 -1414,FOODS_1_144_TX_1 -1415,FOODS_1_144_TX_2 -1416,FOODS_1_144_TX_3 -1417,FOODS_1_144_WI_1 -1418,FOODS_1_144_WI_2 -1419,FOODS_1_144_WI_3 -1420,FOODS_1_145_CA_1 -1421,FOODS_1_145_CA_2 -1422,FOODS_1_145_CA_3 -1423,FOODS_1_145_CA_4 -1424,FOODS_1_145_TX_1 -1425,FOODS_1_145_TX_2 -1426,FOODS_1_145_TX_3 -1427,FOODS_1_145_WI_1 -1428,FOODS_1_145_WI_2 -1429,FOODS_1_145_WI_3 -1430,FOODS_1_146_CA_1 -1431,FOODS_1_146_CA_2 -1432,FOODS_1_146_CA_3 -1433,FOODS_1_146_CA_4 -1434,FOODS_1_146_TX_1 -1435,FOODS_1_146_TX_2 -1436,FOODS_1_146_TX_3 -1437,FOODS_1_146_WI_1 -1438,FOODS_1_146_WI_2 -1439,FOODS_1_146_WI_3 -1440,FOODS_1_147_CA_1 -1441,FOODS_1_147_CA_2 -1442,FOODS_1_147_CA_3 -1443,FOODS_1_147_CA_4 -1444,FOODS_1_147_TX_1 -1445,FOODS_1_147_TX_2 -1446,FOODS_1_147_TX_3 -1447,FOODS_1_147_WI_1 -1448,FOODS_1_147_WI_2 -1449,FOODS_1_147_WI_3 -1450,FOODS_1_148_CA_1 
-1451,FOODS_1_148_CA_2 -1452,FOODS_1_148_CA_3 -1453,FOODS_1_148_CA_4 -1454,FOODS_1_148_TX_1 -1455,FOODS_1_148_TX_2 -1456,FOODS_1_148_TX_3 -1457,FOODS_1_148_WI_1 -1458,FOODS_1_148_WI_2 -1459,FOODS_1_148_WI_3 -1460,FOODS_1_149_CA_1 -1461,FOODS_1_149_CA_2 -1462,FOODS_1_149_CA_3 -1463,FOODS_1_149_CA_4 -1464,FOODS_1_149_TX_1 -1465,FOODS_1_149_TX_2 -1466,FOODS_1_149_TX_3 -1467,FOODS_1_149_WI_1 -1468,FOODS_1_149_WI_2 -1469,FOODS_1_149_WI_3 -1470,FOODS_1_150_CA_1 -1471,FOODS_1_150_CA_2 -1472,FOODS_1_150_CA_3 -1473,FOODS_1_150_CA_4 -1474,FOODS_1_150_TX_1 -1475,FOODS_1_150_TX_2 -1476,FOODS_1_150_TX_3 -1477,FOODS_1_150_WI_1 -1478,FOODS_1_150_WI_2 -1479,FOODS_1_150_WI_3 -1480,FOODS_1_151_CA_1 -1481,FOODS_1_151_CA_2 -1482,FOODS_1_151_CA_3 -1483,FOODS_1_151_CA_4 -1484,FOODS_1_151_TX_1 -1485,FOODS_1_151_TX_2 -1486,FOODS_1_151_TX_3 -1487,FOODS_1_151_WI_1 -1488,FOODS_1_151_WI_2 -1489,FOODS_1_151_WI_3 -1490,FOODS_1_152_CA_1 -1491,FOODS_1_152_CA_2 -1492,FOODS_1_152_CA_3 -1493,FOODS_1_152_CA_4 -1494,FOODS_1_152_TX_1 -1495,FOODS_1_152_TX_2 -1496,FOODS_1_152_TX_3 -1497,FOODS_1_152_WI_1 -1498,FOODS_1_152_WI_2 -1499,FOODS_1_152_WI_3 -1500,FOODS_1_153_CA_1 -1501,FOODS_1_153_CA_2 -1502,FOODS_1_153_CA_3 -1503,FOODS_1_153_CA_4 -1504,FOODS_1_153_TX_1 -1505,FOODS_1_153_TX_2 -1506,FOODS_1_153_TX_3 -1507,FOODS_1_153_WI_1 -1508,FOODS_1_153_WI_2 -1509,FOODS_1_153_WI_3 -1510,FOODS_1_154_CA_1 -1511,FOODS_1_154_CA_2 -1512,FOODS_1_154_CA_3 -1513,FOODS_1_154_CA_4 -1514,FOODS_1_154_TX_1 -1515,FOODS_1_154_TX_2 -1516,FOODS_1_154_TX_3 -1517,FOODS_1_154_WI_1 -1518,FOODS_1_154_WI_2 -1519,FOODS_1_154_WI_3 -1520,FOODS_1_155_CA_1 -1521,FOODS_1_155_CA_2 -1522,FOODS_1_155_CA_3 -1523,FOODS_1_155_CA_4 -1524,FOODS_1_155_TX_1 -1525,FOODS_1_155_TX_2 -1526,FOODS_1_155_TX_3 -1527,FOODS_1_155_WI_1 -1528,FOODS_1_155_WI_2 -1529,FOODS_1_155_WI_3 -1530,FOODS_1_156_CA_1 -1531,FOODS_1_156_CA_2 -1532,FOODS_1_156_CA_3 -1533,FOODS_1_156_CA_4 -1534,FOODS_1_156_TX_1 -1535,FOODS_1_156_TX_2 -1536,FOODS_1_156_TX_3 
-1537,FOODS_1_156_WI_1 -1538,FOODS_1_156_WI_2 -1539,FOODS_1_156_WI_3 -1540,FOODS_1_157_CA_1 -1541,FOODS_1_157_CA_2 -1542,FOODS_1_157_CA_3 -1543,FOODS_1_157_CA_4 -1544,FOODS_1_157_TX_1 -1545,FOODS_1_157_TX_2 -1546,FOODS_1_157_TX_3 -1547,FOODS_1_157_WI_1 -1548,FOODS_1_157_WI_2 -1549,FOODS_1_157_WI_3 -1550,FOODS_1_158_CA_1 -1551,FOODS_1_158_CA_2 -1552,FOODS_1_158_CA_3 -1553,FOODS_1_158_CA_4 -1554,FOODS_1_158_TX_1 -1555,FOODS_1_158_TX_2 -1556,FOODS_1_158_TX_3 -1557,FOODS_1_158_WI_1 -1558,FOODS_1_158_WI_2 -1559,FOODS_1_158_WI_3 -1560,FOODS_1_159_CA_1 -1561,FOODS_1_159_CA_2 -1562,FOODS_1_159_CA_3 -1563,FOODS_1_159_CA_4 -1564,FOODS_1_159_TX_1 -1565,FOODS_1_159_TX_2 -1566,FOODS_1_159_TX_3 -1567,FOODS_1_159_WI_1 -1568,FOODS_1_159_WI_2 -1569,FOODS_1_159_WI_3 -1570,FOODS_1_160_CA_1 -1571,FOODS_1_160_CA_2 -1572,FOODS_1_160_CA_3 -1573,FOODS_1_160_CA_4 -1574,FOODS_1_160_TX_1 -1575,FOODS_1_160_TX_2 -1576,FOODS_1_160_TX_3 -1577,FOODS_1_160_WI_1 -1578,FOODS_1_160_WI_2 -1579,FOODS_1_160_WI_3 -1580,FOODS_1_161_CA_1 -1581,FOODS_1_161_CA_2 -1582,FOODS_1_161_CA_3 -1583,FOODS_1_161_CA_4 -1584,FOODS_1_161_TX_1 -1585,FOODS_1_161_TX_2 -1586,FOODS_1_161_TX_3 -1587,FOODS_1_161_WI_1 -1588,FOODS_1_161_WI_2 -1589,FOODS_1_161_WI_3 -1590,FOODS_1_162_CA_1 -1591,FOODS_1_162_CA_2 -1592,FOODS_1_162_CA_3 -1593,FOODS_1_162_CA_4 -1594,FOODS_1_162_TX_1 -1595,FOODS_1_162_TX_2 -1596,FOODS_1_162_TX_3 -1597,FOODS_1_162_WI_1 -1598,FOODS_1_162_WI_2 -1599,FOODS_1_162_WI_3 -1600,FOODS_1_163_CA_1 -1601,FOODS_1_163_CA_2 -1602,FOODS_1_163_CA_3 -1603,FOODS_1_163_CA_4 -1604,FOODS_1_163_TX_1 -1605,FOODS_1_163_TX_2 -1606,FOODS_1_163_TX_3 -1607,FOODS_1_163_WI_1 -1608,FOODS_1_163_WI_2 -1609,FOODS_1_163_WI_3 -1610,FOODS_1_164_CA_1 -1611,FOODS_1_164_CA_2 -1612,FOODS_1_164_CA_3 -1613,FOODS_1_164_CA_4 -1614,FOODS_1_164_TX_1 -1615,FOODS_1_164_TX_2 -1616,FOODS_1_164_TX_3 -1617,FOODS_1_164_WI_1 -1618,FOODS_1_164_WI_2 -1619,FOODS_1_164_WI_3 -1620,FOODS_1_166_CA_1 -1621,FOODS_1_166_CA_2 -1622,FOODS_1_166_CA_3 
-1623,FOODS_1_166_CA_4 -1624,FOODS_1_166_TX_1 -1625,FOODS_1_166_TX_2 -1626,FOODS_1_166_TX_3 -1627,FOODS_1_166_WI_1 -1628,FOODS_1_166_WI_2 -1629,FOODS_1_166_WI_3 -1630,FOODS_1_167_CA_1 -1631,FOODS_1_167_CA_2 -1632,FOODS_1_167_CA_3 -1633,FOODS_1_167_CA_4 -1634,FOODS_1_167_TX_1 -1635,FOODS_1_167_TX_2 -1636,FOODS_1_167_TX_3 -1637,FOODS_1_167_WI_1 -1638,FOODS_1_167_WI_2 -1639,FOODS_1_167_WI_3 -1640,FOODS_1_168_CA_1 -1641,FOODS_1_168_CA_2 -1642,FOODS_1_168_CA_3 -1643,FOODS_1_168_CA_4 -1644,FOODS_1_168_TX_1 -1645,FOODS_1_168_TX_2 -1646,FOODS_1_168_TX_3 -1647,FOODS_1_168_WI_1 -1648,FOODS_1_168_WI_2 -1649,FOODS_1_168_WI_3 -1650,FOODS_1_169_CA_1 -1651,FOODS_1_169_CA_2 -1652,FOODS_1_169_CA_3 -1653,FOODS_1_169_CA_4 -1654,FOODS_1_169_TX_1 -1655,FOODS_1_169_TX_2 -1656,FOODS_1_169_TX_3 -1657,FOODS_1_169_WI_1 -1658,FOODS_1_169_WI_2 -1659,FOODS_1_169_WI_3 -1660,FOODS_1_170_CA_1 -1661,FOODS_1_170_CA_2 -1662,FOODS_1_170_CA_3 -1663,FOODS_1_170_CA_4 -1664,FOODS_1_170_TX_1 -1665,FOODS_1_170_TX_2 -1666,FOODS_1_170_TX_3 -1667,FOODS_1_170_WI_1 -1668,FOODS_1_170_WI_2 -1669,FOODS_1_170_WI_3 -1670,FOODS_1_171_CA_1 -1671,FOODS_1_171_CA_2 -1672,FOODS_1_171_CA_3 -1673,FOODS_1_171_CA_4 -1674,FOODS_1_171_TX_1 -1675,FOODS_1_171_TX_2 -1676,FOODS_1_171_TX_3 -1677,FOODS_1_171_WI_1 -1678,FOODS_1_171_WI_2 -1679,FOODS_1_171_WI_3 -1680,FOODS_1_172_CA_1 -1681,FOODS_1_172_CA_2 -1682,FOODS_1_172_CA_3 -1683,FOODS_1_172_CA_4 -1684,FOODS_1_172_TX_1 -1685,FOODS_1_172_TX_2 -1686,FOODS_1_172_TX_3 -1687,FOODS_1_172_WI_1 -1688,FOODS_1_172_WI_2 -1689,FOODS_1_172_WI_3 -1690,FOODS_1_173_CA_1 -1691,FOODS_1_173_CA_2 -1692,FOODS_1_173_CA_3 -1693,FOODS_1_173_CA_4 -1694,FOODS_1_173_TX_1 -1695,FOODS_1_173_TX_2 -1696,FOODS_1_173_TX_3 -1697,FOODS_1_173_WI_1 -1698,FOODS_1_173_WI_2 -1699,FOODS_1_173_WI_3 -1700,FOODS_1_174_CA_1 -1701,FOODS_1_174_CA_2 -1702,FOODS_1_174_CA_3 -1703,FOODS_1_174_CA_4 -1704,FOODS_1_174_TX_1 -1705,FOODS_1_174_TX_2 -1706,FOODS_1_174_TX_3 -1707,FOODS_1_174_WI_1 -1708,FOODS_1_174_WI_2 
-1709,FOODS_1_174_WI_3 -1710,FOODS_1_175_CA_1 -1711,FOODS_1_175_CA_2 -1712,FOODS_1_175_CA_3 -1713,FOODS_1_175_CA_4 -1714,FOODS_1_175_TX_1 -1715,FOODS_1_175_TX_2 -1716,FOODS_1_175_TX_3 -1717,FOODS_1_175_WI_1 -1718,FOODS_1_175_WI_2 -1719,FOODS_1_175_WI_3 -1720,FOODS_1_176_CA_1 -1721,FOODS_1_176_CA_2 -1722,FOODS_1_176_CA_3 -1723,FOODS_1_176_CA_4 -1724,FOODS_1_176_TX_1 -1725,FOODS_1_176_TX_2 -1726,FOODS_1_176_TX_3 -1727,FOODS_1_176_WI_1 -1728,FOODS_1_176_WI_2 -1729,FOODS_1_176_WI_3 -1730,FOODS_1_177_CA_1 -1731,FOODS_1_177_CA_2 -1732,FOODS_1_177_CA_3 -1733,FOODS_1_177_CA_4 -1734,FOODS_1_177_TX_1 -1735,FOODS_1_177_TX_2 -1736,FOODS_1_177_TX_3 -1737,FOODS_1_177_WI_1 -1738,FOODS_1_177_WI_2 -1739,FOODS_1_177_WI_3 -1740,FOODS_1_178_CA_1 -1741,FOODS_1_178_CA_2 -1742,FOODS_1_178_CA_3 -1743,FOODS_1_178_CA_4 -1744,FOODS_1_178_TX_1 -1745,FOODS_1_178_TX_2 -1746,FOODS_1_178_TX_3 -1747,FOODS_1_178_WI_1 -1748,FOODS_1_178_WI_2 -1749,FOODS_1_178_WI_3 -1750,FOODS_1_179_CA_1 -1751,FOODS_1_179_CA_2 -1752,FOODS_1_179_CA_3 -1753,FOODS_1_179_CA_4 -1754,FOODS_1_179_TX_1 -1755,FOODS_1_179_TX_2 -1756,FOODS_1_179_TX_3 -1757,FOODS_1_179_WI_1 -1758,FOODS_1_179_WI_2 -1759,FOODS_1_179_WI_3 -1760,FOODS_1_180_CA_1 -1761,FOODS_1_180_CA_2 -1762,FOODS_1_180_CA_3 -1763,FOODS_1_180_CA_4 -1764,FOODS_1_180_TX_1 -1765,FOODS_1_180_TX_2 -1766,FOODS_1_180_TX_3 -1767,FOODS_1_180_WI_1 -1768,FOODS_1_180_WI_2 -1769,FOODS_1_180_WI_3 -1770,FOODS_1_181_CA_1 -1771,FOODS_1_181_CA_2 -1772,FOODS_1_181_CA_3 -1773,FOODS_1_181_CA_4 -1774,FOODS_1_181_TX_1 -1775,FOODS_1_181_TX_2 -1776,FOODS_1_181_TX_3 -1777,FOODS_1_181_WI_1 -1778,FOODS_1_181_WI_2 -1779,FOODS_1_181_WI_3 -1780,FOODS_1_182_CA_1 -1781,FOODS_1_182_CA_2 -1782,FOODS_1_182_CA_3 -1783,FOODS_1_182_CA_4 -1784,FOODS_1_182_TX_1 -1785,FOODS_1_182_TX_2 -1786,FOODS_1_182_TX_3 -1787,FOODS_1_182_WI_1 -1788,FOODS_1_182_WI_2 -1789,FOODS_1_182_WI_3 -1790,FOODS_1_183_CA_1 -1791,FOODS_1_183_CA_2 -1792,FOODS_1_183_CA_3 -1793,FOODS_1_183_CA_4 -1794,FOODS_1_183_TX_1 
-1795,FOODS_1_183_TX_2 -1796,FOODS_1_183_TX_3 -1797,FOODS_1_183_WI_1 -1798,FOODS_1_183_WI_2 -1799,FOODS_1_183_WI_3 -1800,FOODS_1_184_CA_1 -1801,FOODS_1_184_CA_2 -1802,FOODS_1_184_CA_3 -1803,FOODS_1_184_CA_4 -1804,FOODS_1_184_TX_1 -1805,FOODS_1_184_TX_2 -1806,FOODS_1_184_TX_3 -1807,FOODS_1_184_WI_1 -1808,FOODS_1_184_WI_2 -1809,FOODS_1_184_WI_3 -1810,FOODS_1_185_CA_1 -1811,FOODS_1_185_CA_2 -1812,FOODS_1_185_CA_3 -1813,FOODS_1_185_CA_4 -1814,FOODS_1_185_TX_1 -1815,FOODS_1_185_TX_2 -1816,FOODS_1_185_TX_3 -1817,FOODS_1_185_WI_1 -1818,FOODS_1_185_WI_2 -1819,FOODS_1_185_WI_3 -1820,FOODS_1_186_CA_1 -1821,FOODS_1_186_CA_2 -1822,FOODS_1_186_CA_3 -1823,FOODS_1_186_CA_4 -1824,FOODS_1_186_TX_1 -1825,FOODS_1_186_TX_2 -1826,FOODS_1_186_TX_3 -1827,FOODS_1_186_WI_1 -1828,FOODS_1_186_WI_2 -1829,FOODS_1_186_WI_3 -1830,FOODS_1_187_CA_1 -1831,FOODS_1_187_CA_2 -1832,FOODS_1_187_CA_3 -1833,FOODS_1_187_CA_4 -1834,FOODS_1_187_TX_1 -1835,FOODS_1_187_TX_2 -1836,FOODS_1_187_TX_3 -1837,FOODS_1_187_WI_1 -1838,FOODS_1_187_WI_2 -1839,FOODS_1_187_WI_3 -1840,FOODS_1_188_CA_1 -1841,FOODS_1_188_CA_2 -1842,FOODS_1_188_CA_3 -1843,FOODS_1_188_CA_4 -1844,FOODS_1_188_TX_1 -1845,FOODS_1_188_TX_2 -1846,FOODS_1_188_TX_3 -1847,FOODS_1_188_WI_1 -1848,FOODS_1_188_WI_2 -1849,FOODS_1_188_WI_3 -1850,FOODS_1_189_CA_1 -1851,FOODS_1_189_CA_2 -1852,FOODS_1_189_CA_3 -1853,FOODS_1_189_CA_4 -1854,FOODS_1_189_TX_1 -1855,FOODS_1_189_TX_2 -1856,FOODS_1_189_TX_3 -1857,FOODS_1_189_WI_1 -1858,FOODS_1_189_WI_2 -1859,FOODS_1_189_WI_3 -1860,FOODS_1_190_CA_1 -1861,FOODS_1_190_CA_2 -1862,FOODS_1_190_CA_3 -1863,FOODS_1_190_CA_4 -1864,FOODS_1_190_TX_1 -1865,FOODS_1_190_TX_2 -1866,FOODS_1_190_TX_3 -1867,FOODS_1_190_WI_1 -1868,FOODS_1_190_WI_2 -1869,FOODS_1_190_WI_3 -1870,FOODS_1_191_CA_1 -1871,FOODS_1_191_CA_2 -1872,FOODS_1_191_CA_3 -1873,FOODS_1_191_CA_4 -1874,FOODS_1_191_TX_1 -1875,FOODS_1_191_TX_2 -1876,FOODS_1_191_TX_3 -1877,FOODS_1_191_WI_1 -1878,FOODS_1_191_WI_2 -1879,FOODS_1_191_WI_3 -1880,FOODS_1_192_CA_1 
-1881,FOODS_1_192_CA_2 -1882,FOODS_1_192_CA_3 -1883,FOODS_1_192_CA_4 -1884,FOODS_1_192_TX_1 -1885,FOODS_1_192_TX_2 -1886,FOODS_1_192_TX_3 -1887,FOODS_1_192_WI_1 -1888,FOODS_1_192_WI_2 -1889,FOODS_1_192_WI_3 -1890,FOODS_1_193_CA_1 -1891,FOODS_1_193_CA_2 -1892,FOODS_1_193_CA_3 -1893,FOODS_1_193_CA_4 -1894,FOODS_1_193_TX_1 -1895,FOODS_1_193_TX_2 -1896,FOODS_1_193_TX_3 -1897,FOODS_1_193_WI_1 -1898,FOODS_1_193_WI_2 -1899,FOODS_1_193_WI_3 -1900,FOODS_1_194_CA_1 -1901,FOODS_1_194_CA_2 -1902,FOODS_1_194_CA_3 -1903,FOODS_1_194_CA_4 -1904,FOODS_1_194_TX_1 -1905,FOODS_1_194_TX_2 -1906,FOODS_1_194_TX_3 -1907,FOODS_1_194_WI_1 -1908,FOODS_1_194_WI_2 -1909,FOODS_1_194_WI_3 -1910,FOODS_1_195_CA_1 -1911,FOODS_1_195_CA_2 -1912,FOODS_1_195_CA_3 -1913,FOODS_1_195_CA_4 -1914,FOODS_1_195_TX_1 -1915,FOODS_1_195_TX_2 -1916,FOODS_1_195_TX_3 -1917,FOODS_1_195_WI_1 -1918,FOODS_1_195_WI_2 -1919,FOODS_1_195_WI_3 -1920,FOODS_1_196_CA_1 -1921,FOODS_1_196_CA_2 -1922,FOODS_1_196_CA_3 -1923,FOODS_1_196_CA_4 -1924,FOODS_1_196_TX_1 -1925,FOODS_1_196_TX_2 -1926,FOODS_1_196_TX_3 -1927,FOODS_1_196_WI_1 -1928,FOODS_1_196_WI_2 -1929,FOODS_1_196_WI_3 -1930,FOODS_1_197_CA_1 -1931,FOODS_1_197_CA_2 -1932,FOODS_1_197_CA_3 -1933,FOODS_1_197_CA_4 -1934,FOODS_1_197_TX_1 -1935,FOODS_1_197_TX_2 -1936,FOODS_1_197_TX_3 -1937,FOODS_1_197_WI_1 -1938,FOODS_1_197_WI_2 -1939,FOODS_1_197_WI_3 -1940,FOODS_1_198_CA_1 -1941,FOODS_1_198_CA_2 -1942,FOODS_1_198_CA_3 -1943,FOODS_1_198_CA_4 -1944,FOODS_1_198_TX_1 -1945,FOODS_1_198_TX_2 -1946,FOODS_1_198_TX_3 -1947,FOODS_1_198_WI_1 -1948,FOODS_1_198_WI_2 -1949,FOODS_1_198_WI_3 -1950,FOODS_1_199_CA_1 -1951,FOODS_1_199_CA_2 -1952,FOODS_1_199_CA_3 -1953,FOODS_1_199_CA_4 -1954,FOODS_1_199_TX_1 -1955,FOODS_1_199_TX_2 -1956,FOODS_1_199_TX_3 -1957,FOODS_1_199_WI_1 -1958,FOODS_1_199_WI_2 -1959,FOODS_1_199_WI_3 -1960,FOODS_1_200_CA_1 -1961,FOODS_1_200_CA_2 -1962,FOODS_1_200_CA_3 -1963,FOODS_1_200_CA_4 -1964,FOODS_1_200_TX_1 -1965,FOODS_1_200_TX_2 -1966,FOODS_1_200_TX_3 
-1967,FOODS_1_200_WI_1 -1968,FOODS_1_200_WI_2 -1969,FOODS_1_200_WI_3 -1970,FOODS_1_201_CA_1 -1971,FOODS_1_201_CA_2 -1972,FOODS_1_201_CA_3 -1973,FOODS_1_201_CA_4 -1974,FOODS_1_201_TX_1 -1975,FOODS_1_201_TX_2 -1976,FOODS_1_201_TX_3 -1977,FOODS_1_201_WI_1 -1978,FOODS_1_201_WI_2 -1979,FOODS_1_201_WI_3 -1980,FOODS_1_202_CA_1 -1981,FOODS_1_202_CA_2 -1982,FOODS_1_202_CA_3 -1983,FOODS_1_202_CA_4 -1984,FOODS_1_202_TX_1 -1985,FOODS_1_202_TX_2 -1986,FOODS_1_202_TX_3 -1987,FOODS_1_202_WI_1 -1988,FOODS_1_202_WI_2 -1989,FOODS_1_202_WI_3 -1990,FOODS_1_203_CA_1 -1991,FOODS_1_203_CA_2 -1992,FOODS_1_203_CA_3 -1993,FOODS_1_203_CA_4 -1994,FOODS_1_203_TX_1 -1995,FOODS_1_203_TX_2 -1996,FOODS_1_203_TX_3 -1997,FOODS_1_203_WI_1 -1998,FOODS_1_203_WI_2 -1999,FOODS_1_203_WI_3 -2000,FOODS_1_204_CA_1 -2001,FOODS_1_204_CA_2 -2002,FOODS_1_204_CA_3 -2003,FOODS_1_204_CA_4 -2004,FOODS_1_204_TX_1 -2005,FOODS_1_204_TX_2 -2006,FOODS_1_204_TX_3 -2007,FOODS_1_204_WI_1 -2008,FOODS_1_204_WI_2 -2009,FOODS_1_204_WI_3 -2010,FOODS_1_205_CA_1 -2011,FOODS_1_205_CA_2 -2012,FOODS_1_205_CA_3 -2013,FOODS_1_205_CA_4 -2014,FOODS_1_205_TX_1 -2015,FOODS_1_205_TX_2 -2016,FOODS_1_205_TX_3 -2017,FOODS_1_205_WI_1 -2018,FOODS_1_205_WI_2 -2019,FOODS_1_205_WI_3 -2020,FOODS_1_206_CA_1 -2021,FOODS_1_206_CA_2 -2022,FOODS_1_206_CA_3 -2023,FOODS_1_206_CA_4 -2024,FOODS_1_206_TX_1 -2025,FOODS_1_206_TX_2 -2026,FOODS_1_206_TX_3 -2027,FOODS_1_206_WI_1 -2028,FOODS_1_206_WI_2 -2029,FOODS_1_206_WI_3 -2030,FOODS_1_207_CA_1 -2031,FOODS_1_207_CA_2 -2032,FOODS_1_207_CA_3 -2033,FOODS_1_207_CA_4 -2034,FOODS_1_207_TX_1 -2035,FOODS_1_207_TX_2 -2036,FOODS_1_207_TX_3 -2037,FOODS_1_207_WI_1 -2038,FOODS_1_207_WI_2 -2039,FOODS_1_207_WI_3 -2040,FOODS_1_208_CA_1 -2041,FOODS_1_208_CA_2 -2042,FOODS_1_208_CA_3 -2043,FOODS_1_208_CA_4 -2044,FOODS_1_208_TX_1 -2045,FOODS_1_208_TX_2 -2046,FOODS_1_208_TX_3 -2047,FOODS_1_208_WI_1 -2048,FOODS_1_208_WI_2 -2049,FOODS_1_208_WI_3 -2050,FOODS_1_209_CA_1 -2051,FOODS_1_209_CA_2 -2052,FOODS_1_209_CA_3 
-2053,FOODS_1_209_CA_4 -2054,FOODS_1_209_TX_1 -2055,FOODS_1_209_TX_2 -2056,FOODS_1_209_TX_3 -2057,FOODS_1_209_WI_1 -2058,FOODS_1_209_WI_2 -2059,FOODS_1_209_WI_3 -2060,FOODS_1_210_CA_1 -2061,FOODS_1_210_CA_2 -2062,FOODS_1_210_CA_3 -2063,FOODS_1_210_CA_4 -2064,FOODS_1_210_TX_1 -2065,FOODS_1_210_TX_2 -2066,FOODS_1_210_TX_3 -2067,FOODS_1_210_WI_1 -2068,FOODS_1_210_WI_2 -2069,FOODS_1_210_WI_3 -2070,FOODS_1_211_CA_1 -2071,FOODS_1_211_CA_2 -2072,FOODS_1_211_CA_3 -2073,FOODS_1_211_CA_4 -2074,FOODS_1_211_TX_1 -2075,FOODS_1_211_TX_2 -2076,FOODS_1_211_TX_3 -2077,FOODS_1_211_WI_1 -2078,FOODS_1_211_WI_2 -2079,FOODS_1_211_WI_3 -2080,FOODS_1_212_CA_1 -2081,FOODS_1_212_CA_2 -2082,FOODS_1_212_CA_3 -2083,FOODS_1_212_CA_4 -2084,FOODS_1_212_TX_1 -2085,FOODS_1_212_TX_2 -2086,FOODS_1_212_TX_3 -2087,FOODS_1_212_WI_1 -2088,FOODS_1_212_WI_2 -2089,FOODS_1_212_WI_3 -2090,FOODS_1_213_CA_1 -2091,FOODS_1_213_CA_2 -2092,FOODS_1_213_CA_3 -2093,FOODS_1_213_CA_4 -2094,FOODS_1_213_TX_1 -2095,FOODS_1_213_TX_2 -2096,FOODS_1_213_TX_3 -2097,FOODS_1_213_WI_1 -2098,FOODS_1_213_WI_2 -2099,FOODS_1_213_WI_3 -2100,FOODS_1_214_CA_1 -2101,FOODS_1_214_CA_2 -2102,FOODS_1_214_CA_3 -2103,FOODS_1_214_CA_4 -2104,FOODS_1_214_TX_1 -2105,FOODS_1_214_TX_2 -2106,FOODS_1_214_TX_3 -2107,FOODS_1_214_WI_1 -2108,FOODS_1_214_WI_2 -2109,FOODS_1_214_WI_3 -2110,FOODS_1_215_CA_1 -2111,FOODS_1_215_CA_2 -2112,FOODS_1_215_CA_3 -2113,FOODS_1_215_CA_4 -2114,FOODS_1_215_TX_1 -2115,FOODS_1_215_TX_2 -2116,FOODS_1_215_TX_3 -2117,FOODS_1_215_WI_1 -2118,FOODS_1_215_WI_2 -2119,FOODS_1_215_WI_3 -2120,FOODS_1_216_CA_1 -2121,FOODS_1_216_CA_2 -2122,FOODS_1_216_CA_3 -2123,FOODS_1_216_CA_4 -2124,FOODS_1_216_TX_1 -2125,FOODS_1_216_TX_2 -2126,FOODS_1_216_TX_3 -2127,FOODS_1_216_WI_1 -2128,FOODS_1_216_WI_2 -2129,FOODS_1_216_WI_3 -2130,FOODS_1_217_CA_1 -2131,FOODS_1_217_CA_2 -2132,FOODS_1_217_CA_3 -2133,FOODS_1_217_CA_4 -2134,FOODS_1_217_TX_1 -2135,FOODS_1_217_TX_2 -2136,FOODS_1_217_TX_3 -2137,FOODS_1_217_WI_1 -2138,FOODS_1_217_WI_2 
-2139,FOODS_1_217_WI_3 -2140,FOODS_1_218_CA_1 -2141,FOODS_1_218_CA_2 -2142,FOODS_1_218_CA_3 -2143,FOODS_1_218_CA_4 -2144,FOODS_1_218_TX_1 -2145,FOODS_1_218_TX_2 -2146,FOODS_1_218_TX_3 -2147,FOODS_1_218_WI_1 -2148,FOODS_1_218_WI_2 -2149,FOODS_1_218_WI_3 -2150,FOODS_1_219_CA_1 -2151,FOODS_1_219_CA_2 -2152,FOODS_1_219_CA_3 -2153,FOODS_1_219_CA_4 -2154,FOODS_1_219_TX_1 -2155,FOODS_1_219_TX_2 -2156,FOODS_1_219_TX_3 -2157,FOODS_1_219_WI_1 -2158,FOODS_1_219_WI_2 -2159,FOODS_1_219_WI_3 -2160,FOODS_2_001_CA_1 -2161,FOODS_2_001_CA_2 -2162,FOODS_2_001_CA_3 -2163,FOODS_2_001_CA_4 -2164,FOODS_2_001_TX_1 -2165,FOODS_2_001_TX_2 -2166,FOODS_2_001_TX_3 -2167,FOODS_2_001_WI_1 -2168,FOODS_2_001_WI_2 -2169,FOODS_2_001_WI_3 -2170,FOODS_2_002_CA_1 -2171,FOODS_2_002_CA_2 -2172,FOODS_2_002_CA_3 -2173,FOODS_2_002_CA_4 -2174,FOODS_2_002_TX_1 -2175,FOODS_2_002_TX_2 -2176,FOODS_2_002_TX_3 -2177,FOODS_2_002_WI_1 -2178,FOODS_2_002_WI_2 -2179,FOODS_2_002_WI_3 -2180,FOODS_2_003_CA_1 -2181,FOODS_2_003_CA_2 -2182,FOODS_2_003_CA_3 -2183,FOODS_2_003_CA_4 -2184,FOODS_2_003_TX_1 -2185,FOODS_2_003_TX_2 -2186,FOODS_2_003_TX_3 -2187,FOODS_2_003_WI_1 -2188,FOODS_2_003_WI_2 -2189,FOODS_2_003_WI_3 -2190,FOODS_2_004_CA_1 -2191,FOODS_2_004_CA_2 -2192,FOODS_2_004_CA_3 -2193,FOODS_2_004_CA_4 -2194,FOODS_2_004_TX_1 -2195,FOODS_2_004_TX_2 -2196,FOODS_2_004_TX_3 -2197,FOODS_2_004_WI_1 -2198,FOODS_2_004_WI_2 -2199,FOODS_2_004_WI_3 -2200,FOODS_2_005_CA_1 -2201,FOODS_2_005_CA_2 -2202,FOODS_2_005_CA_3 -2203,FOODS_2_005_CA_4 -2204,FOODS_2_005_TX_1 -2205,FOODS_2_005_TX_2 -2206,FOODS_2_005_TX_3 -2207,FOODS_2_005_WI_1 -2208,FOODS_2_005_WI_2 -2209,FOODS_2_005_WI_3 -2210,FOODS_2_006_CA_1 -2211,FOODS_2_006_CA_2 -2212,FOODS_2_006_CA_3 -2213,FOODS_2_006_CA_4 -2214,FOODS_2_006_TX_1 -2215,FOODS_2_006_TX_2 -2216,FOODS_2_006_TX_3 -2217,FOODS_2_006_WI_1 -2218,FOODS_2_006_WI_2 -2219,FOODS_2_006_WI_3 -2220,FOODS_2_007_CA_1 -2221,FOODS_2_007_CA_2 -2222,FOODS_2_007_CA_3 -2223,FOODS_2_007_CA_4 -2224,FOODS_2_007_TX_1 
-2225,FOODS_2_007_TX_2 -2226,FOODS_2_007_TX_3 -2227,FOODS_2_007_WI_1 -2228,FOODS_2_007_WI_2 -2229,FOODS_2_007_WI_3 -2230,FOODS_2_008_CA_1 -2231,FOODS_2_008_CA_2 -2232,FOODS_2_008_CA_3 -2233,FOODS_2_008_CA_4 -2234,FOODS_2_008_TX_1 -2235,FOODS_2_008_TX_2 -2236,FOODS_2_008_TX_3 -2237,FOODS_2_008_WI_1 -2238,FOODS_2_008_WI_2 -2239,FOODS_2_008_WI_3 -2240,FOODS_2_009_CA_1 -2241,FOODS_2_009_CA_2 -2242,FOODS_2_009_CA_3 -2243,FOODS_2_009_CA_4 -2244,FOODS_2_009_TX_1 -2245,FOODS_2_009_TX_2 -2246,FOODS_2_009_TX_3 -2247,FOODS_2_009_WI_1 -2248,FOODS_2_009_WI_2 -2249,FOODS_2_009_WI_3 -2250,FOODS_2_010_CA_1 -2251,FOODS_2_010_CA_2 -2252,FOODS_2_010_CA_3 -2253,FOODS_2_010_CA_4 -2254,FOODS_2_010_TX_1 -2255,FOODS_2_010_TX_2 -2256,FOODS_2_010_TX_3 -2257,FOODS_2_010_WI_1 -2258,FOODS_2_010_WI_2 -2259,FOODS_2_010_WI_3 -2260,FOODS_2_011_CA_1 -2261,FOODS_2_011_CA_2 -2262,FOODS_2_011_CA_3 -2263,FOODS_2_011_CA_4 -2264,FOODS_2_011_TX_1 -2265,FOODS_2_011_TX_2 -2266,FOODS_2_011_TX_3 -2267,FOODS_2_011_WI_1 -2268,FOODS_2_011_WI_2 -2269,FOODS_2_011_WI_3 -2270,FOODS_2_012_CA_1 -2271,FOODS_2_012_CA_2 -2272,FOODS_2_012_CA_3 -2273,FOODS_2_012_CA_4 -2274,FOODS_2_012_TX_1 -2275,FOODS_2_012_TX_2 -2276,FOODS_2_012_TX_3 -2277,FOODS_2_012_WI_1 -2278,FOODS_2_012_WI_2 -2279,FOODS_2_012_WI_3 -2280,FOODS_2_013_CA_1 -2281,FOODS_2_013_CA_2 -2282,FOODS_2_013_CA_3 -2283,FOODS_2_013_CA_4 -2284,FOODS_2_013_TX_1 -2285,FOODS_2_013_TX_2 -2286,FOODS_2_013_TX_3 -2287,FOODS_2_013_WI_1 -2288,FOODS_2_013_WI_2 -2289,FOODS_2_013_WI_3 -2290,FOODS_2_014_CA_1 -2291,FOODS_2_014_CA_2 -2292,FOODS_2_014_CA_3 -2293,FOODS_2_014_CA_4 -2294,FOODS_2_014_TX_1 -2295,FOODS_2_014_TX_2 -2296,FOODS_2_014_TX_3 -2297,FOODS_2_014_WI_1 -2298,FOODS_2_014_WI_2 -2299,FOODS_2_014_WI_3 -2300,FOODS_2_015_CA_1 -2301,FOODS_2_015_CA_2 -2302,FOODS_2_015_CA_3 -2303,FOODS_2_015_CA_4 -2304,FOODS_2_015_TX_1 -2305,FOODS_2_015_TX_2 -2306,FOODS_2_015_TX_3 -2307,FOODS_2_015_WI_1 -2308,FOODS_2_015_WI_2 -2309,FOODS_2_015_WI_3 -2310,FOODS_2_016_CA_1 
-2311,FOODS_2_016_CA_2 -2312,FOODS_2_016_CA_3 -2313,FOODS_2_016_CA_4 -2314,FOODS_2_016_TX_1 -2315,FOODS_2_016_TX_2 -2316,FOODS_2_016_TX_3 -2317,FOODS_2_016_WI_1 -2318,FOODS_2_016_WI_2 -2319,FOODS_2_016_WI_3 -2320,FOODS_2_017_CA_1 -2321,FOODS_2_017_CA_2 -2322,FOODS_2_017_CA_3 -2323,FOODS_2_017_CA_4 -2324,FOODS_2_017_TX_1 -2325,FOODS_2_017_TX_2 -2326,FOODS_2_017_TX_3 -2327,FOODS_2_017_WI_1 -2328,FOODS_2_017_WI_2 -2329,FOODS_2_017_WI_3 -2330,FOODS_2_018_CA_1 -2331,FOODS_2_018_CA_2 -2332,FOODS_2_018_CA_3 -2333,FOODS_2_018_CA_4 -2334,FOODS_2_018_TX_1 -2335,FOODS_2_018_TX_2 -2336,FOODS_2_018_TX_3 -2337,FOODS_2_018_WI_1 -2338,FOODS_2_018_WI_2 -2339,FOODS_2_018_WI_3 -2340,FOODS_2_019_CA_1 -2341,FOODS_2_019_CA_2 -2342,FOODS_2_019_CA_3 -2343,FOODS_2_019_CA_4 -2344,FOODS_2_019_TX_1 -2345,FOODS_2_019_TX_2 -2346,FOODS_2_019_TX_3 -2347,FOODS_2_019_WI_1 -2348,FOODS_2_019_WI_2 -2349,FOODS_2_019_WI_3 -2350,FOODS_2_020_CA_1 -2351,FOODS_2_020_CA_2 -2352,FOODS_2_020_CA_3 -2353,FOODS_2_020_CA_4 -2354,FOODS_2_020_TX_1 -2355,FOODS_2_020_TX_2 -2356,FOODS_2_020_TX_3 -2357,FOODS_2_020_WI_1 -2358,FOODS_2_020_WI_2 -2359,FOODS_2_020_WI_3 -2360,FOODS_2_021_CA_1 -2361,FOODS_2_021_CA_2 -2362,FOODS_2_021_CA_3 -2363,FOODS_2_021_CA_4 -2364,FOODS_2_021_TX_1 -2365,FOODS_2_021_TX_2 -2366,FOODS_2_021_TX_3 -2367,FOODS_2_021_WI_1 -2368,FOODS_2_021_WI_2 -2369,FOODS_2_021_WI_3 -2370,FOODS_2_022_CA_1 -2371,FOODS_2_022_CA_2 -2372,FOODS_2_022_CA_3 -2373,FOODS_2_022_CA_4 -2374,FOODS_2_022_TX_1 -2375,FOODS_2_022_TX_2 -2376,FOODS_2_022_TX_3 -2377,FOODS_2_022_WI_1 -2378,FOODS_2_022_WI_2 -2379,FOODS_2_022_WI_3 -2380,FOODS_2_023_CA_1 -2381,FOODS_2_023_CA_2 -2382,FOODS_2_023_CA_3 -2383,FOODS_2_023_CA_4 -2384,FOODS_2_023_TX_1 -2385,FOODS_2_023_TX_2 -2386,FOODS_2_023_TX_3 -2387,FOODS_2_023_WI_1 -2388,FOODS_2_023_WI_2 -2389,FOODS_2_023_WI_3 -2390,FOODS_2_024_CA_1 -2391,FOODS_2_024_CA_2 -2392,FOODS_2_024_CA_3 -2393,FOODS_2_024_CA_4 -2394,FOODS_2_024_TX_1 -2395,FOODS_2_024_TX_2 -2396,FOODS_2_024_TX_3 
-2397,FOODS_2_024_WI_1 -2398,FOODS_2_024_WI_2 -2399,FOODS_2_024_WI_3 -2400,FOODS_2_025_CA_1 -2401,FOODS_2_025_CA_2 -2402,FOODS_2_025_CA_3 -2403,FOODS_2_025_CA_4 -2404,FOODS_2_025_TX_1 -2405,FOODS_2_025_TX_2 -2406,FOODS_2_025_TX_3 -2407,FOODS_2_025_WI_1 -2408,FOODS_2_025_WI_2 -2409,FOODS_2_025_WI_3 -2410,FOODS_2_026_CA_1 -2411,FOODS_2_026_CA_2 -2412,FOODS_2_026_CA_3 -2413,FOODS_2_026_CA_4 -2414,FOODS_2_026_TX_1 -2415,FOODS_2_026_TX_2 -2416,FOODS_2_026_TX_3 -2417,FOODS_2_026_WI_1 -2418,FOODS_2_026_WI_2 -2419,FOODS_2_026_WI_3 -2420,FOODS_2_027_CA_1 -2421,FOODS_2_027_CA_2 -2422,FOODS_2_027_CA_3 -2423,FOODS_2_027_CA_4 -2424,FOODS_2_027_TX_1 -2425,FOODS_2_027_TX_2 -2426,FOODS_2_027_TX_3 -2427,FOODS_2_027_WI_1 -2428,FOODS_2_027_WI_2 -2429,FOODS_2_027_WI_3 -2430,FOODS_2_028_CA_1 -2431,FOODS_2_028_CA_2 -2432,FOODS_2_028_CA_3 -2433,FOODS_2_028_CA_4 -2434,FOODS_2_028_TX_1 -2435,FOODS_2_028_TX_2 -2436,FOODS_2_028_TX_3 -2437,FOODS_2_028_WI_1 -2438,FOODS_2_028_WI_2 -2439,FOODS_2_028_WI_3 -2440,FOODS_2_029_CA_1 -2441,FOODS_2_029_CA_2 -2442,FOODS_2_029_CA_3 -2443,FOODS_2_029_CA_4 -2444,FOODS_2_029_TX_1 -2445,FOODS_2_029_TX_2 -2446,FOODS_2_029_TX_3 -2447,FOODS_2_029_WI_1 -2448,FOODS_2_029_WI_2 -2449,FOODS_2_029_WI_3 -2450,FOODS_2_030_CA_1 -2451,FOODS_2_030_CA_2 -2452,FOODS_2_030_CA_3 -2453,FOODS_2_030_CA_4 -2454,FOODS_2_030_TX_1 -2455,FOODS_2_030_TX_2 -2456,FOODS_2_030_TX_3 -2457,FOODS_2_030_WI_1 -2458,FOODS_2_030_WI_2 -2459,FOODS_2_030_WI_3 -2460,FOODS_2_031_CA_1 -2461,FOODS_2_031_CA_2 -2462,FOODS_2_031_CA_3 -2463,FOODS_2_031_CA_4 -2464,FOODS_2_031_TX_1 -2465,FOODS_2_031_TX_2 -2466,FOODS_2_031_TX_3 -2467,FOODS_2_031_WI_1 -2468,FOODS_2_031_WI_2 -2469,FOODS_2_031_WI_3 -2470,FOODS_2_032_CA_1 -2471,FOODS_2_032_CA_2 -2472,FOODS_2_032_CA_3 -2473,FOODS_2_032_CA_4 -2474,FOODS_2_032_TX_1 -2475,FOODS_2_032_TX_2 -2476,FOODS_2_032_TX_3 -2477,FOODS_2_032_WI_1 -2478,FOODS_2_032_WI_2 -2479,FOODS_2_032_WI_3 -2480,FOODS_2_033_CA_1 -2481,FOODS_2_033_CA_2 -2482,FOODS_2_033_CA_3 
-2483,FOODS_2_033_CA_4 -2484,FOODS_2_033_TX_1 -2485,FOODS_2_033_TX_2 -2486,FOODS_2_033_TX_3 -2487,FOODS_2_033_WI_1 -2488,FOODS_2_033_WI_2 -2489,FOODS_2_033_WI_3 -2490,FOODS_2_034_CA_1 -2491,FOODS_2_034_CA_2 -2492,FOODS_2_034_CA_3 -2493,FOODS_2_034_CA_4 -2494,FOODS_2_034_TX_1 -2495,FOODS_2_034_TX_2 -2496,FOODS_2_034_TX_3 -2497,FOODS_2_034_WI_1 -2498,FOODS_2_034_WI_2 -2499,FOODS_2_034_WI_3 -2500,FOODS_2_035_CA_1 -2501,FOODS_2_035_CA_2 -2502,FOODS_2_035_CA_3 -2503,FOODS_2_035_CA_4 -2504,FOODS_2_035_TX_1 -2505,FOODS_2_035_TX_2 -2506,FOODS_2_035_TX_3 -2507,FOODS_2_035_WI_1 -2508,FOODS_2_035_WI_2 -2509,FOODS_2_035_WI_3 -2510,FOODS_2_036_CA_1 -2511,FOODS_2_036_CA_2 -2512,FOODS_2_036_CA_3 -2513,FOODS_2_036_CA_4 -2514,FOODS_2_036_TX_1 -2515,FOODS_2_036_TX_2 -2516,FOODS_2_036_TX_3 -2517,FOODS_2_036_WI_1 -2518,FOODS_2_036_WI_2 -2519,FOODS_2_036_WI_3 -2520,FOODS_2_037_CA_1 -2521,FOODS_2_037_CA_2 -2522,FOODS_2_037_CA_3 -2523,FOODS_2_037_CA_4 -2524,FOODS_2_037_TX_1 -2525,FOODS_2_037_TX_2 -2526,FOODS_2_037_TX_3 -2527,FOODS_2_037_WI_1 -2528,FOODS_2_037_WI_2 -2529,FOODS_2_037_WI_3 -2530,FOODS_2_038_CA_1 -2531,FOODS_2_038_CA_2 -2532,FOODS_2_038_CA_3 -2533,FOODS_2_038_CA_4 -2534,FOODS_2_038_TX_1 -2535,FOODS_2_038_TX_2 -2536,FOODS_2_038_TX_3 -2537,FOODS_2_038_WI_1 -2538,FOODS_2_038_WI_2 -2539,FOODS_2_038_WI_3 -2540,FOODS_2_039_CA_1 -2541,FOODS_2_039_CA_2 -2542,FOODS_2_039_CA_3 -2543,FOODS_2_039_CA_4 -2544,FOODS_2_039_TX_1 -2545,FOODS_2_039_TX_2 -2546,FOODS_2_039_TX_3 -2547,FOODS_2_039_WI_1 -2548,FOODS_2_039_WI_2 -2549,FOODS_2_039_WI_3 -2550,FOODS_2_040_CA_1 -2551,FOODS_2_040_CA_2 -2552,FOODS_2_040_CA_3 -2553,FOODS_2_040_CA_4 -2554,FOODS_2_040_TX_1 -2555,FOODS_2_040_TX_2 -2556,FOODS_2_040_TX_3 -2557,FOODS_2_040_WI_1 -2558,FOODS_2_040_WI_2 -2559,FOODS_2_040_WI_3 -2560,FOODS_2_041_CA_1 -2561,FOODS_2_041_CA_2 -2562,FOODS_2_041_CA_3 -2563,FOODS_2_041_CA_4 -2564,FOODS_2_041_TX_1 -2565,FOODS_2_041_TX_2 -2566,FOODS_2_041_TX_3 -2567,FOODS_2_041_WI_1 -2568,FOODS_2_041_WI_2 
-2569,FOODS_2_041_WI_3 -2570,FOODS_2_042_CA_1 -2571,FOODS_2_042_CA_2 -2572,FOODS_2_042_CA_3 -2573,FOODS_2_042_CA_4 -2574,FOODS_2_042_TX_1 -2575,FOODS_2_042_TX_2 -2576,FOODS_2_042_TX_3 -2577,FOODS_2_042_WI_1 -2578,FOODS_2_042_WI_2 -2579,FOODS_2_042_WI_3 -2580,FOODS_2_043_CA_1 -2581,FOODS_2_043_CA_2 -2582,FOODS_2_043_CA_3 -2583,FOODS_2_043_CA_4 -2584,FOODS_2_043_TX_1 -2585,FOODS_2_043_TX_2 -2586,FOODS_2_043_TX_3 -2587,FOODS_2_043_WI_1 -2588,FOODS_2_043_WI_2 -2589,FOODS_2_043_WI_3 -2590,FOODS_2_044_CA_1 -2591,FOODS_2_044_CA_2 -2592,FOODS_2_044_CA_3 -2593,FOODS_2_044_CA_4 -2594,FOODS_2_044_TX_1 -2595,FOODS_2_044_TX_2 -2596,FOODS_2_044_TX_3 -2597,FOODS_2_044_WI_1 -2598,FOODS_2_044_WI_2 -2599,FOODS_2_044_WI_3 -2600,FOODS_2_045_CA_1 -2601,FOODS_2_045_CA_2 -2602,FOODS_2_045_CA_3 -2603,FOODS_2_045_CA_4 -2604,FOODS_2_045_TX_1 -2605,FOODS_2_045_TX_2 -2606,FOODS_2_045_TX_3 -2607,FOODS_2_045_WI_1 -2608,FOODS_2_045_WI_2 -2609,FOODS_2_045_WI_3 -2610,FOODS_2_046_CA_1 -2611,FOODS_2_046_CA_2 -2612,FOODS_2_046_CA_3 -2613,FOODS_2_046_CA_4 -2614,FOODS_2_046_TX_1 -2615,FOODS_2_046_TX_2 -2616,FOODS_2_046_TX_3 -2617,FOODS_2_046_WI_1 -2618,FOODS_2_046_WI_2 -2619,FOODS_2_046_WI_3 -2620,FOODS_2_047_CA_1 -2621,FOODS_2_047_CA_2 -2622,FOODS_2_047_CA_3 -2623,FOODS_2_047_CA_4 -2624,FOODS_2_047_TX_1 -2625,FOODS_2_047_TX_2 -2626,FOODS_2_047_TX_3 -2627,FOODS_2_047_WI_1 -2628,FOODS_2_047_WI_2 -2629,FOODS_2_047_WI_3 -2630,FOODS_2_048_CA_1 -2631,FOODS_2_048_CA_2 -2632,FOODS_2_048_CA_3 -2633,FOODS_2_048_CA_4 -2634,FOODS_2_048_TX_1 -2635,FOODS_2_048_TX_2 -2636,FOODS_2_048_TX_3 -2637,FOODS_2_048_WI_1 -2638,FOODS_2_048_WI_2 -2639,FOODS_2_048_WI_3 -2640,FOODS_2_049_CA_1 -2641,FOODS_2_049_CA_2 -2642,FOODS_2_049_CA_3 -2643,FOODS_2_049_CA_4 -2644,FOODS_2_049_TX_1 -2645,FOODS_2_049_TX_2 -2646,FOODS_2_049_TX_3 -2647,FOODS_2_049_WI_1 -2648,FOODS_2_049_WI_2 -2649,FOODS_2_049_WI_3 -2650,FOODS_2_050_CA_1 -2651,FOODS_2_050_CA_2 -2652,FOODS_2_050_CA_3 -2653,FOODS_2_050_CA_4 -2654,FOODS_2_050_TX_1 
-2655,FOODS_2_050_TX_2 -2656,FOODS_2_050_TX_3 -2657,FOODS_2_050_WI_1 -2658,FOODS_2_050_WI_2 -2659,FOODS_2_050_WI_3 -2660,FOODS_2_051_CA_1 -2661,FOODS_2_051_CA_2 -2662,FOODS_2_051_CA_3 -2663,FOODS_2_051_CA_4 -2664,FOODS_2_051_TX_1 -2665,FOODS_2_051_TX_2 -2666,FOODS_2_051_TX_3 -2667,FOODS_2_051_WI_1 -2668,FOODS_2_051_WI_2 -2669,FOODS_2_051_WI_3 -2670,FOODS_2_052_CA_1 -2671,FOODS_2_052_CA_2 -2672,FOODS_2_052_CA_3 -2673,FOODS_2_052_CA_4 -2674,FOODS_2_052_TX_1 -2675,FOODS_2_052_TX_2 -2676,FOODS_2_052_TX_3 -2677,FOODS_2_052_WI_1 -2678,FOODS_2_052_WI_2 -2679,FOODS_2_052_WI_3 -2680,FOODS_2_053_CA_1 -2681,FOODS_2_053_CA_2 -2682,FOODS_2_053_CA_3 -2683,FOODS_2_053_CA_4 -2684,FOODS_2_053_TX_1 -2685,FOODS_2_053_TX_2 -2686,FOODS_2_053_TX_3 -2687,FOODS_2_053_WI_1 -2688,FOODS_2_053_WI_2 -2689,FOODS_2_053_WI_3 -2690,FOODS_2_054_CA_1 -2691,FOODS_2_054_CA_2 -2692,FOODS_2_054_CA_3 -2693,FOODS_2_054_CA_4 -2694,FOODS_2_054_TX_1 -2695,FOODS_2_054_TX_2 -2696,FOODS_2_054_TX_3 -2697,FOODS_2_054_WI_1 -2698,FOODS_2_054_WI_2 -2699,FOODS_2_054_WI_3 -2700,FOODS_2_055_CA_1 -2701,FOODS_2_055_CA_2 -2702,FOODS_2_055_CA_3 -2703,FOODS_2_055_CA_4 -2704,FOODS_2_055_TX_1 -2705,FOODS_2_055_TX_2 -2706,FOODS_2_055_TX_3 -2707,FOODS_2_055_WI_1 -2708,FOODS_2_055_WI_2 -2709,FOODS_2_055_WI_3 -2710,FOODS_2_056_CA_1 -2711,FOODS_2_056_CA_2 -2712,FOODS_2_056_CA_3 -2713,FOODS_2_056_CA_4 -2714,FOODS_2_056_TX_1 -2715,FOODS_2_056_TX_2 -2716,FOODS_2_056_TX_3 -2717,FOODS_2_056_WI_1 -2718,FOODS_2_056_WI_2 -2719,FOODS_2_056_WI_3 -2720,FOODS_2_057_CA_1 -2721,FOODS_2_057_CA_2 -2722,FOODS_2_057_CA_3 -2723,FOODS_2_057_CA_4 -2724,FOODS_2_057_TX_1 -2725,FOODS_2_057_TX_2 -2726,FOODS_2_057_TX_3 -2727,FOODS_2_057_WI_1 -2728,FOODS_2_057_WI_2 -2729,FOODS_2_057_WI_3 -2730,FOODS_2_058_CA_1 -2731,FOODS_2_058_CA_2 -2732,FOODS_2_058_CA_3 -2733,FOODS_2_058_CA_4 -2734,FOODS_2_058_TX_1 -2735,FOODS_2_058_TX_2 -2736,FOODS_2_058_TX_3 -2737,FOODS_2_058_WI_1 -2738,FOODS_2_058_WI_2 -2739,FOODS_2_058_WI_3 -2740,FOODS_2_059_CA_1 
-2741,FOODS_2_059_CA_2 -2742,FOODS_2_059_CA_3 -2743,FOODS_2_059_CA_4 -2744,FOODS_2_059_TX_1 -2745,FOODS_2_059_TX_2 -2746,FOODS_2_059_TX_3 -2747,FOODS_2_059_WI_1 -2748,FOODS_2_059_WI_2 -2749,FOODS_2_059_WI_3 -2750,FOODS_2_060_CA_1 -2751,FOODS_2_060_CA_2 -2752,FOODS_2_060_CA_3 -2753,FOODS_2_060_CA_4 -2754,FOODS_2_060_TX_1 -2755,FOODS_2_060_TX_2 -2756,FOODS_2_060_TX_3 -2757,FOODS_2_060_WI_1 -2758,FOODS_2_060_WI_2 -2759,FOODS_2_060_WI_3 -2760,FOODS_2_061_CA_1 -2761,FOODS_2_061_CA_2 -2762,FOODS_2_061_CA_3 -2763,FOODS_2_061_CA_4 -2764,FOODS_2_061_TX_1 -2765,FOODS_2_061_TX_2 -2766,FOODS_2_061_TX_3 -2767,FOODS_2_061_WI_1 -2768,FOODS_2_061_WI_2 -2769,FOODS_2_061_WI_3 -2770,FOODS_2_062_CA_1 -2771,FOODS_2_062_CA_2 -2772,FOODS_2_062_CA_3 -2773,FOODS_2_062_CA_4 -2774,FOODS_2_062_TX_1 -2775,FOODS_2_062_TX_2 -2776,FOODS_2_062_TX_3 -2777,FOODS_2_062_WI_1 -2778,FOODS_2_062_WI_2 -2779,FOODS_2_062_WI_3 -2780,FOODS_2_063_CA_1 -2781,FOODS_2_063_CA_2 -2782,FOODS_2_063_CA_3 -2783,FOODS_2_063_CA_4 -2784,FOODS_2_063_TX_1 -2785,FOODS_2_063_TX_2 -2786,FOODS_2_063_TX_3 -2787,FOODS_2_063_WI_1 -2788,FOODS_2_063_WI_2 -2789,FOODS_2_063_WI_3 -2790,FOODS_2_064_CA_1 -2791,FOODS_2_064_CA_2 -2792,FOODS_2_064_CA_3 -2793,FOODS_2_064_CA_4 -2794,FOODS_2_064_TX_1 -2795,FOODS_2_064_TX_2 -2796,FOODS_2_064_TX_3 -2797,FOODS_2_064_WI_1 -2798,FOODS_2_064_WI_2 -2799,FOODS_2_064_WI_3 -2800,FOODS_2_065_CA_1 -2801,FOODS_2_065_CA_2 -2802,FOODS_2_065_CA_3 -2803,FOODS_2_065_CA_4 -2804,FOODS_2_065_TX_1 -2805,FOODS_2_065_TX_2 -2806,FOODS_2_065_TX_3 -2807,FOODS_2_065_WI_1 -2808,FOODS_2_065_WI_2 -2809,FOODS_2_065_WI_3 -2810,FOODS_2_066_CA_1 -2811,FOODS_2_066_CA_2 -2812,FOODS_2_066_CA_3 -2813,FOODS_2_066_CA_4 -2814,FOODS_2_066_TX_1 -2815,FOODS_2_066_TX_2 -2816,FOODS_2_066_TX_3 -2817,FOODS_2_066_WI_1 -2818,FOODS_2_066_WI_2 -2819,FOODS_2_066_WI_3 -2820,FOODS_2_067_CA_1 -2821,FOODS_2_067_CA_2 -2822,FOODS_2_067_CA_3 -2823,FOODS_2_067_CA_4 -2824,FOODS_2_067_TX_1 -2825,FOODS_2_067_TX_2 -2826,FOODS_2_067_TX_3 
-2827,FOODS_2_067_WI_1 -2828,FOODS_2_067_WI_2 -2829,FOODS_2_067_WI_3 -2830,FOODS_2_068_CA_1 -2831,FOODS_2_068_CA_2 -2832,FOODS_2_068_CA_3 -2833,FOODS_2_068_CA_4 -2834,FOODS_2_068_TX_1 -2835,FOODS_2_068_TX_2 -2836,FOODS_2_068_TX_3 -2837,FOODS_2_068_WI_1 -2838,FOODS_2_068_WI_2 -2839,FOODS_2_068_WI_3 -2840,FOODS_2_069_CA_1 -2841,FOODS_2_069_CA_2 -2842,FOODS_2_069_CA_3 -2843,FOODS_2_069_CA_4 -2844,FOODS_2_069_TX_1 -2845,FOODS_2_069_TX_2 -2846,FOODS_2_069_TX_3 -2847,FOODS_2_069_WI_1 -2848,FOODS_2_069_WI_2 -2849,FOODS_2_069_WI_3 -2850,FOODS_2_070_CA_1 -2851,FOODS_2_070_CA_2 -2852,FOODS_2_070_CA_3 -2853,FOODS_2_070_CA_4 -2854,FOODS_2_070_TX_1 -2855,FOODS_2_070_TX_2 -2856,FOODS_2_070_TX_3 -2857,FOODS_2_070_WI_1 -2858,FOODS_2_070_WI_2 -2859,FOODS_2_070_WI_3 -2860,FOODS_2_071_CA_1 -2861,FOODS_2_071_CA_2 -2862,FOODS_2_071_CA_3 -2863,FOODS_2_071_CA_4 -2864,FOODS_2_071_TX_1 -2865,FOODS_2_071_TX_2 -2866,FOODS_2_071_TX_3 -2867,FOODS_2_071_WI_1 -2868,FOODS_2_071_WI_2 -2869,FOODS_2_071_WI_3 -2870,FOODS_2_072_CA_1 -2871,FOODS_2_072_CA_2 -2872,FOODS_2_072_CA_3 -2873,FOODS_2_072_CA_4 -2874,FOODS_2_072_TX_1 -2875,FOODS_2_072_TX_2 -2876,FOODS_2_072_TX_3 -2877,FOODS_2_072_WI_1 -2878,FOODS_2_072_WI_2 -2879,FOODS_2_072_WI_3 -2880,FOODS_2_073_CA_1 -2881,FOODS_2_073_CA_2 -2882,FOODS_2_073_CA_3 -2883,FOODS_2_073_CA_4 -2884,FOODS_2_073_TX_1 -2885,FOODS_2_073_TX_2 -2886,FOODS_2_073_TX_3 -2887,FOODS_2_073_WI_1 -2888,FOODS_2_073_WI_2 -2889,FOODS_2_073_WI_3 -2890,FOODS_2_074_CA_1 -2891,FOODS_2_074_CA_2 -2892,FOODS_2_074_CA_3 -2893,FOODS_2_074_CA_4 -2894,FOODS_2_074_TX_1 -2895,FOODS_2_074_TX_2 -2896,FOODS_2_074_TX_3 -2897,FOODS_2_074_WI_1 -2898,FOODS_2_074_WI_2 -2899,FOODS_2_074_WI_3 -2900,FOODS_2_075_CA_1 -2901,FOODS_2_075_CA_2 -2902,FOODS_2_075_CA_3 -2903,FOODS_2_075_CA_4 -2904,FOODS_2_075_TX_1 -2905,FOODS_2_075_TX_2 -2906,FOODS_2_075_TX_3 -2907,FOODS_2_075_WI_1 -2908,FOODS_2_075_WI_2 -2909,FOODS_2_075_WI_3 -2910,FOODS_2_076_CA_1 -2911,FOODS_2_076_CA_2 -2912,FOODS_2_076_CA_3 
-2913,FOODS_2_076_CA_4 -2914,FOODS_2_076_TX_1 -2915,FOODS_2_076_TX_2 -2916,FOODS_2_076_TX_3 -2917,FOODS_2_076_WI_1 -2918,FOODS_2_076_WI_2 -2919,FOODS_2_076_WI_3 -2920,FOODS_2_077_CA_1 -2921,FOODS_2_077_CA_2 -2922,FOODS_2_077_CA_3 -2923,FOODS_2_077_CA_4 -2924,FOODS_2_077_TX_1 -2925,FOODS_2_077_TX_2 -2926,FOODS_2_077_TX_3 -2927,FOODS_2_077_WI_1 -2928,FOODS_2_077_WI_2 -2929,FOODS_2_077_WI_3 -2930,FOODS_2_078_CA_1 -2931,FOODS_2_078_CA_2 -2932,FOODS_2_078_CA_3 -2933,FOODS_2_078_CA_4 -2934,FOODS_2_078_TX_1 -2935,FOODS_2_078_TX_2 -2936,FOODS_2_078_TX_3 -2937,FOODS_2_078_WI_1 -2938,FOODS_2_078_WI_2 -2939,FOODS_2_078_WI_3 -2940,FOODS_2_079_CA_1 -2941,FOODS_2_079_CA_2 -2942,FOODS_2_079_CA_3 -2943,FOODS_2_079_CA_4 -2944,FOODS_2_079_TX_1 -2945,FOODS_2_079_TX_2 -2946,FOODS_2_079_TX_3 -2947,FOODS_2_079_WI_1 -2948,FOODS_2_079_WI_2 -2949,FOODS_2_079_WI_3 -2950,FOODS_2_080_CA_1 -2951,FOODS_2_080_CA_2 -2952,FOODS_2_080_CA_3 -2953,FOODS_2_080_CA_4 -2954,FOODS_2_080_TX_1 -2955,FOODS_2_080_TX_2 -2956,FOODS_2_080_TX_3 -2957,FOODS_2_080_WI_1 -2958,FOODS_2_080_WI_2 -2959,FOODS_2_080_WI_3 -2960,FOODS_2_081_CA_1 -2961,FOODS_2_081_CA_2 -2962,FOODS_2_081_CA_3 -2963,FOODS_2_081_CA_4 -2964,FOODS_2_081_TX_1 -2965,FOODS_2_081_TX_2 -2966,FOODS_2_081_TX_3 -2967,FOODS_2_081_WI_1 -2968,FOODS_2_081_WI_2 -2969,FOODS_2_081_WI_3 -2970,FOODS_2_082_CA_1 -2971,FOODS_2_082_CA_2 -2972,FOODS_2_082_CA_3 -2973,FOODS_2_082_CA_4 -2974,FOODS_2_082_TX_1 -2975,FOODS_2_082_TX_2 -2976,FOODS_2_082_TX_3 -2977,FOODS_2_082_WI_1 -2978,FOODS_2_082_WI_2 -2979,FOODS_2_082_WI_3 -2980,FOODS_2_083_CA_1 -2981,FOODS_2_083_CA_2 -2982,FOODS_2_083_CA_3 -2983,FOODS_2_083_CA_4 -2984,FOODS_2_083_TX_1 -2985,FOODS_2_083_TX_2 -2986,FOODS_2_083_TX_3 -2987,FOODS_2_083_WI_1 -2988,FOODS_2_083_WI_2 -2989,FOODS_2_083_WI_3 -2990,FOODS_2_084_CA_1 -2991,FOODS_2_084_CA_2 -2992,FOODS_2_084_CA_3 -2993,FOODS_2_084_CA_4 -2994,FOODS_2_084_TX_1 -2995,FOODS_2_084_TX_2 -2996,FOODS_2_084_TX_3 -2997,FOODS_2_084_WI_1 -2998,FOODS_2_084_WI_2 
-2999,FOODS_2_084_WI_3 -3000,FOODS_2_085_CA_1 -3001,FOODS_2_085_CA_2 -3002,FOODS_2_085_CA_3 -3003,FOODS_2_085_CA_4 -3004,FOODS_2_085_TX_1 -3005,FOODS_2_085_TX_2 -3006,FOODS_2_085_TX_3 -3007,FOODS_2_085_WI_1 -3008,FOODS_2_085_WI_2 -3009,FOODS_2_085_WI_3 -3010,FOODS_2_086_CA_1 -3011,FOODS_2_086_CA_2 -3012,FOODS_2_086_CA_3 -3013,FOODS_2_086_CA_4 -3014,FOODS_2_086_TX_1 -3015,FOODS_2_086_TX_2 -3016,FOODS_2_086_TX_3 -3017,FOODS_2_086_WI_1 -3018,FOODS_2_086_WI_2 -3019,FOODS_2_086_WI_3 -3020,FOODS_2_087_CA_1 -3021,FOODS_2_087_CA_2 -3022,FOODS_2_087_CA_3 -3023,FOODS_2_087_CA_4 -3024,FOODS_2_087_TX_1 -3025,FOODS_2_087_TX_2 -3026,FOODS_2_087_TX_3 -3027,FOODS_2_087_WI_1 -3028,FOODS_2_087_WI_2 -3029,FOODS_2_087_WI_3 -3030,FOODS_2_088_CA_1 -3031,FOODS_2_088_CA_2 -3032,FOODS_2_088_CA_3 -3033,FOODS_2_088_CA_4 -3034,FOODS_2_088_TX_1 -3035,FOODS_2_088_TX_2 -3036,FOODS_2_088_TX_3 -3037,FOODS_2_088_WI_1 -3038,FOODS_2_088_WI_2 -3039,FOODS_2_088_WI_3 -3040,FOODS_2_089_CA_1 -3041,FOODS_2_089_CA_2 -3042,FOODS_2_089_CA_3 -3043,FOODS_2_089_CA_4 -3044,FOODS_2_089_TX_1 -3045,FOODS_2_089_TX_2 -3046,FOODS_2_089_TX_3 -3047,FOODS_2_089_WI_1 -3048,FOODS_2_089_WI_2 -3049,FOODS_2_089_WI_3 -3050,FOODS_2_090_CA_1 -3051,FOODS_2_090_CA_2 -3052,FOODS_2_090_CA_3 -3053,FOODS_2_090_CA_4 -3054,FOODS_2_090_TX_1 -3055,FOODS_2_090_TX_2 -3056,FOODS_2_090_TX_3 -3057,FOODS_2_090_WI_1 -3058,FOODS_2_090_WI_2 -3059,FOODS_2_090_WI_3 -3060,FOODS_2_091_CA_1 -3061,FOODS_2_091_CA_2 -3062,FOODS_2_091_CA_3 -3063,FOODS_2_091_CA_4 -3064,FOODS_2_091_TX_1 -3065,FOODS_2_091_TX_2 -3066,FOODS_2_091_TX_3 -3067,FOODS_2_091_WI_1 -3068,FOODS_2_091_WI_2 -3069,FOODS_2_091_WI_3 -3070,FOODS_2_092_CA_1 -3071,FOODS_2_092_CA_2 -3072,FOODS_2_092_CA_3 -3073,FOODS_2_092_CA_4 -3074,FOODS_2_092_TX_1 -3075,FOODS_2_092_TX_2 -3076,FOODS_2_092_TX_3 -3077,FOODS_2_092_WI_1 -3078,FOODS_2_092_WI_2 -3079,FOODS_2_092_WI_3 -3080,FOODS_2_093_CA_1 -3081,FOODS_2_093_CA_2 -3082,FOODS_2_093_CA_3 -3083,FOODS_2_093_CA_4 -3084,FOODS_2_093_TX_1 
-3085,FOODS_2_093_TX_2 -3086,FOODS_2_093_TX_3 -3087,FOODS_2_093_WI_1 -3088,FOODS_2_093_WI_2 -3089,FOODS_2_093_WI_3 -3090,FOODS_2_094_CA_1 -3091,FOODS_2_094_CA_2 -3092,FOODS_2_094_CA_3 -3093,FOODS_2_094_CA_4 -3094,FOODS_2_094_TX_1 -3095,FOODS_2_094_TX_2 -3096,FOODS_2_094_TX_3 -3097,FOODS_2_094_WI_1 -3098,FOODS_2_094_WI_2 -3099,FOODS_2_094_WI_3 -3100,FOODS_2_095_CA_1 -3101,FOODS_2_095_CA_2 -3102,FOODS_2_095_CA_3 -3103,FOODS_2_095_CA_4 -3104,FOODS_2_095_TX_1 -3105,FOODS_2_095_TX_2 -3106,FOODS_2_095_TX_3 -3107,FOODS_2_095_WI_1 -3108,FOODS_2_095_WI_2 -3109,FOODS_2_095_WI_3 -3110,FOODS_2_096_CA_1 -3111,FOODS_2_096_CA_2 -3112,FOODS_2_096_CA_3 -3113,FOODS_2_096_CA_4 -3114,FOODS_2_096_TX_1 -3115,FOODS_2_096_TX_2 -3116,FOODS_2_096_TX_3 -3117,FOODS_2_096_WI_1 -3118,FOODS_2_096_WI_2 -3119,FOODS_2_096_WI_3 -3120,FOODS_2_097_CA_1 -3121,FOODS_2_097_CA_2 -3122,FOODS_2_097_CA_3 -3123,FOODS_2_097_CA_4 -3124,FOODS_2_097_TX_1 -3125,FOODS_2_097_TX_2 -3126,FOODS_2_097_TX_3 -3127,FOODS_2_097_WI_1 -3128,FOODS_2_097_WI_2 -3129,FOODS_2_097_WI_3 -3130,FOODS_2_099_CA_1 -3131,FOODS_2_099_CA_2 -3132,FOODS_2_099_CA_3 -3133,FOODS_2_099_CA_4 -3134,FOODS_2_099_TX_1 -3135,FOODS_2_099_TX_2 -3136,FOODS_2_099_TX_3 -3137,FOODS_2_099_WI_1 -3138,FOODS_2_099_WI_2 -3139,FOODS_2_099_WI_3 -3140,FOODS_2_100_CA_1 -3141,FOODS_2_100_CA_2 -3142,FOODS_2_100_CA_3 -3143,FOODS_2_100_CA_4 -3144,FOODS_2_100_TX_1 -3145,FOODS_2_100_TX_2 -3146,FOODS_2_100_TX_3 -3147,FOODS_2_100_WI_1 -3148,FOODS_2_100_WI_2 -3149,FOODS_2_100_WI_3 -3150,FOODS_2_101_CA_1 -3151,FOODS_2_101_CA_2 -3152,FOODS_2_101_CA_3 -3153,FOODS_2_101_CA_4 -3154,FOODS_2_101_TX_1 -3155,FOODS_2_101_TX_2 -3156,FOODS_2_101_TX_3 -3157,FOODS_2_101_WI_1 -3158,FOODS_2_101_WI_2 -3159,FOODS_2_101_WI_3 -3160,FOODS_2_102_CA_1 -3161,FOODS_2_102_CA_2 -3162,FOODS_2_102_CA_3 -3163,FOODS_2_102_CA_4 -3164,FOODS_2_102_TX_1 -3165,FOODS_2_102_TX_2 -3166,FOODS_2_102_TX_3 -3167,FOODS_2_102_WI_1 -3168,FOODS_2_102_WI_2 -3169,FOODS_2_102_WI_3 -3170,FOODS_2_103_CA_1 
-3171,FOODS_2_103_CA_2 -3172,FOODS_2_103_CA_3 -3173,FOODS_2_103_CA_4 -3174,FOODS_2_103_TX_1 -3175,FOODS_2_103_TX_2 -3176,FOODS_2_103_TX_3 -3177,FOODS_2_103_WI_1 -3178,FOODS_2_103_WI_2 -3179,FOODS_2_103_WI_3 -3180,FOODS_2_104_CA_1 -3181,FOODS_2_104_CA_2 -3182,FOODS_2_104_CA_3 -3183,FOODS_2_104_CA_4 -3184,FOODS_2_104_TX_1 -3185,FOODS_2_104_TX_2 -3186,FOODS_2_104_TX_3 -3187,FOODS_2_104_WI_1 -3188,FOODS_2_104_WI_2 -3189,FOODS_2_104_WI_3 -3190,FOODS_2_105_CA_1 -3191,FOODS_2_105_CA_2 -3192,FOODS_2_105_CA_3 -3193,FOODS_2_105_CA_4 -3194,FOODS_2_105_TX_1 -3195,FOODS_2_105_TX_2 -3196,FOODS_2_105_TX_3 -3197,FOODS_2_105_WI_1 -3198,FOODS_2_105_WI_2 -3199,FOODS_2_105_WI_3 -3200,FOODS_2_106_CA_1 -3201,FOODS_2_106_CA_2 -3202,FOODS_2_106_CA_3 -3203,FOODS_2_106_CA_4 -3204,FOODS_2_106_TX_1 -3205,FOODS_2_106_TX_2 -3206,FOODS_2_106_TX_3 -3207,FOODS_2_106_WI_1 -3208,FOODS_2_106_WI_2 -3209,FOODS_2_106_WI_3 -3210,FOODS_2_107_CA_1 -3211,FOODS_2_107_CA_2 -3212,FOODS_2_107_CA_3 -3213,FOODS_2_107_CA_4 -3214,FOODS_2_107_TX_1 -3215,FOODS_2_107_TX_2 -3216,FOODS_2_107_TX_3 -3217,FOODS_2_107_WI_1 -3218,FOODS_2_107_WI_2 -3219,FOODS_2_107_WI_3 -3220,FOODS_2_108_CA_1 -3221,FOODS_2_108_CA_2 -3222,FOODS_2_108_CA_3 -3223,FOODS_2_108_CA_4 -3224,FOODS_2_108_TX_1 -3225,FOODS_2_108_TX_2 -3226,FOODS_2_108_TX_3 -3227,FOODS_2_108_WI_1 -3228,FOODS_2_108_WI_2 -3229,FOODS_2_108_WI_3 -3230,FOODS_2_109_CA_1 -3231,FOODS_2_109_CA_2 -3232,FOODS_2_109_CA_3 -3233,FOODS_2_109_CA_4 -3234,FOODS_2_109_TX_1 -3235,FOODS_2_109_TX_2 -3236,FOODS_2_109_TX_3 -3237,FOODS_2_109_WI_1 -3238,FOODS_2_109_WI_2 -3239,FOODS_2_109_WI_3 -3240,FOODS_2_110_CA_1 -3241,FOODS_2_110_CA_2 -3242,FOODS_2_110_CA_3 -3243,FOODS_2_110_CA_4 -3244,FOODS_2_110_TX_1 -3245,FOODS_2_110_TX_2 -3246,FOODS_2_110_TX_3 -3247,FOODS_2_110_WI_1 -3248,FOODS_2_110_WI_2 -3249,FOODS_2_110_WI_3 -3250,FOODS_2_111_CA_1 -3251,FOODS_2_111_CA_2 -3252,FOODS_2_111_CA_3 -3253,FOODS_2_111_CA_4 -3254,FOODS_2_111_TX_1 -3255,FOODS_2_111_TX_2 -3256,FOODS_2_111_TX_3 
-3257,FOODS_2_111_WI_1 -3258,FOODS_2_111_WI_2 -3259,FOODS_2_111_WI_3 -3260,FOODS_2_112_CA_1 -3261,FOODS_2_112_CA_2 -3262,FOODS_2_112_CA_3 -3263,FOODS_2_112_CA_4 -3264,FOODS_2_112_TX_1 -3265,FOODS_2_112_TX_2 -3266,FOODS_2_112_TX_3 -3267,FOODS_2_112_WI_1 -3268,FOODS_2_112_WI_2 -3269,FOODS_2_112_WI_3 -3270,FOODS_2_113_CA_1 -3271,FOODS_2_113_CA_2 -3272,FOODS_2_113_CA_3 -3273,FOODS_2_113_CA_4 -3274,FOODS_2_113_TX_1 -3275,FOODS_2_113_TX_2 -3276,FOODS_2_113_TX_3 -3277,FOODS_2_113_WI_1 -3278,FOODS_2_113_WI_2 -3279,FOODS_2_113_WI_3 -3280,FOODS_2_114_CA_1 -3281,FOODS_2_114_CA_2 -3282,FOODS_2_114_CA_3 -3283,FOODS_2_114_CA_4 -3284,FOODS_2_114_TX_1 -3285,FOODS_2_114_TX_2 -3286,FOODS_2_114_TX_3 -3287,FOODS_2_114_WI_1 -3288,FOODS_2_114_WI_2 -3289,FOODS_2_114_WI_3 -3290,FOODS_2_115_CA_1 -3291,FOODS_2_115_CA_2 -3292,FOODS_2_115_CA_3 -3293,FOODS_2_115_CA_4 -3294,FOODS_2_115_TX_1 -3295,FOODS_2_115_TX_2 -3296,FOODS_2_115_TX_3 -3297,FOODS_2_115_WI_1 -3298,FOODS_2_115_WI_2 -3299,FOODS_2_115_WI_3 -3300,FOODS_2_116_CA_1 -3301,FOODS_2_116_CA_2 -3302,FOODS_2_116_CA_3 -3303,FOODS_2_116_CA_4 -3304,FOODS_2_116_TX_1 -3305,FOODS_2_116_TX_2 -3306,FOODS_2_116_TX_3 -3307,FOODS_2_116_WI_1 -3308,FOODS_2_116_WI_2 -3309,FOODS_2_116_WI_3 -3310,FOODS_2_117_CA_1 -3311,FOODS_2_117_CA_2 -3312,FOODS_2_117_CA_3 -3313,FOODS_2_117_CA_4 -3314,FOODS_2_117_TX_1 -3315,FOODS_2_117_TX_2 -3316,FOODS_2_117_TX_3 -3317,FOODS_2_117_WI_1 -3318,FOODS_2_117_WI_2 -3319,FOODS_2_117_WI_3 -3320,FOODS_2_118_CA_1 -3321,FOODS_2_118_CA_2 -3322,FOODS_2_118_CA_3 -3323,FOODS_2_118_CA_4 -3324,FOODS_2_118_TX_1 -3325,FOODS_2_118_TX_2 -3326,FOODS_2_118_TX_3 -3327,FOODS_2_118_WI_1 -3328,FOODS_2_118_WI_2 -3329,FOODS_2_118_WI_3 -3330,FOODS_2_119_CA_1 -3331,FOODS_2_119_CA_2 -3332,FOODS_2_119_CA_3 -3333,FOODS_2_119_CA_4 -3334,FOODS_2_119_TX_1 -3335,FOODS_2_119_TX_2 -3336,FOODS_2_119_TX_3 -3337,FOODS_2_119_WI_1 -3338,FOODS_2_119_WI_2 -3339,FOODS_2_119_WI_3 -3340,FOODS_2_120_CA_1 -3341,FOODS_2_120_CA_2 -3342,FOODS_2_120_CA_3 
-3343,FOODS_2_120_CA_4 -3344,FOODS_2_120_TX_1 -3345,FOODS_2_120_TX_2 -3346,FOODS_2_120_TX_3 -3347,FOODS_2_120_WI_1 -3348,FOODS_2_120_WI_2 -3349,FOODS_2_120_WI_3 -3350,FOODS_2_121_CA_1 -3351,FOODS_2_121_CA_2 -3352,FOODS_2_121_CA_3 -3353,FOODS_2_121_CA_4 -3354,FOODS_2_121_TX_1 -3355,FOODS_2_121_TX_2 -3356,FOODS_2_121_TX_3 -3357,FOODS_2_121_WI_1 -3358,FOODS_2_121_WI_2 -3359,FOODS_2_121_WI_3 -3360,FOODS_2_122_CA_1 -3361,FOODS_2_122_CA_2 -3362,FOODS_2_122_CA_3 -3363,FOODS_2_122_CA_4 -3364,FOODS_2_122_TX_1 -3365,FOODS_2_122_TX_2 -3366,FOODS_2_122_TX_3 -3367,FOODS_2_122_WI_1 -3368,FOODS_2_122_WI_2 -3369,FOODS_2_122_WI_3 -3370,FOODS_2_123_CA_1 -3371,FOODS_2_123_CA_2 -3372,FOODS_2_123_CA_3 -3373,FOODS_2_123_CA_4 -3374,FOODS_2_123_TX_1 -3375,FOODS_2_123_TX_2 -3376,FOODS_2_123_TX_3 -3377,FOODS_2_123_WI_1 -3378,FOODS_2_123_WI_2 -3379,FOODS_2_123_WI_3 -3380,FOODS_2_124_CA_1 -3381,FOODS_2_124_CA_2 -3382,FOODS_2_124_CA_3 -3383,FOODS_2_124_CA_4 -3384,FOODS_2_124_TX_1 -3385,FOODS_2_124_TX_2 -3386,FOODS_2_124_TX_3 -3387,FOODS_2_124_WI_1 -3388,FOODS_2_124_WI_2 -3389,FOODS_2_124_WI_3 -3390,FOODS_2_125_CA_1 -3391,FOODS_2_125_CA_2 -3392,FOODS_2_125_CA_3 -3393,FOODS_2_125_CA_4 -3394,FOODS_2_125_TX_1 -3395,FOODS_2_125_TX_2 -3396,FOODS_2_125_TX_3 -3397,FOODS_2_125_WI_1 -3398,FOODS_2_125_WI_2 -3399,FOODS_2_125_WI_3 -3400,FOODS_2_126_CA_1 -3401,FOODS_2_126_CA_2 -3402,FOODS_2_126_CA_3 -3403,FOODS_2_126_CA_4 -3404,FOODS_2_126_TX_1 -3405,FOODS_2_126_TX_2 -3406,FOODS_2_126_TX_3 -3407,FOODS_2_126_WI_1 -3408,FOODS_2_126_WI_2 -3409,FOODS_2_126_WI_3 -3410,FOODS_2_127_CA_1 -3411,FOODS_2_127_CA_2 -3412,FOODS_2_127_CA_3 -3413,FOODS_2_127_CA_4 -3414,FOODS_2_127_TX_1 -3415,FOODS_2_127_TX_2 -3416,FOODS_2_127_TX_3 -3417,FOODS_2_127_WI_1 -3418,FOODS_2_127_WI_2 -3419,FOODS_2_127_WI_3 -3420,FOODS_2_128_CA_1 -3421,FOODS_2_128_CA_2 -3422,FOODS_2_128_CA_3 -3423,FOODS_2_128_CA_4 -3424,FOODS_2_128_TX_1 -3425,FOODS_2_128_TX_2 -3426,FOODS_2_128_TX_3 -3427,FOODS_2_128_WI_1 -3428,FOODS_2_128_WI_2 
-3429,FOODS_2_128_WI_3 -3430,FOODS_2_129_CA_1 -3431,FOODS_2_129_CA_2 -3432,FOODS_2_129_CA_3 -3433,FOODS_2_129_CA_4 -3434,FOODS_2_129_TX_1 -3435,FOODS_2_129_TX_2 -3436,FOODS_2_129_TX_3 -3437,FOODS_2_129_WI_1 -3438,FOODS_2_129_WI_2 -3439,FOODS_2_129_WI_3 -3440,FOODS_2_130_CA_1 -3441,FOODS_2_130_CA_2 -3442,FOODS_2_130_CA_3 -3443,FOODS_2_130_CA_4 -3444,FOODS_2_130_TX_1 -3445,FOODS_2_130_TX_2 -3446,FOODS_2_130_TX_3 -3447,FOODS_2_130_WI_1 -3448,FOODS_2_130_WI_2 -3449,FOODS_2_130_WI_3 -3450,FOODS_2_131_CA_1 -3451,FOODS_2_131_CA_2 -3452,FOODS_2_131_CA_3 -3453,FOODS_2_131_CA_4 -3454,FOODS_2_131_TX_1 -3455,FOODS_2_131_TX_2 -3456,FOODS_2_131_TX_3 -3457,FOODS_2_131_WI_1 -3458,FOODS_2_131_WI_2 -3459,FOODS_2_131_WI_3 -3460,FOODS_2_132_CA_1 -3461,FOODS_2_132_CA_2 -3462,FOODS_2_132_CA_3 -3463,FOODS_2_132_CA_4 -3464,FOODS_2_132_TX_1 -3465,FOODS_2_132_TX_2 -3466,FOODS_2_132_TX_3 -3467,FOODS_2_132_WI_1 -3468,FOODS_2_132_WI_2 -3469,FOODS_2_132_WI_3 -3470,FOODS_2_133_CA_1 -3471,FOODS_2_133_CA_2 -3472,FOODS_2_133_CA_3 -3473,FOODS_2_133_CA_4 -3474,FOODS_2_133_TX_1 -3475,FOODS_2_133_TX_2 -3476,FOODS_2_133_TX_3 -3477,FOODS_2_133_WI_1 -3478,FOODS_2_133_WI_2 -3479,FOODS_2_133_WI_3 -3480,FOODS_2_134_CA_1 -3481,FOODS_2_134_CA_2 -3482,FOODS_2_134_CA_3 -3483,FOODS_2_134_CA_4 -3484,FOODS_2_134_TX_1 -3485,FOODS_2_134_TX_2 -3486,FOODS_2_134_TX_3 -3487,FOODS_2_134_WI_1 -3488,FOODS_2_134_WI_2 -3489,FOODS_2_134_WI_3 -3490,FOODS_2_135_CA_1 -3491,FOODS_2_135_CA_2 -3492,FOODS_2_135_CA_3 -3493,FOODS_2_135_CA_4 -3494,FOODS_2_135_TX_1 -3495,FOODS_2_135_TX_2 -3496,FOODS_2_135_TX_3 -3497,FOODS_2_135_WI_1 -3498,FOODS_2_135_WI_2 -3499,FOODS_2_135_WI_3 -3500,FOODS_2_136_CA_1 -3501,FOODS_2_136_CA_2 -3502,FOODS_2_136_CA_3 -3503,FOODS_2_136_CA_4 -3504,FOODS_2_136_TX_1 -3505,FOODS_2_136_TX_2 -3506,FOODS_2_136_TX_3 -3507,FOODS_2_136_WI_1 -3508,FOODS_2_136_WI_2 -3509,FOODS_2_136_WI_3 -3510,FOODS_2_137_CA_1 -3511,FOODS_2_137_CA_2 -3512,FOODS_2_137_CA_3 -3513,FOODS_2_137_CA_4 -3514,FOODS_2_137_TX_1 
-3515,FOODS_2_137_TX_2 -3516,FOODS_2_137_TX_3 -3517,FOODS_2_137_WI_1 -3518,FOODS_2_137_WI_2 -3519,FOODS_2_137_WI_3 -3520,FOODS_2_138_CA_1 -3521,FOODS_2_138_CA_2 -3522,FOODS_2_138_CA_3 -3523,FOODS_2_138_CA_4 -3524,FOODS_2_138_TX_1 -3525,FOODS_2_138_TX_2 -3526,FOODS_2_138_TX_3 -3527,FOODS_2_138_WI_1 -3528,FOODS_2_138_WI_2 -3529,FOODS_2_138_WI_3 -3530,FOODS_2_139_CA_1 -3531,FOODS_2_139_CA_2 -3532,FOODS_2_139_CA_3 -3533,FOODS_2_139_CA_4 -3534,FOODS_2_139_TX_1 -3535,FOODS_2_139_TX_2 -3536,FOODS_2_139_TX_3 -3537,FOODS_2_139_WI_1 -3538,FOODS_2_139_WI_2 -3539,FOODS_2_139_WI_3 -3540,FOODS_2_140_CA_1 -3541,FOODS_2_140_CA_2 -3542,FOODS_2_140_CA_3 -3543,FOODS_2_140_CA_4 -3544,FOODS_2_140_TX_1 -3545,FOODS_2_140_TX_2 -3546,FOODS_2_140_TX_3 -3547,FOODS_2_140_WI_1 -3548,FOODS_2_140_WI_2 -3549,FOODS_2_140_WI_3 -3550,FOODS_2_141_CA_1 -3551,FOODS_2_141_CA_2 -3552,FOODS_2_141_CA_3 -3553,FOODS_2_141_CA_4 -3554,FOODS_2_141_TX_1 -3555,FOODS_2_141_TX_2 -3556,FOODS_2_141_TX_3 -3557,FOODS_2_141_WI_1 -3558,FOODS_2_141_WI_2 -3559,FOODS_2_141_WI_3 -3560,FOODS_2_142_CA_1 -3561,FOODS_2_142_CA_2 -3562,FOODS_2_142_CA_3 -3563,FOODS_2_142_CA_4 -3564,FOODS_2_142_TX_1 -3565,FOODS_2_142_TX_2 -3566,FOODS_2_142_TX_3 -3567,FOODS_2_142_WI_1 -3568,FOODS_2_142_WI_2 -3569,FOODS_2_142_WI_3 -3570,FOODS_2_143_CA_1 -3571,FOODS_2_143_CA_2 -3572,FOODS_2_143_CA_3 -3573,FOODS_2_143_CA_4 -3574,FOODS_2_143_TX_1 -3575,FOODS_2_143_TX_2 -3576,FOODS_2_143_TX_3 -3577,FOODS_2_143_WI_1 -3578,FOODS_2_143_WI_2 -3579,FOODS_2_143_WI_3 -3580,FOODS_2_144_CA_1 -3581,FOODS_2_144_CA_2 -3582,FOODS_2_144_CA_3 -3583,FOODS_2_144_CA_4 -3584,FOODS_2_144_TX_1 -3585,FOODS_2_144_TX_2 -3586,FOODS_2_144_TX_3 -3587,FOODS_2_144_WI_1 -3588,FOODS_2_144_WI_2 -3589,FOODS_2_144_WI_3 -3590,FOODS_2_145_CA_1 -3591,FOODS_2_145_CA_2 -3592,FOODS_2_145_CA_3 -3593,FOODS_2_145_CA_4 -3594,FOODS_2_145_TX_1 -3595,FOODS_2_145_TX_2 -3596,FOODS_2_145_TX_3 -3597,FOODS_2_145_WI_1 -3598,FOODS_2_145_WI_2 -3599,FOODS_2_145_WI_3 -3600,FOODS_2_146_CA_1 
-3601,FOODS_2_146_CA_2 -3602,FOODS_2_146_CA_3 -3603,FOODS_2_146_CA_4 -3604,FOODS_2_146_TX_1 -3605,FOODS_2_146_TX_2 -3606,FOODS_2_146_TX_3 -3607,FOODS_2_146_WI_1 -3608,FOODS_2_146_WI_2 -3609,FOODS_2_146_WI_3 -3610,FOODS_2_147_CA_1 -3611,FOODS_2_147_CA_2 -3612,FOODS_2_147_CA_3 -3613,FOODS_2_147_CA_4 -3614,FOODS_2_147_TX_1 -3615,FOODS_2_147_TX_2 -3616,FOODS_2_147_TX_3 -3617,FOODS_2_147_WI_1 -3618,FOODS_2_147_WI_2 -3619,FOODS_2_147_WI_3 -3620,FOODS_2_148_CA_1 -3621,FOODS_2_148_CA_2 -3622,FOODS_2_148_CA_3 -3623,FOODS_2_148_CA_4 -3624,FOODS_2_148_TX_1 -3625,FOODS_2_148_TX_2 -3626,FOODS_2_148_TX_3 -3627,FOODS_2_148_WI_1 -3628,FOODS_2_148_WI_2 -3629,FOODS_2_148_WI_3 -3630,FOODS_2_149_CA_1 -3631,FOODS_2_149_CA_2 -3632,FOODS_2_149_CA_3 -3633,FOODS_2_149_CA_4 -3634,FOODS_2_149_TX_1 -3635,FOODS_2_149_TX_2 -3636,FOODS_2_149_TX_3 -3637,FOODS_2_149_WI_1 -3638,FOODS_2_149_WI_2 -3639,FOODS_2_149_WI_3 -3640,FOODS_2_150_CA_1 -3641,FOODS_2_150_CA_2 -3642,FOODS_2_150_CA_3 -3643,FOODS_2_150_CA_4 -3644,FOODS_2_150_TX_1 -3645,FOODS_2_150_TX_2 -3646,FOODS_2_150_TX_3 -3647,FOODS_2_150_WI_1 -3648,FOODS_2_150_WI_2 -3649,FOODS_2_150_WI_3 -3650,FOODS_2_151_CA_1 -3651,FOODS_2_151_CA_2 -3652,FOODS_2_151_CA_3 -3653,FOODS_2_151_CA_4 -3654,FOODS_2_151_TX_1 -3655,FOODS_2_151_TX_2 -3656,FOODS_2_151_TX_3 -3657,FOODS_2_151_WI_1 -3658,FOODS_2_151_WI_2 -3659,FOODS_2_151_WI_3 -3660,FOODS_2_152_CA_1 -3661,FOODS_2_152_CA_2 -3662,FOODS_2_152_CA_3 -3663,FOODS_2_152_CA_4 -3664,FOODS_2_152_TX_1 -3665,FOODS_2_152_TX_2 -3666,FOODS_2_152_TX_3 -3667,FOODS_2_152_WI_1 -3668,FOODS_2_152_WI_2 -3669,FOODS_2_152_WI_3 -3670,FOODS_2_153_CA_1 -3671,FOODS_2_153_CA_2 -3672,FOODS_2_153_CA_3 -3673,FOODS_2_153_CA_4 -3674,FOODS_2_153_TX_1 -3675,FOODS_2_153_TX_2 -3676,FOODS_2_153_TX_3 -3677,FOODS_2_153_WI_1 -3678,FOODS_2_153_WI_2 -3679,FOODS_2_153_WI_3 -3680,FOODS_2_154_CA_1 -3681,FOODS_2_154_CA_2 -3682,FOODS_2_154_CA_3 -3683,FOODS_2_154_CA_4 -3684,FOODS_2_154_TX_1 -3685,FOODS_2_154_TX_2 -3686,FOODS_2_154_TX_3 
-3687,FOODS_2_154_WI_1 -3688,FOODS_2_154_WI_2 -3689,FOODS_2_154_WI_3 -3690,FOODS_2_155_CA_1 -3691,FOODS_2_155_CA_2 -3692,FOODS_2_155_CA_3 -3693,FOODS_2_155_CA_4 -3694,FOODS_2_155_TX_1 -3695,FOODS_2_155_TX_2 -3696,FOODS_2_155_TX_3 -3697,FOODS_2_155_WI_1 -3698,FOODS_2_155_WI_2 -3699,FOODS_2_155_WI_3 -3700,FOODS_2_156_CA_1 -3701,FOODS_2_156_CA_2 -3702,FOODS_2_156_CA_3 -3703,FOODS_2_156_CA_4 -3704,FOODS_2_156_TX_1 -3705,FOODS_2_156_TX_2 -3706,FOODS_2_156_TX_3 -3707,FOODS_2_156_WI_1 -3708,FOODS_2_156_WI_2 -3709,FOODS_2_156_WI_3 -3710,FOODS_2_157_CA_1 -3711,FOODS_2_157_CA_2 -3712,FOODS_2_157_CA_3 -3713,FOODS_2_157_CA_4 -3714,FOODS_2_157_TX_1 -3715,FOODS_2_157_TX_2 -3716,FOODS_2_157_TX_3 -3717,FOODS_2_157_WI_1 -3718,FOODS_2_157_WI_2 -3719,FOODS_2_157_WI_3 -3720,FOODS_2_158_CA_1 -3721,FOODS_2_158_CA_2 -3722,FOODS_2_158_CA_3 -3723,FOODS_2_158_CA_4 -3724,FOODS_2_158_TX_1 -3725,FOODS_2_158_TX_2 -3726,FOODS_2_158_TX_3 -3727,FOODS_2_158_WI_1 -3728,FOODS_2_158_WI_2 -3729,FOODS_2_158_WI_3 -3730,FOODS_2_159_CA_1 -3731,FOODS_2_159_CA_2 -3732,FOODS_2_159_CA_3 -3733,FOODS_2_159_CA_4 -3734,FOODS_2_159_TX_1 -3735,FOODS_2_159_TX_2 -3736,FOODS_2_159_TX_3 -3737,FOODS_2_159_WI_1 -3738,FOODS_2_159_WI_2 -3739,FOODS_2_159_WI_3 -3740,FOODS_2_160_CA_1 -3741,FOODS_2_160_CA_2 -3742,FOODS_2_160_CA_3 -3743,FOODS_2_160_CA_4 -3744,FOODS_2_160_TX_1 -3745,FOODS_2_160_TX_2 -3746,FOODS_2_160_TX_3 -3747,FOODS_2_160_WI_1 -3748,FOODS_2_160_WI_2 -3749,FOODS_2_160_WI_3 -3750,FOODS_2_161_CA_1 -3751,FOODS_2_161_CA_2 -3752,FOODS_2_161_CA_3 -3753,FOODS_2_161_CA_4 -3754,FOODS_2_161_TX_1 -3755,FOODS_2_161_TX_2 -3756,FOODS_2_161_TX_3 -3757,FOODS_2_161_WI_1 -3758,FOODS_2_161_WI_2 -3759,FOODS_2_161_WI_3 -3760,FOODS_2_162_CA_1 -3761,FOODS_2_162_CA_2 -3762,FOODS_2_162_CA_3 -3763,FOODS_2_162_CA_4 -3764,FOODS_2_162_TX_1 -3765,FOODS_2_162_TX_2 -3766,FOODS_2_162_TX_3 -3767,FOODS_2_162_WI_1 -3768,FOODS_2_162_WI_2 -3769,FOODS_2_162_WI_3 -3770,FOODS_2_163_CA_1 -3771,FOODS_2_163_CA_2 -3772,FOODS_2_163_CA_3 
-3773,FOODS_2_163_CA_4 -3774,FOODS_2_163_TX_1 -3775,FOODS_2_163_TX_2 -3776,FOODS_2_163_TX_3 -3777,FOODS_2_163_WI_1 -3778,FOODS_2_163_WI_2 -3779,FOODS_2_163_WI_3 -3780,FOODS_2_164_CA_1 -3781,FOODS_2_164_CA_2 -3782,FOODS_2_164_CA_3 -3783,FOODS_2_164_CA_4 -3784,FOODS_2_164_TX_1 -3785,FOODS_2_164_TX_2 -3786,FOODS_2_164_TX_3 -3787,FOODS_2_164_WI_1 -3788,FOODS_2_164_WI_2 -3789,FOODS_2_164_WI_3 -3790,FOODS_2_165_CA_1 -3791,FOODS_2_165_CA_2 -3792,FOODS_2_165_CA_3 -3793,FOODS_2_165_CA_4 -3794,FOODS_2_165_TX_1 -3795,FOODS_2_165_TX_2 -3796,FOODS_2_165_TX_3 -3797,FOODS_2_165_WI_1 -3798,FOODS_2_165_WI_2 -3799,FOODS_2_165_WI_3 -3800,FOODS_2_166_CA_1 -3801,FOODS_2_166_CA_2 -3802,FOODS_2_166_CA_3 -3803,FOODS_2_166_CA_4 -3804,FOODS_2_166_TX_1 -3805,FOODS_2_166_TX_2 -3806,FOODS_2_166_TX_3 -3807,FOODS_2_166_WI_1 -3808,FOODS_2_166_WI_2 -3809,FOODS_2_166_WI_3 -3810,FOODS_2_167_CA_1 -3811,FOODS_2_167_CA_2 -3812,FOODS_2_167_CA_3 -3813,FOODS_2_167_CA_4 -3814,FOODS_2_167_TX_1 -3815,FOODS_2_167_TX_2 -3816,FOODS_2_167_TX_3 -3817,FOODS_2_167_WI_1 -3818,FOODS_2_167_WI_2 -3819,FOODS_2_167_WI_3 -3820,FOODS_2_168_CA_1 -3821,FOODS_2_168_CA_2 -3822,FOODS_2_168_CA_3 -3823,FOODS_2_168_CA_4 -3824,FOODS_2_168_TX_1 -3825,FOODS_2_168_TX_2 -3826,FOODS_2_168_TX_3 -3827,FOODS_2_168_WI_1 -3828,FOODS_2_168_WI_2 -3829,FOODS_2_168_WI_3 -3830,FOODS_2_169_CA_1 -3831,FOODS_2_169_CA_2 -3832,FOODS_2_169_CA_3 -3833,FOODS_2_169_CA_4 -3834,FOODS_2_169_TX_1 -3835,FOODS_2_169_TX_2 -3836,FOODS_2_169_TX_3 -3837,FOODS_2_169_WI_1 -3838,FOODS_2_169_WI_2 -3839,FOODS_2_169_WI_3 -3840,FOODS_2_170_CA_1 -3841,FOODS_2_170_CA_2 -3842,FOODS_2_170_CA_3 -3843,FOODS_2_170_CA_4 -3844,FOODS_2_170_TX_1 -3845,FOODS_2_170_TX_2 -3846,FOODS_2_170_TX_3 -3847,FOODS_2_170_WI_1 -3848,FOODS_2_170_WI_2 -3849,FOODS_2_170_WI_3 -3850,FOODS_2_171_CA_1 -3851,FOODS_2_171_CA_2 -3852,FOODS_2_171_CA_3 -3853,FOODS_2_171_CA_4 -3854,FOODS_2_171_TX_1 -3855,FOODS_2_171_TX_2 -3856,FOODS_2_171_TX_3 -3857,FOODS_2_171_WI_1 -3858,FOODS_2_171_WI_2 
-3859,FOODS_2_171_WI_3 -3860,FOODS_2_172_CA_1 -3861,FOODS_2_172_CA_2 -3862,FOODS_2_172_CA_3 -3863,FOODS_2_172_CA_4 -3864,FOODS_2_172_TX_1 -3865,FOODS_2_172_TX_2 -3866,FOODS_2_172_TX_3 -3867,FOODS_2_172_WI_1 -3868,FOODS_2_172_WI_2 -3869,FOODS_2_172_WI_3 -3870,FOODS_2_173_CA_1 -3871,FOODS_2_173_CA_2 -3872,FOODS_2_173_CA_3 -3873,FOODS_2_173_CA_4 -3874,FOODS_2_173_TX_1 -3875,FOODS_2_173_TX_2 -3876,FOODS_2_173_TX_3 -3877,FOODS_2_173_WI_1 -3878,FOODS_2_173_WI_2 -3879,FOODS_2_173_WI_3 -3880,FOODS_2_174_CA_1 -3881,FOODS_2_174_CA_2 -3882,FOODS_2_174_CA_3 -3883,FOODS_2_174_CA_4 -3884,FOODS_2_174_TX_1 -3885,FOODS_2_174_TX_2 -3886,FOODS_2_174_TX_3 -3887,FOODS_2_174_WI_1 -3888,FOODS_2_174_WI_2 -3889,FOODS_2_174_WI_3 -3890,FOODS_2_175_CA_1 -3891,FOODS_2_175_CA_2 -3892,FOODS_2_175_CA_3 -3893,FOODS_2_175_CA_4 -3894,FOODS_2_175_TX_1 -3895,FOODS_2_175_TX_2 -3896,FOODS_2_175_TX_3 -3897,FOODS_2_175_WI_1 -3898,FOODS_2_175_WI_2 -3899,FOODS_2_175_WI_3 -3900,FOODS_2_176_CA_1 -3901,FOODS_2_176_CA_2 -3902,FOODS_2_176_CA_3 -3903,FOODS_2_176_CA_4 -3904,FOODS_2_176_TX_1 -3905,FOODS_2_176_TX_2 -3906,FOODS_2_176_TX_3 -3907,FOODS_2_176_WI_1 -3908,FOODS_2_176_WI_2 -3909,FOODS_2_176_WI_3 -3910,FOODS_2_177_CA_1 -3911,FOODS_2_177_CA_2 -3912,FOODS_2_177_CA_3 -3913,FOODS_2_177_CA_4 -3914,FOODS_2_177_TX_1 -3915,FOODS_2_177_TX_2 -3916,FOODS_2_177_TX_3 -3917,FOODS_2_177_WI_1 -3918,FOODS_2_177_WI_2 -3919,FOODS_2_177_WI_3 -3920,FOODS_2_178_CA_1 -3921,FOODS_2_178_CA_2 -3922,FOODS_2_178_CA_3 -3923,FOODS_2_178_CA_4 -3924,FOODS_2_178_TX_1 -3925,FOODS_2_178_TX_2 -3926,FOODS_2_178_TX_3 -3927,FOODS_2_178_WI_1 -3928,FOODS_2_178_WI_2 -3929,FOODS_2_178_WI_3 -3930,FOODS_2_179_CA_1 -3931,FOODS_2_179_CA_2 -3932,FOODS_2_179_CA_3 -3933,FOODS_2_179_CA_4 -3934,FOODS_2_179_TX_1 -3935,FOODS_2_179_TX_2 -3936,FOODS_2_179_TX_3 -3937,FOODS_2_179_WI_1 -3938,FOODS_2_179_WI_2 -3939,FOODS_2_179_WI_3 -3940,FOODS_2_180_CA_1 -3941,FOODS_2_180_CA_2 -3942,FOODS_2_180_CA_3 -3943,FOODS_2_180_CA_4 -3944,FOODS_2_180_TX_1 
-3945,FOODS_2_180_TX_2 -3946,FOODS_2_180_TX_3 -3947,FOODS_2_180_WI_1 -3948,FOODS_2_180_WI_2 -3949,FOODS_2_180_WI_3 -3950,FOODS_2_181_CA_1 -3951,FOODS_2_181_CA_2 -3952,FOODS_2_181_CA_3 -3953,FOODS_2_181_CA_4 -3954,FOODS_2_181_TX_1 -3955,FOODS_2_181_TX_2 -3956,FOODS_2_181_TX_3 -3957,FOODS_2_181_WI_1 -3958,FOODS_2_181_WI_2 -3959,FOODS_2_181_WI_3 -3960,FOODS_2_182_CA_1 -3961,FOODS_2_182_CA_2 -3962,FOODS_2_182_CA_3 -3963,FOODS_2_182_CA_4 -3964,FOODS_2_182_TX_1 -3965,FOODS_2_182_TX_2 -3966,FOODS_2_182_TX_3 -3967,FOODS_2_182_WI_1 -3968,FOODS_2_182_WI_2 -3969,FOODS_2_182_WI_3 -3970,FOODS_2_183_CA_1 -3971,FOODS_2_183_CA_2 -3972,FOODS_2_183_CA_3 -3973,FOODS_2_183_CA_4 -3974,FOODS_2_183_TX_1 -3975,FOODS_2_183_TX_2 -3976,FOODS_2_183_TX_3 -3977,FOODS_2_183_WI_1 -3978,FOODS_2_183_WI_2 -3979,FOODS_2_183_WI_3 -3980,FOODS_2_184_CA_1 -3981,FOODS_2_184_CA_2 -3982,FOODS_2_184_CA_3 -3983,FOODS_2_184_CA_4 -3984,FOODS_2_184_TX_1 -3985,FOODS_2_184_TX_2 -3986,FOODS_2_184_TX_3 -3987,FOODS_2_184_WI_1 -3988,FOODS_2_184_WI_2 -3989,FOODS_2_184_WI_3 -3990,FOODS_2_185_CA_1 -3991,FOODS_2_185_CA_2 -3992,FOODS_2_185_CA_3 -3993,FOODS_2_185_CA_4 -3994,FOODS_2_185_TX_1 -3995,FOODS_2_185_TX_2 -3996,FOODS_2_185_TX_3 -3997,FOODS_2_185_WI_1 -3998,FOODS_2_185_WI_2 -3999,FOODS_2_185_WI_3 -4000,FOODS_2_186_CA_1 -4001,FOODS_2_186_CA_2 -4002,FOODS_2_186_CA_3 -4003,FOODS_2_186_CA_4 -4004,FOODS_2_186_TX_1 -4005,FOODS_2_186_TX_2 -4006,FOODS_2_186_TX_3 -4007,FOODS_2_186_WI_1 -4008,FOODS_2_186_WI_2 -4009,FOODS_2_186_WI_3 -4010,FOODS_2_187_CA_1 -4011,FOODS_2_187_CA_2 -4012,FOODS_2_187_CA_3 -4013,FOODS_2_187_CA_4 -4014,FOODS_2_187_TX_1 -4015,FOODS_2_187_TX_2 -4016,FOODS_2_187_TX_3 -4017,FOODS_2_187_WI_1 -4018,FOODS_2_187_WI_2 -4019,FOODS_2_187_WI_3 -4020,FOODS_2_188_CA_1 -4021,FOODS_2_188_CA_2 -4022,FOODS_2_188_CA_3 -4023,FOODS_2_188_CA_4 -4024,FOODS_2_188_TX_1 -4025,FOODS_2_188_TX_2 -4026,FOODS_2_188_TX_3 -4027,FOODS_2_188_WI_1 -4028,FOODS_2_188_WI_2 -4029,FOODS_2_188_WI_3 -4030,FOODS_2_189_CA_1 
-4031,FOODS_2_189_CA_2 -4032,FOODS_2_189_CA_3 -4033,FOODS_2_189_CA_4 -4034,FOODS_2_189_TX_1 -4035,FOODS_2_189_TX_2 -4036,FOODS_2_189_TX_3 -4037,FOODS_2_189_WI_1 -4038,FOODS_2_189_WI_2 -4039,FOODS_2_189_WI_3 -4040,FOODS_2_190_CA_1 -4041,FOODS_2_190_CA_2 -4042,FOODS_2_190_CA_3 -4043,FOODS_2_190_CA_4 -4044,FOODS_2_190_TX_1 -4045,FOODS_2_190_TX_2 -4046,FOODS_2_190_TX_3 -4047,FOODS_2_190_WI_1 -4048,FOODS_2_190_WI_2 -4049,FOODS_2_190_WI_3 -4050,FOODS_2_191_CA_1 -4051,FOODS_2_191_CA_2 -4052,FOODS_2_191_CA_3 -4053,FOODS_2_191_CA_4 -4054,FOODS_2_191_TX_1 -4055,FOODS_2_191_TX_2 -4056,FOODS_2_191_TX_3 -4057,FOODS_2_191_WI_1 -4058,FOODS_2_191_WI_2 -4059,FOODS_2_191_WI_3 -4060,FOODS_2_192_CA_1 -4061,FOODS_2_192_CA_2 -4062,FOODS_2_192_CA_3 -4063,FOODS_2_192_CA_4 -4064,FOODS_2_192_TX_1 -4065,FOODS_2_192_TX_2 -4066,FOODS_2_192_TX_3 -4067,FOODS_2_192_WI_1 -4068,FOODS_2_192_WI_2 -4069,FOODS_2_192_WI_3 -4070,FOODS_2_193_CA_1 -4071,FOODS_2_193_CA_2 -4072,FOODS_2_193_CA_3 -4073,FOODS_2_193_CA_4 -4074,FOODS_2_193_TX_1 -4075,FOODS_2_193_TX_2 -4076,FOODS_2_193_TX_3 -4077,FOODS_2_193_WI_1 -4078,FOODS_2_193_WI_2 -4079,FOODS_2_193_WI_3 -4080,FOODS_2_194_CA_1 -4081,FOODS_2_194_CA_2 -4082,FOODS_2_194_CA_3 -4083,FOODS_2_194_CA_4 -4084,FOODS_2_194_TX_1 -4085,FOODS_2_194_TX_2 -4086,FOODS_2_194_TX_3 -4087,FOODS_2_194_WI_1 -4088,FOODS_2_194_WI_2 -4089,FOODS_2_194_WI_3 -4090,FOODS_2_195_CA_1 -4091,FOODS_2_195_CA_2 -4092,FOODS_2_195_CA_3 -4093,FOODS_2_195_CA_4 -4094,FOODS_2_195_TX_1 -4095,FOODS_2_195_TX_2 -4096,FOODS_2_195_TX_3 -4097,FOODS_2_195_WI_1 -4098,FOODS_2_195_WI_2 -4099,FOODS_2_195_WI_3 -4100,FOODS_2_196_CA_1 -4101,FOODS_2_196_CA_2 -4102,FOODS_2_196_CA_3 -4103,FOODS_2_196_CA_4 -4104,FOODS_2_196_TX_1 -4105,FOODS_2_196_TX_2 -4106,FOODS_2_196_TX_3 -4107,FOODS_2_196_WI_1 -4108,FOODS_2_196_WI_2 -4109,FOODS_2_196_WI_3 -4110,FOODS_2_197_CA_1 -4111,FOODS_2_197_CA_2 -4112,FOODS_2_197_CA_3 -4113,FOODS_2_197_CA_4 -4114,FOODS_2_197_TX_1 -4115,FOODS_2_197_TX_2 -4116,FOODS_2_197_TX_3 
-4117,FOODS_2_197_WI_1 -4118,FOODS_2_197_WI_2 -4119,FOODS_2_197_WI_3 -4120,FOODS_2_198_CA_1 -4121,FOODS_2_198_CA_2 -4122,FOODS_2_198_CA_3 -4123,FOODS_2_198_CA_4 -4124,FOODS_2_198_TX_1 -4125,FOODS_2_198_TX_2 -4126,FOODS_2_198_TX_3 -4127,FOODS_2_198_WI_1 -4128,FOODS_2_198_WI_2 -4129,FOODS_2_198_WI_3 -4130,FOODS_2_199_CA_1 -4131,FOODS_2_199_CA_2 -4132,FOODS_2_199_CA_3 -4133,FOODS_2_199_CA_4 -4134,FOODS_2_199_TX_1 -4135,FOODS_2_199_TX_2 -4136,FOODS_2_199_TX_3 -4137,FOODS_2_199_WI_1 -4138,FOODS_2_199_WI_2 -4139,FOODS_2_199_WI_3 -4140,FOODS_2_200_CA_1 -4141,FOODS_2_200_CA_2 -4142,FOODS_2_200_CA_3 -4143,FOODS_2_200_CA_4 -4144,FOODS_2_200_TX_1 -4145,FOODS_2_200_TX_2 -4146,FOODS_2_200_TX_3 -4147,FOODS_2_200_WI_1 -4148,FOODS_2_200_WI_2 -4149,FOODS_2_200_WI_3 -4150,FOODS_2_201_CA_1 -4151,FOODS_2_201_CA_2 -4152,FOODS_2_201_CA_3 -4153,FOODS_2_201_CA_4 -4154,FOODS_2_201_TX_1 -4155,FOODS_2_201_TX_2 -4156,FOODS_2_201_TX_3 -4157,FOODS_2_201_WI_1 -4158,FOODS_2_201_WI_2 -4159,FOODS_2_201_WI_3 -4160,FOODS_2_202_CA_1 -4161,FOODS_2_202_CA_2 -4162,FOODS_2_202_CA_3 -4163,FOODS_2_202_CA_4 -4164,FOODS_2_202_TX_1 -4165,FOODS_2_202_TX_2 -4166,FOODS_2_202_TX_3 -4167,FOODS_2_202_WI_1 -4168,FOODS_2_202_WI_2 -4169,FOODS_2_202_WI_3 -4170,FOODS_2_203_CA_1 -4171,FOODS_2_203_CA_2 -4172,FOODS_2_203_CA_3 -4173,FOODS_2_203_CA_4 -4174,FOODS_2_203_TX_1 -4175,FOODS_2_203_TX_2 -4176,FOODS_2_203_TX_3 -4177,FOODS_2_203_WI_1 -4178,FOODS_2_203_WI_2 -4179,FOODS_2_203_WI_3 -4180,FOODS_2_204_CA_1 -4181,FOODS_2_204_CA_2 -4182,FOODS_2_204_CA_3 -4183,FOODS_2_204_CA_4 -4184,FOODS_2_204_TX_1 -4185,FOODS_2_204_TX_2 -4186,FOODS_2_204_TX_3 -4187,FOODS_2_204_WI_1 -4188,FOODS_2_204_WI_2 -4189,FOODS_2_204_WI_3 -4190,FOODS_2_205_CA_1 -4191,FOODS_2_205_CA_2 -4192,FOODS_2_205_CA_3 -4193,FOODS_2_205_CA_4 -4194,FOODS_2_205_TX_1 -4195,FOODS_2_205_TX_2 -4196,FOODS_2_205_TX_3 -4197,FOODS_2_205_WI_1 -4198,FOODS_2_205_WI_2 -4199,FOODS_2_205_WI_3 -4200,FOODS_2_206_CA_1 -4201,FOODS_2_206_CA_2 -4202,FOODS_2_206_CA_3 
-4203,FOODS_2_206_CA_4 -4204,FOODS_2_206_TX_1 -4205,FOODS_2_206_TX_2 -4206,FOODS_2_206_TX_3 -4207,FOODS_2_206_WI_1 -4208,FOODS_2_206_WI_2 -4209,FOODS_2_206_WI_3 -4210,FOODS_2_207_CA_1 -4211,FOODS_2_207_CA_2 -4212,FOODS_2_207_CA_3 -4213,FOODS_2_207_CA_4 -4214,FOODS_2_207_TX_1 -4215,FOODS_2_207_TX_2 -4216,FOODS_2_207_TX_3 -4217,FOODS_2_207_WI_1 -4218,FOODS_2_207_WI_2 -4219,FOODS_2_207_WI_3 -4220,FOODS_2_208_CA_1 -4221,FOODS_2_208_CA_2 -4222,FOODS_2_208_CA_3 -4223,FOODS_2_208_CA_4 -4224,FOODS_2_208_TX_1 -4225,FOODS_2_208_TX_2 -4226,FOODS_2_208_TX_3 -4227,FOODS_2_208_WI_1 -4228,FOODS_2_208_WI_2 -4229,FOODS_2_208_WI_3 -4230,FOODS_2_209_CA_1 -4231,FOODS_2_209_CA_2 -4232,FOODS_2_209_CA_3 -4233,FOODS_2_209_CA_4 -4234,FOODS_2_209_TX_1 -4235,FOODS_2_209_TX_2 -4236,FOODS_2_209_TX_3 -4237,FOODS_2_209_WI_1 -4238,FOODS_2_209_WI_2 -4239,FOODS_2_209_WI_3 -4240,FOODS_2_210_CA_1 -4241,FOODS_2_210_CA_2 -4242,FOODS_2_210_CA_3 -4243,FOODS_2_210_CA_4 -4244,FOODS_2_210_TX_1 -4245,FOODS_2_210_TX_2 -4246,FOODS_2_210_TX_3 -4247,FOODS_2_210_WI_1 -4248,FOODS_2_210_WI_2 -4249,FOODS_2_210_WI_3 -4250,FOODS_2_211_CA_1 -4251,FOODS_2_211_CA_2 -4252,FOODS_2_211_CA_3 -4253,FOODS_2_211_CA_4 -4254,FOODS_2_211_TX_1 -4255,FOODS_2_211_TX_2 -4256,FOODS_2_211_TX_3 -4257,FOODS_2_211_WI_1 -4258,FOODS_2_211_WI_2 -4259,FOODS_2_211_WI_3 -4260,FOODS_2_212_CA_1 -4261,FOODS_2_212_CA_2 -4262,FOODS_2_212_CA_3 -4263,FOODS_2_212_CA_4 -4264,FOODS_2_212_TX_1 -4265,FOODS_2_212_TX_2 -4266,FOODS_2_212_TX_3 -4267,FOODS_2_212_WI_1 -4268,FOODS_2_212_WI_2 -4269,FOODS_2_212_WI_3 -4270,FOODS_2_213_CA_1 -4271,FOODS_2_213_CA_2 -4272,FOODS_2_213_CA_3 -4273,FOODS_2_213_CA_4 -4274,FOODS_2_213_TX_1 -4275,FOODS_2_213_TX_2 -4276,FOODS_2_213_TX_3 -4277,FOODS_2_213_WI_1 -4278,FOODS_2_213_WI_2 -4279,FOODS_2_213_WI_3 -4280,FOODS_2_214_CA_1 -4281,FOODS_2_214_CA_2 -4282,FOODS_2_214_CA_3 -4283,FOODS_2_214_CA_4 -4284,FOODS_2_214_TX_1 -4285,FOODS_2_214_TX_2 -4286,FOODS_2_214_TX_3 -4287,FOODS_2_214_WI_1 -4288,FOODS_2_214_WI_2 
-4289,FOODS_2_214_WI_3 -4290,FOODS_2_215_CA_1 -4291,FOODS_2_215_CA_2 -4292,FOODS_2_215_CA_3 -4293,FOODS_2_215_CA_4 -4294,FOODS_2_215_TX_1 -4295,FOODS_2_215_TX_2 -4296,FOODS_2_215_TX_3 -4297,FOODS_2_215_WI_1 -4298,FOODS_2_215_WI_2 -4299,FOODS_2_215_WI_3 -4300,FOODS_2_216_CA_1 -4301,FOODS_2_216_CA_2 -4302,FOODS_2_216_CA_3 -4303,FOODS_2_216_CA_4 -4304,FOODS_2_216_TX_1 -4305,FOODS_2_216_TX_2 -4306,FOODS_2_216_TX_3 -4307,FOODS_2_216_WI_1 -4308,FOODS_2_216_WI_2 -4309,FOODS_2_216_WI_3 -4310,FOODS_2_217_CA_1 -4311,FOODS_2_217_CA_2 -4312,FOODS_2_217_CA_3 -4313,FOODS_2_217_CA_4 -4314,FOODS_2_217_TX_1 -4315,FOODS_2_217_TX_2 -4316,FOODS_2_217_TX_3 -4317,FOODS_2_217_WI_1 -4318,FOODS_2_217_WI_2 -4319,FOODS_2_217_WI_3 -4320,FOODS_2_218_CA_1 -4321,FOODS_2_218_CA_2 -4322,FOODS_2_218_CA_3 -4323,FOODS_2_218_CA_4 -4324,FOODS_2_218_TX_1 -4325,FOODS_2_218_TX_2 -4326,FOODS_2_218_TX_3 -4327,FOODS_2_218_WI_1 -4328,FOODS_2_218_WI_2 -4329,FOODS_2_218_WI_3 -4330,FOODS_2_219_CA_1 -4331,FOODS_2_219_CA_2 -4332,FOODS_2_219_CA_3 -4333,FOODS_2_219_CA_4 -4334,FOODS_2_219_TX_1 -4335,FOODS_2_219_TX_2 -4336,FOODS_2_219_TX_3 -4337,FOODS_2_219_WI_1 -4338,FOODS_2_219_WI_2 -4339,FOODS_2_219_WI_3 -4340,FOODS_2_220_CA_1 -4341,FOODS_2_220_CA_2 -4342,FOODS_2_220_CA_3 -4343,FOODS_2_220_CA_4 -4344,FOODS_2_220_TX_1 -4345,FOODS_2_220_TX_2 -4346,FOODS_2_220_TX_3 -4347,FOODS_2_220_WI_1 -4348,FOODS_2_220_WI_2 -4349,FOODS_2_220_WI_3 -4350,FOODS_2_221_CA_1 -4351,FOODS_2_221_CA_2 -4352,FOODS_2_221_CA_3 -4353,FOODS_2_221_CA_4 -4354,FOODS_2_221_TX_1 -4355,FOODS_2_221_TX_2 -4356,FOODS_2_221_TX_3 -4357,FOODS_2_221_WI_1 -4358,FOODS_2_221_WI_2 -4359,FOODS_2_221_WI_3 -4360,FOODS_2_222_CA_1 -4361,FOODS_2_222_CA_2 -4362,FOODS_2_222_CA_3 -4363,FOODS_2_222_CA_4 -4364,FOODS_2_222_TX_1 -4365,FOODS_2_222_TX_2 -4366,FOODS_2_222_TX_3 -4367,FOODS_2_222_WI_1 -4368,FOODS_2_222_WI_2 -4369,FOODS_2_222_WI_3 -4370,FOODS_2_223_CA_1 -4371,FOODS_2_223_CA_2 -4372,FOODS_2_223_CA_3 -4373,FOODS_2_223_CA_4 -4374,FOODS_2_223_TX_1 
-4375,FOODS_2_223_TX_2 -4376,FOODS_2_223_TX_3 -4377,FOODS_2_223_WI_1 -4378,FOODS_2_223_WI_2 -4379,FOODS_2_223_WI_3 -4380,FOODS_2_224_CA_1 -4381,FOODS_2_224_CA_2 -4382,FOODS_2_224_CA_3 -4383,FOODS_2_224_CA_4 -4384,FOODS_2_224_TX_1 -4385,FOODS_2_224_TX_2 -4386,FOODS_2_224_TX_3 -4387,FOODS_2_224_WI_1 -4388,FOODS_2_224_WI_2 -4389,FOODS_2_224_WI_3 -4390,FOODS_2_225_CA_1 -4391,FOODS_2_225_CA_2 -4392,FOODS_2_225_CA_3 -4393,FOODS_2_225_CA_4 -4394,FOODS_2_225_TX_1 -4395,FOODS_2_225_TX_2 -4396,FOODS_2_225_TX_3 -4397,FOODS_2_225_WI_1 -4398,FOODS_2_225_WI_2 -4399,FOODS_2_225_WI_3 -4400,FOODS_2_226_CA_1 -4401,FOODS_2_226_CA_2 -4402,FOODS_2_226_CA_3 -4403,FOODS_2_226_CA_4 -4404,FOODS_2_226_TX_1 -4405,FOODS_2_226_TX_2 -4406,FOODS_2_226_TX_3 -4407,FOODS_2_226_WI_1 -4408,FOODS_2_226_WI_2 -4409,FOODS_2_226_WI_3 -4410,FOODS_2_227_CA_1 -4411,FOODS_2_227_CA_2 -4412,FOODS_2_227_CA_3 -4413,FOODS_2_227_CA_4 -4414,FOODS_2_227_TX_1 -4415,FOODS_2_227_TX_2 -4416,FOODS_2_227_TX_3 -4417,FOODS_2_227_WI_1 -4418,FOODS_2_227_WI_2 -4419,FOODS_2_227_WI_3 -4420,FOODS_2_228_CA_1 -4421,FOODS_2_228_CA_2 -4422,FOODS_2_228_CA_3 -4423,FOODS_2_228_CA_4 -4424,FOODS_2_228_TX_1 -4425,FOODS_2_228_TX_2 -4426,FOODS_2_228_TX_3 -4427,FOODS_2_228_WI_1 -4428,FOODS_2_228_WI_2 -4429,FOODS_2_228_WI_3 -4430,FOODS_2_229_CA_1 -4431,FOODS_2_229_CA_2 -4432,FOODS_2_229_CA_3 -4433,FOODS_2_229_CA_4 -4434,FOODS_2_229_TX_1 -4435,FOODS_2_229_TX_2 -4436,FOODS_2_229_TX_3 -4437,FOODS_2_229_WI_1 -4438,FOODS_2_229_WI_2 -4439,FOODS_2_229_WI_3 -4440,FOODS_2_230_CA_1 -4441,FOODS_2_230_CA_2 -4442,FOODS_2_230_CA_3 -4443,FOODS_2_230_CA_4 -4444,FOODS_2_230_TX_1 -4445,FOODS_2_230_TX_2 -4446,FOODS_2_230_TX_3 -4447,FOODS_2_230_WI_1 -4448,FOODS_2_230_WI_2 -4449,FOODS_2_230_WI_3 -4450,FOODS_2_231_CA_1 -4451,FOODS_2_231_CA_2 -4452,FOODS_2_231_CA_3 -4453,FOODS_2_231_CA_4 -4454,FOODS_2_231_TX_1 -4455,FOODS_2_231_TX_2 -4456,FOODS_2_231_TX_3 -4457,FOODS_2_231_WI_1 -4458,FOODS_2_231_WI_2 -4459,FOODS_2_231_WI_3 -4460,FOODS_2_232_CA_1 
-4461,FOODS_2_232_CA_2 -4462,FOODS_2_232_CA_3 -4463,FOODS_2_232_CA_4 -4464,FOODS_2_232_TX_1 -4465,FOODS_2_232_TX_2 -4466,FOODS_2_232_TX_3 -4467,FOODS_2_232_WI_1 -4468,FOODS_2_232_WI_2 -4469,FOODS_2_232_WI_3 -4470,FOODS_2_233_CA_1 -4471,FOODS_2_233_CA_2 -4472,FOODS_2_233_CA_3 -4473,FOODS_2_233_CA_4 -4474,FOODS_2_233_TX_1 -4475,FOODS_2_233_TX_2 -4476,FOODS_2_233_TX_3 -4477,FOODS_2_233_WI_1 -4478,FOODS_2_233_WI_2 -4479,FOODS_2_233_WI_3 -4480,FOODS_2_234_CA_1 -4481,FOODS_2_234_CA_2 -4482,FOODS_2_234_CA_3 -4483,FOODS_2_234_CA_4 -4484,FOODS_2_234_TX_1 -4485,FOODS_2_234_TX_2 -4486,FOODS_2_234_TX_3 -4487,FOODS_2_234_WI_1 -4488,FOODS_2_234_WI_2 -4489,FOODS_2_234_WI_3 -4490,FOODS_2_235_CA_1 -4491,FOODS_2_235_CA_2 -4492,FOODS_2_235_CA_3 -4493,FOODS_2_235_CA_4 -4494,FOODS_2_235_TX_1 -4495,FOODS_2_235_TX_2 -4496,FOODS_2_235_TX_3 -4497,FOODS_2_235_WI_1 -4498,FOODS_2_235_WI_2 -4499,FOODS_2_235_WI_3 -4500,FOODS_2_236_CA_1 -4501,FOODS_2_236_CA_2 -4502,FOODS_2_236_CA_3 -4503,FOODS_2_236_CA_4 -4504,FOODS_2_236_TX_1 -4505,FOODS_2_236_TX_2 -4506,FOODS_2_236_TX_3 -4507,FOODS_2_236_WI_1 -4508,FOODS_2_236_WI_2 -4509,FOODS_2_236_WI_3 -4510,FOODS_2_237_CA_1 -4511,FOODS_2_237_CA_2 -4512,FOODS_2_237_CA_3 -4513,FOODS_2_237_CA_4 -4514,FOODS_2_237_TX_1 -4515,FOODS_2_237_TX_2 -4516,FOODS_2_237_TX_3 -4517,FOODS_2_237_WI_1 -4518,FOODS_2_237_WI_2 -4519,FOODS_2_237_WI_3 -4520,FOODS_2_238_CA_1 -4521,FOODS_2_238_CA_2 -4522,FOODS_2_238_CA_3 -4523,FOODS_2_238_CA_4 -4524,FOODS_2_238_TX_1 -4525,FOODS_2_238_TX_2 -4526,FOODS_2_238_TX_3 -4527,FOODS_2_238_WI_1 -4528,FOODS_2_238_WI_2 -4529,FOODS_2_238_WI_3 -4530,FOODS_2_239_CA_1 -4531,FOODS_2_239_CA_2 -4532,FOODS_2_239_CA_3 -4533,FOODS_2_239_CA_4 -4534,FOODS_2_239_TX_1 -4535,FOODS_2_239_TX_2 -4536,FOODS_2_239_TX_3 -4537,FOODS_2_239_WI_1 -4538,FOODS_2_239_WI_2 -4539,FOODS_2_239_WI_3 -4540,FOODS_2_240_CA_1 -4541,FOODS_2_240_CA_2 -4542,FOODS_2_240_CA_3 -4543,FOODS_2_240_CA_4 -4544,FOODS_2_240_TX_1 -4545,FOODS_2_240_TX_2 -4546,FOODS_2_240_TX_3 
-4547,FOODS_2_240_WI_1 -4548,FOODS_2_240_WI_2 -4549,FOODS_2_240_WI_3 -4550,FOODS_2_241_CA_1 -4551,FOODS_2_241_CA_2 -4552,FOODS_2_241_CA_3 -4553,FOODS_2_241_CA_4 -4554,FOODS_2_241_TX_1 -4555,FOODS_2_241_TX_2 -4556,FOODS_2_241_TX_3 -4557,FOODS_2_241_WI_1 -4558,FOODS_2_241_WI_2 -4559,FOODS_2_241_WI_3 -4560,FOODS_2_242_CA_1 -4561,FOODS_2_242_CA_2 -4562,FOODS_2_242_CA_3 -4563,FOODS_2_242_CA_4 -4564,FOODS_2_242_TX_1 -4565,FOODS_2_242_TX_2 -4566,FOODS_2_242_TX_3 -4567,FOODS_2_242_WI_1 -4568,FOODS_2_242_WI_2 -4569,FOODS_2_242_WI_3 -4570,FOODS_2_243_CA_1 -4571,FOODS_2_243_CA_2 -4572,FOODS_2_243_CA_3 -4573,FOODS_2_243_CA_4 -4574,FOODS_2_243_TX_1 -4575,FOODS_2_243_TX_2 -4576,FOODS_2_243_TX_3 -4577,FOODS_2_243_WI_1 -4578,FOODS_2_243_WI_2 -4579,FOODS_2_243_WI_3 -4580,FOODS_2_244_CA_1 -4581,FOODS_2_244_CA_2 -4582,FOODS_2_244_CA_3 -4583,FOODS_2_244_CA_4 -4584,FOODS_2_244_TX_1 -4585,FOODS_2_244_TX_2 -4586,FOODS_2_244_TX_3 -4587,FOODS_2_244_WI_1 -4588,FOODS_2_244_WI_2 -4589,FOODS_2_244_WI_3 -4590,FOODS_2_245_CA_1 -4591,FOODS_2_245_CA_2 -4592,FOODS_2_245_CA_3 -4593,FOODS_2_245_CA_4 -4594,FOODS_2_245_TX_1 -4595,FOODS_2_245_TX_2 -4596,FOODS_2_245_TX_3 -4597,FOODS_2_245_WI_1 -4598,FOODS_2_245_WI_2 -4599,FOODS_2_245_WI_3 -4600,FOODS_2_246_CA_1 -4601,FOODS_2_246_CA_2 -4602,FOODS_2_246_CA_3 -4603,FOODS_2_246_CA_4 -4604,FOODS_2_246_TX_1 -4605,FOODS_2_246_TX_2 -4606,FOODS_2_246_TX_3 -4607,FOODS_2_246_WI_1 -4608,FOODS_2_246_WI_2 -4609,FOODS_2_246_WI_3 -4610,FOODS_2_247_CA_1 -4611,FOODS_2_247_CA_2 -4612,FOODS_2_247_CA_3 -4613,FOODS_2_247_CA_4 -4614,FOODS_2_247_TX_1 -4615,FOODS_2_247_TX_2 -4616,FOODS_2_247_TX_3 -4617,FOODS_2_247_WI_1 -4618,FOODS_2_247_WI_2 -4619,FOODS_2_247_WI_3 -4620,FOODS_2_248_CA_1 -4621,FOODS_2_248_CA_2 -4622,FOODS_2_248_CA_3 -4623,FOODS_2_248_CA_4 -4624,FOODS_2_248_TX_1 -4625,FOODS_2_248_TX_2 -4626,FOODS_2_248_TX_3 -4627,FOODS_2_248_WI_1 -4628,FOODS_2_248_WI_2 -4629,FOODS_2_248_WI_3 -4630,FOODS_2_249_CA_1 -4631,FOODS_2_249_CA_2 -4632,FOODS_2_249_CA_3 
-4633,FOODS_2_249_CA_4 -4634,FOODS_2_249_TX_1 -4635,FOODS_2_249_TX_2 -4636,FOODS_2_249_TX_3 -4637,FOODS_2_249_WI_1 -4638,FOODS_2_249_WI_2 -4639,FOODS_2_249_WI_3 -4640,FOODS_2_250_CA_1 -4641,FOODS_2_250_CA_2 -4642,FOODS_2_250_CA_3 -4643,FOODS_2_250_CA_4 -4644,FOODS_2_250_TX_1 -4645,FOODS_2_250_TX_2 -4646,FOODS_2_250_TX_3 -4647,FOODS_2_250_WI_1 -4648,FOODS_2_250_WI_2 -4649,FOODS_2_250_WI_3 -4650,FOODS_2_251_CA_1 -4651,FOODS_2_251_CA_2 -4652,FOODS_2_251_CA_3 -4653,FOODS_2_251_CA_4 -4654,FOODS_2_251_TX_1 -4655,FOODS_2_251_TX_2 -4656,FOODS_2_251_TX_3 -4657,FOODS_2_251_WI_1 -4658,FOODS_2_251_WI_2 -4659,FOODS_2_251_WI_3 -4660,FOODS_2_252_CA_1 -4661,FOODS_2_252_CA_2 -4662,FOODS_2_252_CA_3 -4663,FOODS_2_252_CA_4 -4664,FOODS_2_252_TX_1 -4665,FOODS_2_252_TX_2 -4666,FOODS_2_252_TX_3 -4667,FOODS_2_252_WI_1 -4668,FOODS_2_252_WI_2 -4669,FOODS_2_252_WI_3 -4670,FOODS_2_253_CA_1 -4671,FOODS_2_253_CA_2 -4672,FOODS_2_253_CA_3 -4673,FOODS_2_253_CA_4 -4674,FOODS_2_253_TX_1 -4675,FOODS_2_253_TX_2 -4676,FOODS_2_253_TX_3 -4677,FOODS_2_253_WI_1 -4678,FOODS_2_253_WI_2 -4679,FOODS_2_253_WI_3 -4680,FOODS_2_254_CA_1 -4681,FOODS_2_254_CA_2 -4682,FOODS_2_254_CA_3 -4683,FOODS_2_254_CA_4 -4684,FOODS_2_254_TX_1 -4685,FOODS_2_254_TX_2 -4686,FOODS_2_254_TX_3 -4687,FOODS_2_254_WI_1 -4688,FOODS_2_254_WI_2 -4689,FOODS_2_254_WI_3 -4690,FOODS_2_255_CA_1 -4691,FOODS_2_255_CA_2 -4692,FOODS_2_255_CA_3 -4693,FOODS_2_255_CA_4 -4694,FOODS_2_255_TX_1 -4695,FOODS_2_255_TX_2 -4696,FOODS_2_255_TX_3 -4697,FOODS_2_255_WI_1 -4698,FOODS_2_255_WI_2 -4699,FOODS_2_255_WI_3 -4700,FOODS_2_256_CA_1 -4701,FOODS_2_256_CA_2 -4702,FOODS_2_256_CA_3 -4703,FOODS_2_256_CA_4 -4704,FOODS_2_256_TX_1 -4705,FOODS_2_256_TX_2 -4706,FOODS_2_256_TX_3 -4707,FOODS_2_256_WI_1 -4708,FOODS_2_256_WI_2 -4709,FOODS_2_256_WI_3 -4710,FOODS_2_257_CA_1 -4711,FOODS_2_257_CA_2 -4712,FOODS_2_257_CA_3 -4713,FOODS_2_257_CA_4 -4714,FOODS_2_257_TX_1 -4715,FOODS_2_257_TX_2 -4716,FOODS_2_257_TX_3 -4717,FOODS_2_257_WI_1 -4718,FOODS_2_257_WI_2 
-4719,FOODS_2_257_WI_3 -4720,FOODS_2_258_CA_1 -4721,FOODS_2_258_CA_2 -4722,FOODS_2_258_CA_3 -4723,FOODS_2_258_CA_4 -4724,FOODS_2_258_TX_1 -4725,FOODS_2_258_TX_2 -4726,FOODS_2_258_TX_3 -4727,FOODS_2_258_WI_1 -4728,FOODS_2_258_WI_2 -4729,FOODS_2_258_WI_3 -4730,FOODS_2_259_CA_1 -4731,FOODS_2_259_CA_2 -4732,FOODS_2_259_CA_3 -4733,FOODS_2_259_CA_4 -4734,FOODS_2_259_TX_1 -4735,FOODS_2_259_TX_2 -4736,FOODS_2_259_TX_3 -4737,FOODS_2_259_WI_1 -4738,FOODS_2_259_WI_2 -4739,FOODS_2_259_WI_3 -4740,FOODS_2_260_CA_1 -4741,FOODS_2_260_CA_2 -4742,FOODS_2_260_CA_3 -4743,FOODS_2_260_CA_4 -4744,FOODS_2_260_TX_1 -4745,FOODS_2_260_TX_2 -4746,FOODS_2_260_TX_3 -4747,FOODS_2_260_WI_1 -4748,FOODS_2_260_WI_2 -4749,FOODS_2_260_WI_3 -4750,FOODS_2_261_CA_1 -4751,FOODS_2_261_CA_2 -4752,FOODS_2_261_CA_3 -4753,FOODS_2_261_CA_4 -4754,FOODS_2_261_TX_1 -4755,FOODS_2_261_TX_2 -4756,FOODS_2_261_TX_3 -4757,FOODS_2_261_WI_1 -4758,FOODS_2_261_WI_2 -4759,FOODS_2_261_WI_3 -4760,FOODS_2_262_CA_1 -4761,FOODS_2_262_CA_2 -4762,FOODS_2_262_CA_3 -4763,FOODS_2_262_CA_4 -4764,FOODS_2_262_TX_1 -4765,FOODS_2_262_TX_2 -4766,FOODS_2_262_TX_3 -4767,FOODS_2_262_WI_1 -4768,FOODS_2_262_WI_2 -4769,FOODS_2_262_WI_3 -4770,FOODS_2_263_CA_1 -4771,FOODS_2_263_CA_2 -4772,FOODS_2_263_CA_3 -4773,FOODS_2_263_CA_4 -4774,FOODS_2_263_TX_1 -4775,FOODS_2_263_TX_2 -4776,FOODS_2_263_TX_3 -4777,FOODS_2_263_WI_1 -4778,FOODS_2_263_WI_2 -4779,FOODS_2_263_WI_3 -4780,FOODS_2_264_CA_1 -4781,FOODS_2_264_CA_2 -4782,FOODS_2_264_CA_3 -4783,FOODS_2_264_CA_4 -4784,FOODS_2_264_TX_1 -4785,FOODS_2_264_TX_2 -4786,FOODS_2_264_TX_3 -4787,FOODS_2_264_WI_1 -4788,FOODS_2_264_WI_2 -4789,FOODS_2_264_WI_3 -4790,FOODS_2_265_CA_1 -4791,FOODS_2_265_CA_2 -4792,FOODS_2_265_CA_3 -4793,FOODS_2_265_CA_4 -4794,FOODS_2_265_TX_1 -4795,FOODS_2_265_TX_2 -4796,FOODS_2_265_TX_3 -4797,FOODS_2_265_WI_1 -4798,FOODS_2_265_WI_2 -4799,FOODS_2_265_WI_3 -4800,FOODS_2_266_CA_1 -4801,FOODS_2_266_CA_2 -4802,FOODS_2_266_CA_3 -4803,FOODS_2_266_CA_4 -4804,FOODS_2_266_TX_1 
-4805,FOODS_2_266_TX_2 -4806,FOODS_2_266_TX_3 -4807,FOODS_2_266_WI_1 -4808,FOODS_2_266_WI_2 -4809,FOODS_2_266_WI_3 -4810,FOODS_2_267_CA_1 -4811,FOODS_2_267_CA_2 -4812,FOODS_2_267_CA_3 -4813,FOODS_2_267_CA_4 -4814,FOODS_2_267_TX_1 -4815,FOODS_2_267_TX_2 -4816,FOODS_2_267_TX_3 -4817,FOODS_2_267_WI_1 -4818,FOODS_2_267_WI_2 -4819,FOODS_2_267_WI_3 -4820,FOODS_2_268_CA_1 -4821,FOODS_2_268_CA_2 -4822,FOODS_2_268_CA_3 -4823,FOODS_2_268_CA_4 -4824,FOODS_2_268_TX_1 -4825,FOODS_2_268_TX_2 -4826,FOODS_2_268_TX_3 -4827,FOODS_2_268_WI_1 -4828,FOODS_2_268_WI_2 -4829,FOODS_2_268_WI_3 -4830,FOODS_2_269_CA_1 -4831,FOODS_2_269_CA_2 -4832,FOODS_2_269_CA_3 -4833,FOODS_2_269_CA_4 -4834,FOODS_2_269_TX_1 -4835,FOODS_2_269_TX_2 -4836,FOODS_2_269_TX_3 -4837,FOODS_2_269_WI_1 -4838,FOODS_2_269_WI_2 -4839,FOODS_2_269_WI_3 -4840,FOODS_2_270_CA_1 -4841,FOODS_2_270_CA_2 -4842,FOODS_2_270_CA_3 -4843,FOODS_2_270_CA_4 -4844,FOODS_2_270_TX_1 -4845,FOODS_2_270_TX_2 -4846,FOODS_2_270_TX_3 -4847,FOODS_2_270_WI_1 -4848,FOODS_2_270_WI_2 -4849,FOODS_2_270_WI_3 -4850,FOODS_2_271_CA_1 -4851,FOODS_2_271_CA_2 -4852,FOODS_2_271_CA_3 -4853,FOODS_2_271_CA_4 -4854,FOODS_2_271_TX_1 -4855,FOODS_2_271_TX_2 -4856,FOODS_2_271_TX_3 -4857,FOODS_2_271_WI_1 -4858,FOODS_2_271_WI_2 -4859,FOODS_2_271_WI_3 -4860,FOODS_2_272_CA_1 -4861,FOODS_2_272_CA_2 -4862,FOODS_2_272_CA_3 -4863,FOODS_2_272_CA_4 -4864,FOODS_2_272_TX_1 -4865,FOODS_2_272_TX_2 -4866,FOODS_2_272_TX_3 -4867,FOODS_2_272_WI_1 -4868,FOODS_2_272_WI_2 -4869,FOODS_2_272_WI_3 -4870,FOODS_2_273_CA_1 -4871,FOODS_2_273_CA_2 -4872,FOODS_2_273_CA_3 -4873,FOODS_2_273_CA_4 -4874,FOODS_2_273_TX_1 -4875,FOODS_2_273_TX_2 -4876,FOODS_2_273_TX_3 -4877,FOODS_2_273_WI_1 -4878,FOODS_2_273_WI_2 -4879,FOODS_2_273_WI_3 -4880,FOODS_2_274_CA_1 -4881,FOODS_2_274_CA_2 -4882,FOODS_2_274_CA_3 -4883,FOODS_2_274_CA_4 -4884,FOODS_2_274_TX_1 -4885,FOODS_2_274_TX_2 -4886,FOODS_2_274_TX_3 -4887,FOODS_2_274_WI_1 -4888,FOODS_2_274_WI_2 -4889,FOODS_2_274_WI_3 -4890,FOODS_2_275_CA_1 
-4891,FOODS_2_275_CA_2 -4892,FOODS_2_275_CA_3 -4893,FOODS_2_275_CA_4 -4894,FOODS_2_275_TX_1 -4895,FOODS_2_275_TX_2 -4896,FOODS_2_275_TX_3 -4897,FOODS_2_275_WI_1 -4898,FOODS_2_275_WI_2 -4899,FOODS_2_275_WI_3 -4900,FOODS_2_276_CA_1 -4901,FOODS_2_276_CA_2 -4902,FOODS_2_276_CA_3 -4903,FOODS_2_276_CA_4 -4904,FOODS_2_276_TX_1 -4905,FOODS_2_276_TX_2 -4906,FOODS_2_276_TX_3 -4907,FOODS_2_276_WI_1 -4908,FOODS_2_276_WI_2 -4909,FOODS_2_276_WI_3 -4910,FOODS_2_277_CA_1 -4911,FOODS_2_277_CA_2 -4912,FOODS_2_277_CA_3 -4913,FOODS_2_277_CA_4 -4914,FOODS_2_277_TX_1 -4915,FOODS_2_277_TX_2 -4916,FOODS_2_277_TX_3 -4917,FOODS_2_277_WI_1 -4918,FOODS_2_277_WI_2 -4919,FOODS_2_277_WI_3 -4920,FOODS_2_278_CA_1 -4921,FOODS_2_278_CA_2 -4922,FOODS_2_278_CA_3 -4923,FOODS_2_278_CA_4 -4924,FOODS_2_278_TX_1 -4925,FOODS_2_278_TX_2 -4926,FOODS_2_278_TX_3 -4927,FOODS_2_278_WI_1 -4928,FOODS_2_278_WI_2 -4929,FOODS_2_278_WI_3 -4930,FOODS_2_279_CA_1 -4931,FOODS_2_279_CA_2 -4932,FOODS_2_279_CA_3 -4933,FOODS_2_279_CA_4 -4934,FOODS_2_279_TX_1 -4935,FOODS_2_279_TX_2 -4936,FOODS_2_279_TX_3 -4937,FOODS_2_279_WI_1 -4938,FOODS_2_279_WI_2 -4939,FOODS_2_279_WI_3 -4940,FOODS_2_280_CA_1 -4941,FOODS_2_280_CA_2 -4942,FOODS_2_280_CA_3 -4943,FOODS_2_280_CA_4 -4944,FOODS_2_280_TX_1 -4945,FOODS_2_280_TX_2 -4946,FOODS_2_280_TX_3 -4947,FOODS_2_280_WI_1 -4948,FOODS_2_280_WI_2 -4949,FOODS_2_280_WI_3 -4950,FOODS_2_281_CA_1 -4951,FOODS_2_281_CA_2 -4952,FOODS_2_281_CA_3 -4953,FOODS_2_281_CA_4 -4954,FOODS_2_281_TX_1 -4955,FOODS_2_281_TX_2 -4956,FOODS_2_281_TX_3 -4957,FOODS_2_281_WI_1 -4958,FOODS_2_281_WI_2 -4959,FOODS_2_281_WI_3 -4960,FOODS_2_282_CA_1 -4961,FOODS_2_282_CA_2 -4962,FOODS_2_282_CA_3 -4963,FOODS_2_282_CA_4 -4964,FOODS_2_282_TX_1 -4965,FOODS_2_282_TX_2 -4966,FOODS_2_282_TX_3 -4967,FOODS_2_282_WI_1 -4968,FOODS_2_282_WI_2 -4969,FOODS_2_282_WI_3 -4970,FOODS_2_283_CA_1 -4971,FOODS_2_283_CA_2 -4972,FOODS_2_283_CA_3 -4973,FOODS_2_283_CA_4 -4974,FOODS_2_283_TX_1 -4975,FOODS_2_283_TX_2 -4976,FOODS_2_283_TX_3 
-4977,FOODS_2_283_WI_1 -4978,FOODS_2_283_WI_2 -4979,FOODS_2_283_WI_3 -4980,FOODS_2_284_CA_1 -4981,FOODS_2_284_CA_2 -4982,FOODS_2_284_CA_3 -4983,FOODS_2_284_CA_4 -4984,FOODS_2_284_TX_1 -4985,FOODS_2_284_TX_2 -4986,FOODS_2_284_TX_3 -4987,FOODS_2_284_WI_1 -4988,FOODS_2_284_WI_2 -4989,FOODS_2_284_WI_3 -4990,FOODS_2_285_CA_1 -4991,FOODS_2_285_CA_2 -4992,FOODS_2_285_CA_3 -4993,FOODS_2_285_CA_4 -4994,FOODS_2_285_TX_1 -4995,FOODS_2_285_TX_2 -4996,FOODS_2_285_TX_3 -4997,FOODS_2_285_WI_1 -4998,FOODS_2_285_WI_2 -4999,FOODS_2_285_WI_3 diff --git a/doc/source/templates/02_many_model_training/many_model_training.ipynb b/doc/source/templates/02_many_model_training/many_model_training.ipynb index a041c8a69b6a..0645706d75cd 100644 --- a/doc/source/templates/02_many_model_training/many_model_training.ipynb +++ b/doc/source/templates/02_many_model_training/many_model_training.ipynb @@ -21,79 +21,96 @@ }, { "cell_type": "markdown", - "id": "c56bb4d0", + "id": "08e65f8d", "metadata": {}, "source": [ - "## Installing Dependencies\n", - "\n", - "First, we'll need to install necessary dependencies in the Anyscale Workspace. To do so, first open up a terminal, and follow one of the following install steps, depending on which size template you picked:\n" + "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "5b99c151", - "metadata": { - "tags": [ - "small" - ] - }, + "id": "c56bb4d0", + "metadata": {}, "source": [ - "### Install Dependencies (Small-scale Template)\n", + "## Handling Dependencies\n", "\n", - "The small-scale template only runs on a single node (the head node), so we just need to install the requirements *locally*." 
+ "This template requires certain Python packages to be available to every node in the cluster.\n", + "\n", + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "dcd6fc93", + "id": "0c9b3dec", "metadata": { - "tags": [ - "small" - ] + "tags": [] }, "outputs": [], "source": [ - "%pip install -r requirements.txt --upgrade\n" + "requirements_path = \"./requirements.txt\"\n" ] }, { + "cell_type": "code", + "execution_count": null, + "id": "92161434", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" + ] + }, + { + "attachments": {}, "cell_type": "markdown", - "id": "c5ee8e43", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "eff9369f", + "metadata": {}, "source": [ - "### Install Cluster-wide Dependencies (Large-scale Template)\n", + "First, we may want to use these modules right here in our script, which is running on the head node.\n", + "Install the Python packages on the head node using `pip install`.\n", "\n", - "When running in a distributed Ray Cluster, all nodes need to have access to the installed packages.\n", - "For this, we'll use `pip install --user` to install the necessary requirements.\n", - "On an [Anyscale Workspace](https://docs.anyscale.com/user-guide/develop-and-debug/workspaces),\n", - "this will install packages to a *shared filesystem* that will be available to all nodes in the cluster." 
+ "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "0c9b3dec", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "5cba940c", + "metadata": {}, "outputs": [], "source": [ - "%pip install --user -r requirements.txt --upgrade\n" + "%pip install -r {requirements_path} --upgrade" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "08e65f8d", + "id": "1dcaea58", "metadata": {}, "source": [ - "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" + "Next, we need to make sure all worker nodes also have access to the dependencies.\n", + "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", + "to dynamically set up dependencies throughout the cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e268225d", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" ] }, { @@ -107,68 +124,31 @@ "from pyarrow import parquet as pq\n", "from sklearn.metrics import mean_squared_error\n", "\n", - "import ray\n", "from ray import tune\n", - "from ray.air import session\n", - "\n", - "try:\n", - " from statsforecast import StatsForecast\n", - " from statsforecast.models import AutoARIMA, AutoETS\n", - "except ImportError as e:\n", - " raise RuntimeError(\"Did you follow the steps above to install dependencies?\") from e\n" + "from ray.air import session\n" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b8fc83d0", "metadata": {}, "source": [ - "> ✂️ Replace this value to change the number of data partitions you will use. This will be total the number of Tune trials you will run!\n", + "> ✂️ Replace this value to change the number of data partitions you will use (<= 5000 for this dataset). 
This will be the total number of Tune trials you will run!\n", ">\n", "> Note that this template fits two models per data partition and reports the best performing one." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "a40e91a5", - "metadata": { - "tags": [ - "small" - ] - }, - "outputs": [], - "source": [ - "# Default values for the small-scale template\n", - "NUM_DATA_PARTITIONS: int = 50\n" - ] - }, { "cell_type": "code", "execution_count": null, "id": "5390c232", "metadata": { - "tags": [ - "large" - ] + "tags": [] }, "outputs": [], "source": [ - "# Default values for the large-scale template\n", - "NUM_DATA_PARTITIONS: int = 1000\n" - ] - }, - { - "cell_type": "markdown", - "id": "c260d8f8", - "metadata": {}, - "source": [ - "```{tip}\n", - "If you're running the small-scale version of the template, try setting\n", - "the number of trials to the recommended number of trials for the large-scale version.\n", - "It'll be much slower, but you'll see the dramatic speedup once distributing the load\n", - "to a multi-node Ray cluster in the large-scale version!\n", - "```" + "NUM_DATA_PARTITIONS: int = 500\n" ] }, { @@ -201,7 +181,7 @@ " return df.dropna()\n", "\n", "\n", - "def evaluate_cross_validation(df, metric):\n", + "def evaluate_cross_validation(df: pd.DataFrame, metric) -> pd.DataFrame:\n", " models = df.drop(columns=[\"ds\", \"cutoff\", \"y\"]).columns.tolist()\n", " evals = []\n", " for model in models:\n", @@ -233,15 +213,18 @@ "metadata": {}, "outputs": [], "source": [ - "model_classes = [AutoARIMA, AutoETS]\n", - "n_windows = 1\n", - "\n", - "\n", "def train_fn(config: dict):\n", + " try:\n", + " from statsforecast import StatsForecast\n", + " from statsforecast.models import AutoARIMA, AutoETS\n", + " except ImportError as e:\n", + " raise RuntimeError(\"Did you set a runtime env to install dependencies?\") from e\n", + "\n", " data_partition_id = config[\"data_partition_id\"]\n", " train_df = get_m5_partition(data_partition_id)\n",
"\n", - " models = [model_cls() for model_cls in model_classes]\n", + " models = [AutoARIMA(), AutoETS()]\n", + " n_windows = 1\n", " forecast_horizon = 4\n", "\n", " sf = StatsForecast(\n", @@ -265,12 +248,11 @@ "\n", "\n", "trainable = train_fn\n", - "trainable = tune.with_resources(\n", - " trainable, resources={\"CPU\": len(model_classes) * n_windows}\n", - ")\n" + "trainable = tune.with_resources(trainable, resources={\"CPU\": 2 * 1})\n" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "301c7c58", "metadata": {}, @@ -280,6 +262,7 @@ "Feel free to change this to the resources required by your application! You can also comment out the `tune.with_resources` block to assign `1 CPU` (the default) to each trial.\n", "\n", "Note that this is purely for Tune to know how many trials to schedule concurrently -- setting the number of CPUs does not actually enforce any kind of resource isolation!\n", + "In this template, `statsforecast` runs cross validation in parallel with M models * N temporal cross-validation windows (e.g. 2 * 1).\n", "```\n", "\n", "See [Ray Tune's guide on assigning resources](https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html) for more information." @@ -302,7 +285,12 @@ "metadata": {}, "outputs": [], "source": [ - "data_partitions = list(pd.read_csv(\"item_ids.csv\")[\"item_id\"])\n", + "# Download the list of item ids used to partition the dataset.\n", + "data_partitions = list(\n", + " pd.read_csv(\n", + " \"https://air-example-data.s3.us-west-2.amazonaws.com/m5_benchmarks_item_ids.csv\"\n", + " )[\"item_id\"]\n", + ")\n", "if NUM_DATA_PARTITIONS > len(data_partitions):\n", " print(f\"There are only {len(data_partitions)} partitions!\")\n", "\n", @@ -331,11 +319,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ba1a07d0", "metadata": {}, "source": [ - "> ✂️ Replace the metric and mode below with the metric you reported in your training function." + "View the reported results of all trials as a dataframe." 
] }, { @@ -345,8 +334,8 @@ "metadata": {}, "outputs": [], "source": [ - "sample_result = result_grid[0]\n", - "sample_result.metrics\n" + "results_df = result_grid.get_dataframe()\n", + "results_df\n" ] } ], diff --git a/doc/source/templates/02_many_model_training/requirements.txt b/doc/source/templates/02_many_model_training/requirements.txt index f3abf6a44fdf..25eaf5428923 100644 --- a/doc/source/templates/02_many_model_training/requirements.txt +++ b/doc/source/templates/02_many_model_training/requirements.txt @@ -1 +1 @@ -statsforecast==1.5.0 \ No newline at end of file +statsforecast==1.5.0 diff --git a/doc/source/templates/03_serving_stable_diffusion/requirements.txt b/doc/source/templates/03_serving_stable_diffusion/requirements.txt index 30a36e09b6af..eac6df67b25e 100644 --- a/doc/source/templates/03_serving_stable_diffusion/requirements.txt +++ b/doc/source/templates/03_serving_stable_diffusion/requirements.txt @@ -1,9 +1,10 @@ accelerate==0.14.0 -diffusers @ git+https://github.com/huggingface/diffusers.git@25f11424f62d8d9bef8a721b806926399a1557f2 -numpy==1.23.4 +diffusers==0.15.1 +matplotlib>=3.5.3,<=3.7.1 +numpy>=1.21.6,<=1.23.5 Pillow==9.3.0 -scipy==1.9.3 -tensorboard==2.12.0 +scipy>=1.7.3,<=1.9.3 +tensorboard>=2.11.2,<=2.12.0 torch==1.13.0 torchvision==0.14.0 -transformers==4.24.0 \ No newline at end of file +transformers==4.28.1 diff --git a/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb b/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb index e1fb2fa66520..9c79e32010f1 100644 --- a/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb +++ b/doc/source/templates/03_serving_stable_diffusion/serving_stable_diffusion.ipynb @@ -17,74 +17,89 @@ }, { "cell_type": "markdown", - "id": "25364e8e", + "id": "2ea9629f", "metadata": {}, "source": [ - "## Installing Dependencies\n", + "## Handling Dependencies\n", "\n", - "First, we'll need to install necessary dependencies in 
the Anyscale Workspace. To do so, first open up a terminal, and follow one of the following install steps, depending on which size template you picked:" + "This template requires certain Python packages to be available to every node in the cluster.\n", + "\n", + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e43b49fc", + "metadata": {}, + "outputs": [], + "source": [ + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19504900", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "94ec23af", - "metadata": { - "tags": [ - "small" - ] - }, + "id": "33419c37", + "metadata": {}, "source": [ - "### Install Dependencies (Small-scale Template)\n", + "First, we may want to use these modules right here in our script, which is running on the head node.\n", + "Install the Python packages on the head node using `pip install`.\n", "\n", - "The small-scale template only runs on a single node (the head node), so we just need to install the requirements *locally*." 
+ "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "5cb0f0d0", - "metadata": { - "tags": [ - "small" - ] - }, + "id": "9aadf0c5", + "metadata": {}, "outputs": [], "source": [ - "%pip install -r requirements.txt --upgrade\n" + "%pip install -r {requirements_path} --upgrade" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "a45dcc56", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "4ba5feba", + "metadata": {}, "source": [ - "### Install Cluster-wide Dependencies (Large-scale Template)\n", - "\n", - "When running in a distributed Ray Cluster, all nodes need to have access to the installed packages.\n", - "For this, we'll use `pip install --user` to install the necessary requirements.\n", - "On an [Anyscale Workspace](https://docs.anyscale.com/user-guide/develop-and-debug/workspaces),\n", - "this will install packages to a *shared filesystem* that will be available to all nodes in the cluster." + "Next, we need to make sure all worker nodes also have access to the dependencies.\n", + "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", + "to dynamically set up dependencies throughout the cluster.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "4c248f05", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "ca638dbb", + "metadata": {}, "outputs": [], "source": [ - "%pip install --user -r requirements.txt --upgrade\n" + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "520ef4d7", "metadata": {}, @@ -94,19 +109,7 @@ "First, we define the Ray Serve application with the model loading and inference logic. 
This includes setting up:\n", "- The `/imagine` API endpoint that we query to generate the image.\n", "- The stable diffusion model loaded inside a Ray Serve Deployment.\n", - " We'll specify the *number of model replicas* to keep active in our Ray cluster. These model replicas can process incoming requests concurrently.\n", - "\n", - "" + " We'll specify the *number of model replicas* to keep active in our Ray cluster. These model replicas can process incoming requests concurrently.\n" ] }, { @@ -120,19 +123,14 @@ "from fastapi.responses import Response\n", "from io import BytesIO\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import os\n", "import requests\n", "import time\n", "import uuid\n", "\n", "import ray\n", - "from ray import serve\n", - "\n", - "try:\n", - " import torch\n", - " from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline\n", - "except ImportError as e:\n", - " raise RuntimeError(\"Did you follow the steps above to install dependencies?\") from e\n" + "from ray import serve\n" ] }, { @@ -148,42 +146,48 @@ { "cell_type": "code", "execution_count": null, - "id": "c1bea30b", + "id": "90eca147", "metadata": { - "tags": [ - "small" - ] + "tags": [] }, "outputs": [], "source": [ - "# Default values for the small-scale template\n", - "NUM_REPLICAS: int = 1\n", - "NUM_GPUS_PER_REPLICA: float = 1\n" + "NUM_REPLICAS: int = 4\n", + "NUM_GPUS_PER_REPLICA: float = 1\n", + "\n", + "# Control the output size: (IMAGE_SIZE, IMAGE_SIZE)\n", + "# NOTE: Generated image quality degrades rapidly if you reduce the size too much.\n", + "IMAGE_SIZE: int = 776\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "89eb3e2c", + "metadata": {}, + "source": [ + "First, we define the Ray Serve Deployment, which will load a stable diffusion model and perform inference with it.\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "90eca147", - "metadata": { - "tags": [ - "large" - ] - }, + "id": "76a02213", + 
"metadata": {}, "outputs": [], "source": [ - "# Default values for the large-scale template\n", - "NUM_REPLICAS: int = 4\n", - "NUM_GPUS_PER_REPLICA: float = 1\n" + "# Configure each model replica to use the specified resources.\n", + "ray_actor_options = {\n", + " \"num_gpus\": NUM_GPUS_PER_REPLICA,\n", + "}\n" ] }, { + "attachments": {}, "cell_type": "markdown", - "id": "89eb3e2c", + "id": "880b8593", "metadata": {}, "source": [ - "First, we'll define the Ray Serve Deployment, which will load and perform inference with a stable diffusion model.\n", - "\n", "> ✂️ Modify this block to load your own model, and change the `generate` method to perform your own online inference logic!" ] }, @@ -195,12 +199,20 @@ "outputs": [], "source": [ "@serve.deployment(\n", - " ray_actor_options={\"num_gpus\": NUM_GPUS_PER_REPLICA},\n", + " ray_actor_options=ray_actor_options,\n", " num_replicas=NUM_REPLICAS,\n", ")\n", "class StableDiffusionV2:\n", " def __init__(self):\n", " # \n", + " try:\n", + " import torch\n", + " from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline\n", + " except ImportError as e:\n", + " raise RuntimeError(\n", + " \"Did you set a runtime env to install dependencies?\"\n", + " ) from e\n", + "\n", " model_id = \"stabilityai/stable-diffusion-2\"\n", " scheduler = EulerDiscreteScheduler.from_pretrained(\n", " model_id, subfolder=\"scheduler\"\n", @@ -210,7 +222,7 @@ " )\n", " self.pipe = self.pipe.to(\"cuda\")\n", "\n", - " def generate(self, prompt: str, img_size: int = 512):\n", + " def generate(self, prompt: str, img_size: int = 776):\n", " # \n", " assert len(prompt), \"prompt parameter cannot be empty\"\n", " image = self.pipe(prompt, height=img_size, width=img_size).images[0]\n", @@ -248,7 +260,7 @@ " responses={200: {\"content\": {\"image/png\": {}}}},\n", " response_class=Response,\n", " )\n", - " async def generate(self, prompt: str, img_size: int = 512):\n", + " async def generate(self, prompt: str, img_size: int = 776):\n", " 
assert len(prompt), \"prompt parameter cannot be empty\"\n", "\n", " image = await (await self.handle.generate.remote(prompt, img_size=img_size))\n", @@ -372,7 +384,7 @@ "\n", "@ray.remote(num_cpus=1)\n", "def generate_image(prompt):\n", - " req = {\"prompt\": prompt, \"img_size\": 776}\n", + " req = {\"prompt\": prompt, \"img_size\": IMAGE_SIZE}\n", " resp = requests.get(endpoint, params=req)\n", " return resp.content\n", "\n", @@ -395,6 +407,7 @@ " \"`python server.py --num-replicas=...` in another terminal yet?\"\n", " ) from e\n", "\n", + " generation_times = []\n", " while True:\n", " prompt = (\n", " PROMPT\n", @@ -422,13 +435,15 @@ " filenames.append(filename)\n", "\n", " elapsed = time.time() - start\n", + " generation_times.append(elapsed)\n", " print(\n", " f\"\\nGenerated {len(images)} image(s) in {elapsed:.2f} seconds to \"\n", " f\"the directory: {dirname}\\n\"\n", " )\n", " show_images(filenames)\n", " if not INTERACTIVE:\n", - " break\n" + " break\n", + " return np.mean(generation_times) if generation_times else -1\n" ] }, { @@ -447,8 +462,7 @@ "metadata": {}, "outputs": [], "source": [ - "main()\n", - "serve.shutdown()\n" + "mean_generation_time = main()\n" ] }, { @@ -463,10 +477,15 @@ ] }, { - "cell_type": "markdown", - "id": "1c96ed20", + "cell_type": "code", + "execution_count": null, + "id": "9e360cf9", "metadata": {}, - "source": [] + "outputs": [], + "source": [ + "# Shut down the model replicas once you're done!\n", + "serve.shutdown()\n" + ] } ], "metadata": { diff --git a/doc/source/templates/README.md b/doc/source/templates/README.md index da8bab08a6de..12f90b465db8 100644 --- a/doc/source/templates/README.md +++ b/doc/source/templates/README.md @@ -15,80 +15,112 @@ Coming soon... To add a template: -1. Add your template as a directory somewhere in the Ray repo. - All files needed to run the template should be contained within this directory. +1. Add your template as a directory somewhere in `doc/source/templates`. 
+ For example: ```text ray/ doc/source/templates/ / - requirements.txt + README.md .ipynb ``` If your template requires any special dependencies that are not included in a - base `ray-ml` Docker image, be sure to specify a `requirements.txt` file within - the directory. - -2. Add an entry to `doc/source/templates/templates.yaml` that links to your template. + base `ray-ml` Docker image, be sure to list and install the necessary dependencies + within the notebook. See `03_serving_stable_diffusion` for an example. - ```yaml - - name: Many Model Training using Ray Tune - # Paths should be relative to the Ray repo root directory - path: doc/source/templates/02_many_model_training - cluster_env: doc/source/templates/configs/anyscale_cluster_env.yaml - small: - compute_config: - gcp: doc/source/templates/configs/compute/cpu/gcp_small.yaml - aws: doc/source/templates/configs/compute/cpu/aws_small.yaml - large: - compute_config: - # Relative to `path` - gcp: doc/source/templates/configs/compute/cpu/gcp_large.yaml - aws: doc/source/templates/configs/compute/cpu/aws_large.yaml + ```{note} + The template should be self-contained and not require any external files. + This requirement is to simplify the testing procedure. ``` - Make sure that you include a small/large version for the template. - See the following table for a description of template size: +2. Add another copy of the template that includes test-specific code and a smoke-test version if applicable. + + **Note:** The need for a second test copy is temporary. Only one notebook will be needed + from 2.5 onward, since the test-specific code will be filtered out. + + **Label all test-specific code with the `remove-cell` Jupyter notebook tag.** + + **Put this test copy in `doc/source/templates/tests/.ipynb`.** + +3. List the smoke-test version of the template in `doc/BUILD` under the templates section. This will configure the smoke-test version to run in pre-merge CI. 
+ + Set the `SMOKE_TEST` environment variable, which should be used in your template + **to make the template work for a single CI instance.** + This environment variable can also be used to conditionally set certain smoke test parameters (like limiting dataset size). + + **Make sure that you tag the test with `"gpu"` if required, and any other tags + needed for special dependencies.** - | Attributes | Small-scale Version | Large-scale Version | - | -- | -- | -- | - | Number of Nodes | 1 | > 1 | - | Dataset size | Subset (of partitions/labels/rows) | Full example dataset | - | Model size | Pruned/mini version of the model | Full model | - | Runtime | 30-60s | Up to ~5-10 minutes | + ```python + py_test_run_all_notebooks( + size = "large", + include = ["source/templates/tests/batch_inference.ipynb"], + exclude = [], + data = ["//doc:workspace_templates"], + tags = ["exclusive", "team:ml", "ray_air", "gpu"], + env = {"SMOKE_TEST": "1"}, + ) + ``` - When you specify the template's compute config, see `doc/source/templates/configs` for defaults. +4. Add a release test for the template in `release/release_tests.yaml` (for both AWS and GCE). -3. Add a nightly release test for the template in `release/release_tests.yaml`. + **Use the `release_test_cluster_env.yaml` and `*_release_test.yaml` files for cluster env / compute configs.** + These contain placeholders for regions and cloud ids that our CI infra will fill in.
```yaml - name: workspace_template_small_02_many_model_training group: Workspace templates - working_dir: workspace_templates/02_many_model_training + working_dir: workspace_templates/tests python: "3.9" - frequency: nightly + frequency: nightly-3x team: ml cluster: cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/cpu/aws_small.yaml + cluster_compute: ../configs/compute/cpu/aws_release_test.yaml + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ../configs/release_test_cluster_env.yaml + cluster_compute: ../configs/compute/cpu/gce_release_test.yaml run: timeout: 300 - script: pip install -U -r requirements.txt - && jupyter nbconvert --TagRemovePreprocessor.remove_input_tags='large' - --to script --output _test many_model_training.ipynb && ipython _test.py + script: jupyter nbconvert --to script --output _test many_model_training.ipynb && ipython _test.py ``` - Note: `--TagRemovePreprocessor.remove_input_tags='large'` will make sure that only the small-scale - version of the template gets tested nightly. +5. Add an entry to `doc/source/templates/templates.yaml` that links to your template. + + ```yaml + many-model-training-ray-tune: + title: Many Model Training + description: Scaling Many Model Training with Ray Tune + path: doc/source/templates/02_many_model_training + cluster_env: doc/source/templates/configs/anyscale_cluster_env.yaml + compute_config: + GCP: doc/source/templates/configs/compute/cpu/gce.yaml + AWS: doc/source/templates/configs/compute/cpu/aws.yaml + ``` + + **In this example, `many-model-training-ray-tune` is the template ID, which should be unique.** + + **Use the `anyscale_cluster_env.yaml`, `gce.yaml`, and `aws.yaml` files, NOT the release test counterparts.** + + When you specify the template's compute config, see `doc/source/templates/configs` for shared configs. + +6. 
Run a validation script on `templates.yaml` to make sure that the paths you specified are all valid and all yamls are properly formatted. -4. Run a validation script on `templates.yaml` to make sure that the paths you specified are all valid. + **Note:** This will also run in CI, but you can check quickly by running the validation script. ```bash $ python doc/source/templates/validate.py Success! ``` -5. Success! Your template is ready for review. +7. Success! Your template is ready for review. diff --git a/doc/source/templates/configs/anyscale_cluster_env.yaml b/doc/source/templates/configs/anyscale_cluster_env.yaml index 68ea5af0a616..2b8c1e458e95 100644 --- a/doc/source/templates/configs/anyscale_cluster_env.yaml +++ b/doc/source/templates/configs/anyscale_cluster_env.yaml @@ -3,4 +3,4 @@ # docker_image: anyscale/ray-ml:latest-py39 # ray_version: nightly # or define a build_id for existing images, e.g. for "anyscaleray-ml231-py39-gpu" -build_id: "anyscaleray-mlnightly-py39-gpu" +build_id: "anyscaleray-ml240-py39-gpu" diff --git a/doc/source/templates/configs/compute/cpu/aws.yaml b/doc/source/templates/configs/compute/cpu/aws.yaml new file mode 100644 index 000000000000..a8b33d551afb --- /dev/null +++ b/doc/source/templates/configs/compute/cpu/aws.yaml @@ -0,0 +1,11 @@ +# 8 m5.2xlarge nodes --> 64 CPUs +head_node_type: + name: head_node_type + instance_type: m5.2xlarge + +worker_node_types: +- name: cpu_worker + instance_type: m5.2xlarge + min_workers: 7 + max_workers: 7 + use_spot: false diff --git a/doc/source/templates/configs/compute/cpu/aws_large.yaml b/doc/source/templates/configs/compute/cpu/aws_release_test.yaml similarity index 84% rename from doc/source/templates/configs/compute/cpu/aws_large.yaml rename to doc/source/templates/configs/compute/cpu/aws_release_test.yaml index c3260c7100fd..28b9115d2755 100644 --- a/doc/source/templates/configs/compute/cpu/aws_large.yaml +++ b/doc/source/templates/configs/compute/cpu/aws_release_test.yaml @@ -1,3 +1,6 @@ 
+cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} +region: us-west-2 + # 8 m5.2xlarge nodes --> 64 CPUs head_node_type: name: head_node_type diff --git a/doc/source/templates/configs/compute/cpu/aws_small.yaml b/doc/source/templates/configs/compute/cpu/aws_small.yaml deleted file mode 100644 index 16e628057308..000000000000 --- a/doc/source/templates/configs/compute/cpu/aws_small.yaml +++ /dev/null @@ -1,21 +0,0 @@ -cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} -region: us-west-2 - -# 1 m5.2xlarge node --> 8 CPUs -head_node_type: - name: head_node_type - instance_type: m5.2xlarge - -worker_node_types: -- name: cpu_worker - instance_type: m5.2xlarge - min_workers: 0 - max_workers: 0 - use_spot: false - -aws: - TagSpecifications: - - ResourceType: "instance" - Tags: - - Key: ttl-hours - Value: '24' \ No newline at end of file diff --git a/doc/source/templates/configs/compute/cpu/gcp_large.yaml b/doc/source/templates/configs/compute/cpu/gce.yaml similarity index 100% rename from doc/source/templates/configs/compute/cpu/gcp_large.yaml rename to doc/source/templates/configs/compute/cpu/gce.yaml diff --git a/doc/source/templates/configs/compute/cpu/gcp_small.yaml b/doc/source/templates/configs/compute/cpu/gce_release_test.yaml similarity index 58% rename from doc/source/templates/configs/compute/cpu/gcp_small.yaml rename to doc/source/templates/configs/compute/cpu/gce_release_test.yaml index 4ea97f92161b..918b5ccd349f 100644 --- a/doc/source/templates/configs/compute/cpu/gcp_small.yaml +++ b/doc/source/templates/configs/compute/cpu/gce_release_test.yaml @@ -1,14 +1,17 @@ cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} + region: us-west1 +allowed_azs: + - us-west1-b -# 1 n1-standard-8 node --> 8 CPUs +# 8 n2-standard-8 nodes --> 64 CPUs head_node_type: name: head_node_type - instance_type: n1-standard-8 + instance_type: n2-standard-8 worker_node_types: - name: cpu_worker instance_type: n2-standard-8 - min_workers: 0 - max_workers: 0 + min_workers: 7 + max_workers: 7 use_spot: false diff 
--git a/doc/source/templates/configs/compute/gpu/aws.yaml b/doc/source/templates/configs/compute/gpu/aws.yaml new file mode 100644 index 000000000000..101c8b7bbc4b --- /dev/null +++ b/doc/source/templates/configs/compute/gpu/aws.yaml @@ -0,0 +1,11 @@ +# 4 g4dn.4xlarge nodes --> 64 CPUs, 4 GPUs +head_node_type: + name: head_node_type + instance_type: g4dn.4xlarge + +worker_node_types: +- name: gpu_worker + instance_type: g4dn.4xlarge + min_workers: 3 + max_workers: 3 + use_spot: false diff --git a/doc/source/templates/configs/compute/gpu/aws_large.yaml b/doc/source/templates/configs/compute/gpu/aws_release_test.yaml similarity index 85% rename from doc/source/templates/configs/compute/gpu/aws_large.yaml rename to doc/source/templates/configs/compute/gpu/aws_release_test.yaml index 4d72cd8e5e24..501677653dcb 100644 --- a/doc/source/templates/configs/compute/gpu/aws_large.yaml +++ b/doc/source/templates/configs/compute/gpu/aws_release_test.yaml @@ -1,3 +1,6 @@ +cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} +region: us-west-2 + # 4 g4dn.4xlarge nodes --> 64 CPUs, 4 GPUs head_node_type: name: head_node_type diff --git a/doc/source/templates/configs/compute/gpu/aws_small.yaml b/doc/source/templates/configs/compute/gpu/aws_small.yaml deleted file mode 100644 index 237d186b749b..000000000000 --- a/doc/source/templates/configs/compute/gpu/aws_small.yaml +++ /dev/null @@ -1,21 +0,0 @@ -cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} -region: us-west-2 - -# 1 g4dn.4xlarge node --> 16 CPUs, 1 GPU -head_node_type: - name: head_node_type - instance_type: g4dn.4xlarge - -worker_node_types: -- name: gpu_worker - instance_type: g4dn.4xlarge - min_workers: 0 - max_workers: 0 - use_spot: false - -aws: - TagSpecifications: - - ResourceType: "instance" - Tags: - - Key: ttl-hours - Value: '24' \ No newline at end of file diff --git a/doc/source/templates/configs/compute/gpu/gcp_large.yaml b/doc/source/templates/configs/compute/gpu/gce.yaml similarity index 100% rename from 
doc/source/templates/configs/compute/gpu/gcp_large.yaml rename to doc/source/templates/configs/compute/gpu/gce.yaml diff --git a/doc/source/templates/configs/compute/gpu/gcp_small.yaml b/doc/source/templates/configs/compute/gpu/gce_release_test.yaml similarity index 66% rename from doc/source/templates/configs/compute/gpu/gcp_small.yaml rename to doc/source/templates/configs/compute/gpu/gce_release_test.yaml index 9fddd463b62b..146d633d57b3 100644 --- a/doc/source/templates/configs/compute/gpu/gcp_small.yaml +++ b/doc/source/templates/configs/compute/gpu/gce_release_test.yaml @@ -1,7 +1,10 @@ cloud_id: {{ env["ANYSCALE_CLOUD_ID"] }} + region: us-west1 +allowed_azs: + - us-west1-b -# 1 n1-standard-16-nvidia-tesla-t4-1 node --> 16 CPUs, 1 GPU +# 4 n1-standard-16-nvidia-tesla-t4-1 nodes --> 64 CPUs, 4 GPUs head_node_type: name: head_node_type instance_type: n1-standard-16-nvidia-tesla-t4-1 @@ -9,6 +12,6 @@ head_node_type: worker_node_types: - name: gpu_worker instance_type: n1-standard-16-nvidia-tesla-t4-1 - min_workers: 0 - max_workers: 0 + min_workers: 3 + max_workers: 3 use_spot: false diff --git a/doc/source/templates/templates.yaml b/doc/source/templates/templates.yaml index 71e39dba785d..f9a7429ed494 100644 --- a/doc/source/templates/templates.yaml +++ b/doc/source/templates/templates.yaml @@ -5,21 +5,21 @@ batch-inference-ray-data: path: doc/source/templates/01_batch_inference cluster_env: doc/source/templates/configs/anyscale_cluster_env.yaml compute_config: - GCP: doc/source/templates/configs/compute/gpu/gcp_large.yaml - AWS: doc/source/templates/configs/compute/gpu/aws_large.yaml + GCP: doc/source/templates/configs/compute/gpu/gce.yaml + AWS: doc/source/templates/configs/compute/gpu/aws.yaml many-model-training-ray-tune: title: Many Model Training description: Scaling Many Model Training with Ray Tune path: doc/source/templates/02_many_model_training cluster_env: doc/source/templates/configs/anyscale_cluster_env.yaml compute_config: - GCP: 
doc/source/templates/configs/compute/cpu/gcp_large.yaml - AWS: doc/source/templates/configs/compute/cpu/aws_large.yaml + GCP: doc/source/templates/configs/compute/cpu/gce.yaml + AWS: doc/source/templates/configs/compute/cpu/aws.yaml serve-stable-diffusion-model-ray-serve: title: Serving Stable Diffusion description: Serving a Stable Diffusion Model with Ray Serve path: doc/source/templates/03_serving_stable_diffusion cluster_env: doc/source/templates/configs/anyscale_cluster_env.yaml compute_config: - GCP: doc/source/templates/configs/compute/cpu/gcp_large.yaml - AWS: doc/source/templates/configs/compute/cpu/aws_large.yaml + GCP: doc/source/templates/configs/compute/gpu/gce.yaml + AWS: doc/source/templates/configs/compute/gpu/aws.yaml \ No newline at end of file diff --git a/doc/source/templates/tests/01_batch_inference/batch_inference.ipynb b/doc/source/templates/tests/01_batch_inference/batch_inference.ipynb new file mode 100644 index 000000000000..0dcfa9cbc4a9 --- /dev/null +++ b/doc/source/templates/tests/01_batch_inference/batch_inference.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "cfababd6", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# ==== Code for testing purposes to exclude in user-facing template. ====\n", + "\n", + "import os\n", + "import time\n", + "\n", + "SMOKE_TEST = True if os.environ.get(\"SMOKE_TEST\", \"0\") == \"1\" else False\n", + "\n", + "start_time = time.monotonic()\n" + ] + }, + { + "cell_type": "markdown", + "id": "02ff59ce", + "metadata": {}, + "source": [ + "# Scaling Batch Inference with Ray Data\n", + "\n", + "This template is a quickstart to using [Ray Data](https://docs.ray.io/en/latest/data/dataset.html) for batch inference. Ray Data is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). 
See [this blog post](https://www.anyscale.com/blog/model-batch-inference-in-ray-actors-actorpool-and-datasets) for more information on why and how you should perform batch inference with Ray!\n", + "\n", + "This template walks through GPU batch prediction on an image dataset using a PyTorch model, but the framework and data format are there just to help you build your own application!\n", + "\n", + "At a high level, this template will:\n", + "1. [Load your dataset using Ray Data.](https://docs.ray.io/en/latest/data/creating-datasets.html)\n", + "2. [Preprocess your dataset before feeding it to your model.](https://docs.ray.io/en/latest/data/transforming-datasets.html)\n", + "3. [Initialize your model and perform inference on a shard of your dataset with a remote actor.](https://docs.ray.io/en/latest/data/transforming-datasets.html#writing-user-defined-functions-udfs)\n", + "4. [Save your prediction results.](https://docs.ray.io/en/latest/data/api/input_output.html)\n", + "\n", + "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "065e7765", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "import tempfile\n", + "from typing import Dict\n", + "\n", + "import ray\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c99f142a", + "metadata": {}, + "source": [ + ">✂️ Play around with these values!\n", + ">\n", + ">For example, for a cluster with 4 GPU nodes, you may want 4 workers, each using 1 GPU.\n", + ">Be sure to stay within the resource constraints of your Ray Cluster if autoscaling is not enabled.\n", + ">You can check the available resources in your Ray Cluster with: `ray status`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d49681f-baf0-4ed8-9740-5c4e38744311", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "NUM_WORKERS: int = 4\n", + "NUM_GPUS_PER_WORKER: float = 1\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20e9e07c", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if SMOKE_TEST:\n", + " NUM_WORKERS = 4\n", + " NUM_GPUS_PER_WORKER = 0.25\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "770bbdc7", + "metadata": {}, + "outputs": [], + "source": [ + "!ray status" + ] + }, + { + "cell_type": "markdown", + "id": "23321ba8", + "metadata": {}, + "source": [ + "```{tip}\n", + "Try setting `NUM_GPUS_PER_WORKER` to a fractional amount! 
This will leverage Ray's fractional resource allocation, which means you can schedule multiple batch inference workers to happen on the same GPU.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "245f37c9", + "metadata": {}, + "source": [ + "> ✂️ Replace this function with logic to load your own data with Ray Data.\n", + ">\n", + "> See [the Ray Data guide on creating datasets](https://docs.ray.io/en/latest/data/creating-datasets.html) to learn how to create a dataset based on the data type and how file storage format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "615f4a78", + "metadata": {}, + "outputs": [], + "source": [ + "def load_ray_dataset():\n", + " from ray.data.datasource.partitioning import Partitioning\n", + "\n", + " s3_uri = \"s3://anonymous@air-example-data-2/imagenette2/val/\"\n", + " partitioning = Partitioning(\"dir\", field_names=[\"class\"], base_dir=s3_uri)\n", + " ds = ray.data.read_images(\n", + " s3_uri, size=(256, 256), partitioning=partitioning, mode=\"RGB\"\n", + " )\n", + " return ds\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "966bcfdc", + "metadata": {}, + "outputs": [], + "source": [ + "ds = load_ray_dataset()\n", + "ds.schema()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "965db5e8", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if SMOKE_TEST:\n", + " ds = ds.limit(12)\n" + ] + }, + { + "cell_type": "markdown", + "id": "39d01e3c", + "metadata": {}, + "source": [ + "> ✂️ Replace this function with your own data preprocessing logic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652121bd", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", + " from torchvision import transforms\n", + "\n", + " def to_tensor(batch: np.ndarray) -> torch.Tensor:\n", + " tensor = torch.as_tensor(batch, dtype=torch.float)\n", + " # (B, H, W, C) -> (B, C, H, W)\n", + " tensor = tensor.permute(0, 3, 1, 2).contiguous()\n", + " # [0., 255.] -> [0., 1.]\n", + " tensor = tensor.div(255)\n", + " return tensor\n", + "\n", + " transform = transforms.Compose(\n", + " [\n", + " transforms.Lambda(to_tensor),\n", + " transforms.CenterCrop(224),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + " ]\n", + " )\n", + " return {\"image\": transform(batch[\"image\"]).numpy()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c35f5a17", + "metadata": {}, + "outputs": [], + "source": [ + "ds = ds.map_batches(preprocess, batch_format=\"numpy\")\n", + "\n", + "print(\"Dataset schema:\\n\", ds.schema())\n", + "print(\"Number of images:\", ds.count())\n" + ] + }, + { + "cell_type": "markdown", + "id": "ad059e54", + "metadata": {}, + "source": [ + "> ✂️ Replace parts of this Callable class with your own model initialization and inference logic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42cac828", + "metadata": {}, + "outputs": [], + "source": [ + "class PredictCallable:\n", + " def __init__(self):\n", + " # \n", + " from torchvision import models\n", + "\n", + " self.model = models.resnet152(pretrained=True)\n", + " self.model.eval()\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " self.model.to(self.device)\n", + "\n", + " def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n", + " # \n", + " input_data = torch.as_tensor(batch[\"image\"], device=self.device)\n", + " with torch.inference_mode():\n", + " pred = self.model(input_data)\n", + " return {\"predicted_class_index\": pred.argmax(dim=1).detach().cpu().numpy()}\n" + ] + }, + { + "cell_type": "markdown", + "id": "fda0c298", + "metadata": {}, + "source": [ + "Now, perform batch prediction using Ray Data! Ray Data will perform model inference using `NUM_WORKERS` copies of the `PredictCallable` class you defined." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "331e21e4", + "metadata": {}, + "outputs": [], + "source": [ + "predictions = ds.map_batches(\n", + " PredictCallable,\n", + " batch_size=128,\n", + " compute=ray.data.ActorPoolStrategy(\n", + " # Fix the number of batch inference workers to `NUM_WORKERS`.\n", + " min_size=NUM_WORKERS,\n", + " max_size=NUM_WORKERS,\n", + " ),\n", + " num_gpus=NUM_GPUS_PER_WORKER,\n", + " batch_format=\"numpy\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23e77ada", + "metadata": {}, + "outputs": [], + "source": [ + "preds = predictions.materialize()\n", + "preds.schema()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d606556", + "metadata": {}, + "outputs": [], + "source": [ + "preds.take(5)\n" + ] + }, + { + "cell_type": "markdown", + "id": "ceddd984", + "metadata": {}, + "source": [ + "```{tip}\n", + "Play around with the `min_size` and `max_size` parameters to enable autoscaling!\n", + "For example, try commenting out `max_size`: this will autoscale up to an infinite number of workers, if you have free resources in the cluster.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "90ec67e8", + "metadata": {}, + "source": [ + "Shard the predictions into a few partitions, and save each partition to a file!\n", + "\n", + "```{note}\n", + "This currently saves to the local filesystem under `/tmp/predictions`, but you could also save to a cloud bucket (e.g., `s3://predictions-bucket`).\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1887e34", + "metadata": {}, + "outputs": [], + "source": [ + "num_shards = 3\n", + "\n", + "with tempfile.TemporaryDirectory() as temp_dir:\n", + " predictions.repartition(num_shards).write_parquet(f\"local://{temp_dir}\")\n", + " print(f\"Predictions saved to `{temp_dir}`!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e88a268", + "metadata": 
{ + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "release_test_out = os.environ.get(\"TEST_OUTPUT_JSON\", \"/tmp/release_test_out.json\")\n", + "\n", + "elapsed = time.monotonic() - start_time\n", + "with open(release_test_out, \"wt\") as f:\n", + " json.dump({\"total_runtime\": elapsed}, f)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/templates/tests/02_many_model_training/many_model_training.ipynb b/doc/source/templates/tests/02_many_model_training/many_model_training.ipynb new file mode 100644 index 000000000000..dbc4aa42bfcd --- /dev/null +++ b/doc/source/templates/tests/02_many_model_training/many_model_training.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "46369bd2", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# ==== Code for testing purposes to exclude in user-facing template. ====\n", + "\n", + "import os\n", + "\n", + "SMOKE_TEST = True if os.environ.get(\"SMOKE_TEST\", \"0\") == \"1\" else False\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "98e0d4f3", + "metadata": {}, + "source": [ + "# Scaling Many Model Training with Ray Tune\n", + "\n", + "This template is a quickstart to using [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) for training many models in parallel. 
Ray Tune is one of many libraries in the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html). See [this blog post](https://www.anyscale.com/blog/training-one-million-machine-learning-models-in-record-time-with-ray) for more information on the benefits of performing many model training with Ray!\n", + "\n", + "This template walks through time-series forecasting using `statsforecast`, but the framework and data format can be swapped out easily -- they are there just to help you build your own application!\n", + "\n", + "At a high level, this template will:\n", + "\n", + "1. [Define the training function for a single partition of data.](https://docs.ray.io/en/latest/tune/tutorials/tune-run.html)\n", + "2. [Define a Tune search space to run training over many partitions of data.](https://docs.ray.io/en/latest/tune/tutorials/tune-search-spaces.html)\n", + "3. [Extract the best model per dataset partition from the Tune experiment output.](https://docs.ray.io/en/latest/tune/examples/tune_analyze_results.html)" + ] + }, + { + "cell_type": "markdown", + "id": "08e65f8d", + "metadata": {}, + "source": [ + "> Slot in your code below wherever you see the ✂️ icon to build a many model training Ray application off of this template!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "182f65ea", + "metadata": {}, + "source": [ + "## Handling Dependencies\n", + "\n", + "This template requires certain Python packages to be available to every node in the cluster.\n", + "\n", + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "511f1722", + "metadata": {}, + "outputs": [], + "source": [ + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9a44498", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if not os.path.exists(requirements_path):\n", + " # CWD is at the ray root in CI\n", + " requirements_path = \"doc/source/templates/tests/02_many_model_training/requirements.txt\"\n", + " assert os.path.exists(requirements_path), (requirements_path, os.getcwd())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd9da7f", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" + ] + }, + { + "cell_type": "markdown", + "id": "90a96c5b", + "metadata": {}, + "source": [ + "First, we may want to use these modules right here in our script, which is running on the head node.\n", + "Install the Python packages on the head node using `pip install`.\n", + "\n", + "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18069827", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r {requirements_path} --upgrade" + ] + }, + { + "cell_type": "markdown", + "id": "3e17a4da", + "metadata": {}, + "source": [ + "Next, we need to make sure all worker nodes also have access to the 
dependencies.\n", + "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", + "to dynamically set up dependencies throughout the cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e268225d", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "389adc20", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pyarrow import parquet as pq\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "from ray import tune\n", + "from ray.air import session\n" + ] + }, + { + "cell_type": "markdown", + "id": "b8fc83d0", + "metadata": {}, + "source": [ + "> ✂️ Replace this value to change the number of data partitions you will use. This will be total the number of Tune trials you will run!\n", + ">\n", + "> Note that this template fits two models per data partition and reports the best performing one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5390c232", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "NUM_DATA_PARTITIONS: int = 500\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6fb7a2d", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if SMOKE_TEST:\n", + " NUM_DATA_PARTITIONS: int = 10\n", + "\n", + "import time\n", + "\n", + "start_time = time.monotonic()\n" + ] + }, + { + "cell_type": "markdown", + "id": "8b2f3d16", + "metadata": {}, + "source": [ + "> ✂️ Replace the following with your own data-loading and evaluation helper functions. 
(Or, just delete these!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68b14061", + "metadata": {}, + "outputs": [], + "source": [ + "def get_m5_partition(unique_id: str) -> pd.DataFrame:\n", + " df = (\n", + " pq.read_table(\n", + " \"s3://anonymous@m5-benchmarks/data/train/target.parquet\",\n", + " columns=[\"item_id\", \"timestamp\", \"demand\"],\n", + " filters=[(\"item_id\", \"=\", unique_id)],\n", + " )\n", + " .to_pandas()\n", + " .rename(columns={\"item_id\": \"unique_id\", \"timestamp\": \"ds\", \"demand\": \"y\"})\n", + " )\n", + " df[\"unique_id\"] = df[\"unique_id\"].astype(str)\n", + " df[\"ds\"] = pd.to_datetime(df[\"ds\"])\n", + " return df.dropna()\n", + "\n", + "\n", + "def evaluate_cross_validation(df: pd.DataFrame, metric) -> pd.DataFrame:\n", + " models = df.drop(columns=[\"ds\", \"cutoff\", \"y\"]).columns.tolist()\n", + " evals = []\n", + " for model in models:\n", + " eval_ = (\n", + " df.groupby([\"unique_id\", \"cutoff\"])\n", + " .apply(lambda x: metric(x[\"y\"].values, x[model].values))\n", + " .to_frame()\n", + " )\n", + " eval_.columns = [model]\n", + " evals.append(eval_)\n", + " evals = pd.concat(evals, axis=1)\n", + " evals = evals.groupby([\"unique_id\"]).mean(numeric_only=True)\n", + " evals[\"best_model\"] = evals.idxmin(axis=1)\n", + " return evals\n" + ] + }, + { + "cell_type": "markdown", + "id": "060ee3ce", + "metadata": {}, + "source": [ + "> ✂️ Replace this with your own training logic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faaa0dad", + "metadata": {}, + "outputs": [], + "source": [ + "def train_fn(config: dict):\n", + " try:\n", + " from statsforecast import StatsForecast\n", + " from statsforecast.models import AutoARIMA, AutoETS\n", + " except ImportError as e:\n", + " raise RuntimeError(\"Did you set a runtime env to install dependencies?\") from e\n", + "\n", + " data_partition_id = config[\"data_partition_id\"]\n", + " train_df = get_m5_partition(data_partition_id)\n", + "\n", + " models = [AutoARIMA(), AutoETS()]\n", + " n_windows = 1\n", + " forecast_horizon = 4\n", + "\n", + " sf = StatsForecast(\n", + " df=train_df,\n", + " models=models,\n", + " freq=\"D\",\n", + " n_jobs=n_windows * len(models),\n", + " )\n", + " cv_df = sf.cross_validation(\n", + " h=forecast_horizon,\n", + " step_size=forecast_horizon,\n", + " n_windows=n_windows,\n", + " )\n", + "\n", + " eval_df = evaluate_cross_validation(df=cv_df, metric=mean_squared_error)\n", + " best_model = eval_df[\"best_model\"][data_partition_id]\n", + " forecast_mse = eval_df[best_model][data_partition_id]\n", + "\n", + " # Report the best-performing model and its corresponding eval metric.\n", + " session.report({\"forecast_mse\": forecast_mse, \"best_model\": best_model})\n", + "\n", + "\n", + "trainable = train_fn\n", + "trainable = tune.with_resources(trainable, resources={\"CPU\": 2 * 1})\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "301c7c58", + "metadata": {}, + "source": [ + "```{note}\n", + "`tune.with_resources` is used at the end to specify the number of resources to assign *each trial*.\n", + "Feel free to change this to the resources required by your application! 
You can also comment out the `tune.with_resources` block to assign `1 CPU` (the default) to each trial.\n", + "\n", + "Note that this is purely for Tune to know how many trials to schedule concurrently -- setting the number of CPUs does not actually enforce any kind of resource isolation!\n", + "In this template, `statsforecast` runs cross validation in parallel with M models * N temporal cross-validation windows (e.g. 2 * 1).\n", + "```\n", + "\n", + "See [Ray Tune's guide on assigning resources](https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html) for more information." + ] + }, + { + "cell_type": "markdown", + "id": "89741e7a", + "metadata": {}, + "source": [ + "> ✂️ Replace this with your desired hyperparameter search space!\n", + ">\n", + "> For example, this template searches over the data partition ID to train a model on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e9f2825", + "metadata": {}, + "outputs": [], + "source": [ + "# Download the list of item ids used to partition the dataset.\n", + "data_partitions = list(\n", + " pd.read_csv(\n", + " \"https://air-example-data.s3.us-west-2.amazonaws.com/m5_benchmarks_item_ids.csv\"\n", + " )[\"item_id\"]\n", + ")\n", + "if NUM_DATA_PARTITIONS > len(data_partitions):\n", + " print(f\"There are only {len(data_partitions)} partitions!\")\n", + "\n", + "param_space = {\n", + " \"data_partition_id\": tune.grid_search(data_partitions[:NUM_DATA_PARTITIONS]),\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "13b4dd3e", + "metadata": {}, + "source": [ + "Run many model training using Ray Tune!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1ef8245", + "metadata": {}, + "outputs": [], + "source": [ + "tuner = tune.Tuner(trainable, param_space=param_space)\n", + "result_grid = tuner.fit()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ba1a07d0", + "metadata": {}, + "source": [ + "View the reported results of all trials as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7baa29a", + "metadata": {}, + "outputs": [], + "source": [ + "results_df = result_grid.get_dataframe()\n", + "results_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a66e5cc", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "release_test_out = os.environ.get(\"TEST_OUTPUT_JSON\", \"/tmp/release_test_out.json\")\n", + "\n", + "elapsed = time.monotonic() - start_time\n", + "with open(release_test_out, \"wt\") as f:\n", + " json.dump({\"total_time\": elapsed}, f)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/templates/tests/02_many_model_training/requirements.txt b/doc/source/templates/tests/02_many_model_training/requirements.txt new file mode 120000 index 000000000000..2b363f05fc09 --- /dev/null +++ b/doc/source/templates/tests/02_many_model_training/requirements.txt @@ -0,0 +1 @@ +../../02_many_model_training/requirements.txt \ No newline at end of file diff --git 
a/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt b/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt new file mode 120000 index 000000000000..bb4db21916ff --- /dev/null +++ b/doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt @@ -0,0 +1 @@ +../../03_serving_stable_diffusion/requirements.txt \ No newline at end of file diff --git a/doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb b/doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb new file mode 100644 index 000000000000..68f8a0989ffd --- /dev/null +++ b/doc/source/templates/tests/03_serving_stable_diffusion/serving_stable_diffusion.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d3939eef", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# ==== Code for testing purposes to exclude in user-facing template. ====\n", + "\n", + "import os\n", + "\n", + "SMOKE_TEST = True if os.environ.get(\"SMOKE_TEST\", \"0\") == \"1\" else False\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "597c13c0", + "metadata": {}, + "source": [ + "# Serving a Stable Diffusion Model with Ray Serve\n", + "\n", + "This guide is a quickstart to use [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) for model serving. Ray Serve is one of many libraries under the [Ray AI Runtime](https://docs.ray.io/en/latest/ray-air/getting-started.html).\n", + "\n", + "This template loads a pretrained stable diffusion model from HuggingFace and serves it to a local endpoint as a Ray Serve deployment. \n", + "\n", + "> Slot in your code below wherever you see the ✂️ icon to build a model serving Ray application off of this template!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "3c8c02eb", + "metadata": {}, + "source": [ + "## Handling Dependencies\n", + "\n", + "This template requires certain Python packages to be available to every node in the cluster.\n", + "\n", + "> ✂️ Add your own package dependencies in the `requirements.txt` file!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814d966b", + "metadata": {}, + "outputs": [], + "source": [ + "requirements_path = \"./requirements.txt\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dec4a7bb", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if not os.path.exists(requirements_path):\n", + " # CWD is at the ray root in CI\n", + " requirements_path = \"doc/source/templates/tests/03_serving_stable_diffusion/requirements.txt\"\n", + " assert os.path.exists(requirements_path), (requirements_path, os.getcwd())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0d78e94", + "metadata": {}, + "outputs": [], + "source": [ + "with open(requirements_path, \"r\") as f:\n", + " requirements = f.read().strip().splitlines()\n", + "\n", + "print(\"Requirements:\")\n", + "print(\"\\n\".join(requirements))\n" + ] + }, + { + "cell_type": "markdown", + "id": "6b73761e", + "metadata": {}, + "source": [ + "First, we may want to use these modules right here in our script, which is running on the head node.\n", + "Install the Python packages on the head node using `pip install`.\n", + "\n", + "```{note}\n", + "You may need to restart this notebook kernel to access the installed packages.\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f6eaf2b", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r {requirements_path} --upgrade" + ] + }, + { + "cell_type": "markdown", + "id": "4b14415f", + "metadata": {}, + "source": [ + "Next, we need to make sure all worker nodes also have access to the 
dependencies.\n", + "For this, use a [Ray Runtime Environment](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments)\n", + "to dynamically set up dependencies throughout the cluster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8b21822", + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "\n", + "ray.init(runtime_env={\"pip\": requirements})\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "520ef4d7", + "metadata": {}, + "source": [ + "## Deploy the Ray Serve application locally\n", + "\n", + "First, we define the Ray Serve application with the model loading and inference logic. This includes setting up:\n", + "- The `/imagine` API endpoint that we query to generate the image.\n", + "- The stable diffusion model loaded inside a Ray Serve Deployment.\n", + " We'll specify the *number of model replicas* to keep active in our Ray cluster. These model replicas can process incoming requests concurrently.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72ee2132", + "metadata": {}, + "outputs": [], + "source": [ + "from fastapi import FastAPI\n", + "from fastapi.responses import Response\n", + "from io import BytesIO\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import os\n", + "import requests\n", + "import time\n", + "import uuid\n", + "\n", + "import ray\n", + "from ray import serve\n" + ] + }, + { + "cell_type": "markdown", + "id": "de6318ac", + "metadata": {}, + "source": [ + "> ✂️ Replace these values to change the number of model replicas to serve, as well as the GPU resources required by each replica.\n", + ">\n", + "> With more model replicas, more images can be generated in parallel!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90eca147", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "NUM_REPLICAS: int = 4\n", + "NUM_GPUS_PER_REPLICA: float = 1\n", + "\n", + "# Control the output size: (IMAGE_SIZE, IMAGE_SIZE)\n", + "# NOTE: Generated image quality degrades rapidly if you reduce size too much.\n", + "IMAGE_SIZE: int = 776\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40a719f6", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "if SMOKE_TEST:\n", + " NUM_REPLICAS: int = 1\n", + " NUM_GPUS_PER_REPLICA: float = 1\n", + " IMAGE_SIZE: int = 256\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "89eb3e2c", + "metadata": {}, + "source": [ + "First, we define the Ray Serve Deployment, which will load a stable diffusion model and perform inference with it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76a02213", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure each model replica to use the specified resources.\n", + "ray_actor_options = {\n", + " \"num_gpus\": NUM_GPUS_PER_REPLICA,\n", + "}\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "880b8593", + "metadata": {}, + "source": [ + "> ✂️ Modify this block to load your own model, and change the `generate` method to perform your own online inference logic!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f203efd4", + "metadata": {}, + "outputs": [], + "source": [ + "@serve.deployment(\n", + " ray_actor_options=ray_actor_options,\n", + " num_replicas=NUM_REPLICAS,\n", + ")\n", + "class StableDiffusionV2:\n", + " def __init__(self):\n", + " # \n", + " try:\n", + " import torch\n", + " from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline\n", + " except ImportError as e:\n", + " raise RuntimeError(\n", + " \"Did you set a runtime env to install dependencies?\"\n", + " ) from e\n", + "\n", + " model_id = \"stabilityai/stable-diffusion-2\"\n", + " scheduler = EulerDiscreteScheduler.from_pretrained(\n", + " model_id, subfolder=\"scheduler\"\n", + " )\n", + " self.pipe = StableDiffusionPipeline.from_pretrained(\n", + " model_id, scheduler=scheduler, revision=\"fp16\", torch_dtype=torch.float16\n", + " )\n", + " self.pipe = self.pipe.to(\"cuda\")\n", + "\n", + " def generate(self, prompt: str, img_size: int = 776):\n", + " # \n", + " assert len(prompt), \"prompt parameter cannot be empty\"\n", + " image = self.pipe(prompt, height=img_size, width=img_size).images[0]\n", + " return image\n" + ] + }, + { + "cell_type": "markdown", + "id": "0134aa54", + "metadata": {}, + "source": [ + "Next, we'll define the actual API endpoint to live at `/imagine`.\n", + "\n", + "> ✂️ Modify this block to change the endpoint URL, response schema, and add any post-processing logic needed from your model output!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f80fee2", + "metadata": {}, + "outputs": [], + "source": [ + "app = FastAPI()\n", + "\n", + "\n", + "@serve.deployment(num_replicas=1, route_prefix=\"/\")\n", + "@serve.ingress(app)\n", + "class APIIngress:\n", + " def __init__(self, diffusion_model_handle) -> None:\n", + " self.handle = diffusion_model_handle\n", + "\n", + " @app.get(\n", + " \"/imagine\",\n", + " responses={200: {\"content\": {\"image/png\": {}}}},\n", + " response_class=Response,\n", + " )\n", + " async def generate(self, prompt: str, img_size: int = 776):\n", + " assert len(prompt), \"prompt parameter cannot be empty\"\n", + "\n", + " image = await (await self.handle.generate.remote(prompt, img_size=img_size))\n", + "\n", + " file_stream = BytesIO()\n", + " image.save(file_stream, \"PNG\")\n", + " return Response(content=file_stream.getvalue(), media_type=\"image/png\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "61b8916d", + "metadata": {}, + "source": [ + "Now, we deploy the Ray Serve application locally at `http://localhost:8000`!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfc2e244", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "entrypoint = APIIngress.bind(StableDiffusionV2.bind())\n", + "port = 8000\n", + "\n", + "# Shutdown any existing Serve replicas, if they're still around.\n", + "serve.shutdown()\n", + "serve.run(entrypoint, port=port, name=\"serving_stable_diffusion_template\")\n", + "print(\"Done setting up replicas! Now accepting requests...\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "757678cc", + "metadata": {}, + "source": [ + "## Make requests to the endpoint\n", + "\n", + "Next, we'll build a simple client to submit prompts as HTTP requests to the local endpoint at `http://localhost:8000/imagine`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3e29193b", + "metadata": {}, + "source": [ + "> ✂️ Replace this value to change the number of images to generate per prompt.\n", + ">\n", + "> Each image will be generated starting from a different set of random noise,\n", + "> so you'll be able to see multiple options per prompt!\n", + ">\n", + "> Try starting with `NUM_IMAGES_PER_PROMPT` equal to `NUM_REPLICAS` from earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6aac28e1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "NUM_IMAGES_PER_PROMPT: int = NUM_REPLICAS\n" + ] + }, + { + "cell_type": "markdown", + "id": "6b466230", + "metadata": {}, + "source": [ + "> ✂️ You can choose to run this interactively, or submit a single `PROMPT`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd20a52d", + "metadata": {}, + "outputs": [], + "source": [ + "INTERACTIVE: bool = False\n", + "PROMPT = \"twin peaks sf in basquiat painting style\"\n" + ] + }, + { + "cell_type": "markdown", + "id": "008976b5", + "metadata": {}, + "source": [ + "Start the client script in the next few cells, and generate your first image! 
For example:\n", + "\n", + "If running interactively, this will look like:\n", + "\n", + "```\n", + "Enter a prompt (or 'q' to quit): twin peaks sf in basquiat painting style\n", + "\n", + "Generating image(s)...\n", + "(Take a look at the terminal serving the endpoint for more logs!)\n", + "\n", + "\n", + "Generated 1 image(s) in 69.89 seconds to the directory: 58b298d9\n", + "```\n", + "\n", + "![Example output](https://user-images.githubusercontent.com/3887863/221063452-3c5e5f6b-fc8c-410f-ad5c-202441cceb51.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ad095b", + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = f\"http://localhost:{port}/imagine\"\n", + "\n", + "\n", + "@ray.remote(num_cpus=1)\n", + "def generate_image(prompt):\n", + " req = {\"prompt\": prompt, \"img_size\": IMAGE_SIZE}\n", + " resp = requests.get(endpoint, params=req)\n", + " return resp.content\n", + "\n", + "\n", + "def show_images(filenames):\n", + " fig, axs = plt.subplots(1, len(filenames), figsize=(4 * len(filenames), 4))\n", + " for i, filename in enumerate(filenames):\n", + " ax = axs if len(filenames) == 1 else axs[i]\n", + " ax.imshow(plt.imread(filename))\n", + " ax.axis(\"off\")\n", + " plt.show()\n", + "\n", + "\n", + "def main() -> float:\n", + " try:\n", + " requests.get(endpoint, timeout=0.1)\n", + " except Exception as e:\n", + " raise RuntimeWarning(\n", + " \"Did you setup the Ray Serve model replicas with \"\n", + " \"`python server.py --num-replicas=...` in another terminal yet?\"\n", + " ) from e\n", + "\n", + " generation_times = []\n", + " while True:\n", + " prompt = (\n", + " PROMPT\n", + " if not INTERACTIVE\n", + " else input(f\"\\nEnter a prompt (or 'q' to quit): \")\n", + " )\n", + " if prompt.lower() == \"q\":\n", + " break\n", + "\n", + " print(\"\\nGenerating image(s)...\\n\")\n", + " start = time.time()\n", + "\n", + " # Make `NUM_IMAGES_PER_PROMPT` requests to the endpoint at once!\n", + " images = ray.get(\n", + " 
[generate_image.remote(prompt) for _ in range(NUM_IMAGES_PER_PROMPT)]\n", + " )\n", + "\n", + " dirname = f\"{uuid.uuid4().hex[:8]}\"\n", + " os.makedirs(dirname)\n", + " filenames = []\n", + " for i, image in enumerate(images):\n", + " filename = os.path.join(dirname, f\"{i}.png\")\n", + " with open(filename, \"wb\") as f:\n", + " f.write(image)\n", + " filenames.append(filename)\n", + "\n", + " elapsed = time.time() - start\n", + " generation_times.append(elapsed)\n", + " print(\n", + " f\"\\nGenerated {len(images)} image(s) in {elapsed:.2f} seconds to \"\n", + " f\"the directory: {dirname}\\n\"\n", + " )\n", + " show_images(filenames)\n", + " if not INTERACTIVE:\n", + " break\n", + " return np.mean(generation_times) if generation_times else -1\n" + ] + }, + { + "cell_type": "markdown", + "id": "c8949cc7", + "metadata": {}, + "source": [ + "Once the stable diffusion model finishes generating your image, it will be included in the HTTP response body.\n", + "The client writes this to an image in your Workspace directory for you to view. It'll also show up in the notebook cell!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71be51fa", + "metadata": {}, + "outputs": [], + "source": [ + "mean_generation_time = main()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fb124968", + "metadata": {}, + "source": [ + "You've successfully served a stable diffusion model!\n", + "You can modify this template and iterate your model deployment directly on your cluster within your Anyscale Workspace,\n", + "testing with the local endpoint." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3660120b", + "metadata": {}, + "outputs": [], + "source": [ + "# Shut down the model replicas once you're done!\n", + "serve.shutdown()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49894fe3", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "release_test_out = os.environ.get(\"TEST_OUTPUT_JSON\", \"/tmp/release_test_out.json\")\n", + "\n", + "with open(release_test_out, \"wt\") as f:\n", + " json.dump({\"mean_generation_time\": mean_generation_time}, f)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ray_dev_py38", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/source/templates/validate.py b/doc/source/templates/validate.py index 04c2b07b10a9..28032516cc0c 100644 --- a/doc/source/templates/validate.py +++ b/doc/source/templates/validate.py @@ -2,50 +2,153 @@ from pathlib import Path import yaml -# ray/doc/source/examples/ -> ray -ray_root_path = (Path(__file__).parent / ".." / ".." / "..").resolve() -templates_catalog_path = Path(__file__).parent / "templates.yaml" -with open(templates_catalog_path, "r") as f: - templates = yaml.safe_load(f) +def get_root_path() -> Path: + """ + If we're running from a Ray repo, and just use the + current file to get the doc directory. 
+ ray/doc/source/examples/ -> ray/ -invalid = collections.defaultdict(list) + For CI, the current file location is: + `/doc/source/examples/validate.py` + We can get the "ray root dir" in the same way: + /doc/source/examples -> / + """ + root_path = Path(__file__).parent / ".." / ".." / ".." + return root_path.resolve() -required_fields = {"name", "path", "cluster_env", "small", "large"} -for i, template in enumerate(templates): - name = template.get("name", i) - missing_fields = set(template) - required_fields - assert not missing_fields, f"Missing fields for {name}: {missing_fields}" +def validate_templates_yaml_schema(templates) -> dict: + all_missing_fields = {} + required_fields = {"title", "description", "path", "cluster_env", "compute_config"} - rel_path = template["path"] - if not (ray_root_path / rel_path).exists(): - invalid[name].append(rel_path) + for template_name, template_config in templates.items(): + # ======= Schema check for templates.yaml ======== + missing_fields = required_fields - set(template_config) + if missing_fields: + all_missing_fields[template_name] = missing_fields + continue - rel_path = template["cluster_env"] - if not (ray_root_path / rel_path).exists(): - invalid[name].append(rel_path) + return all_missing_fields - required_per_size = {"compute_config"} - sizes = ["small", "large"] - for size in sizes: - configs = template[size] - missing = set(configs) - required_per_size - assert not missing, f"Missing fields for {name} ({size}): {missing_fields}" +def validate_template_paths(templates, invalid_paths) -> None: + root_path = get_root_path() - rel_paths = list(configs["compute_config"].values()) + for template_name, template_config in templates.items(): + if "path" not in template_config: + continue + + # The yaml specifies relative paths to the ray root directory + rel_path = template_config["path"] + if not (root_path / rel_path).exists(): + invalid_paths[template_name].append(rel_path) + + +def validate_cluster_envs(templates, 
invalid_paths, invalid_yamls) -> None: + root_path = get_root_path() + + for template_name, template_config in templates.items(): + if "cluster_env" not in template_config: + continue + + rel_path = template_config["cluster_env"] + cluster_env_path = root_path / rel_path + if not cluster_env_path.exists(): + invalid_paths[template_name].append(rel_path) + else: + try: + # Assert that the yaml file is properly formatted. + with open(cluster_env_path, "r") as f: + yaml.safe_load(f) + except yaml.parser.ParserError as e: + invalid_yamls[template_name].append(str(e)) + + +def validate_compute_configs(templates, invalid_paths, invalid_yamls) -> dict: + root_path = get_root_path() + required_cloud_providers = {"AWS", "GCP"} + + all_missing_providers = {} + + for template_name, template_config in templates.items(): + if "compute_config" not in template_config: + continue + + compute_config_per_provider = template_config["compute_config"] + + missing_providers = required_cloud_providers - set(compute_config_per_provider) + if missing_providers: + all_missing_providers[template_name] = missing_providers + continue + + rel_paths = list(compute_config_per_provider.values()) for rel_path in rel_paths: - if not (ray_root_path / rel_path).exists(): - invalid[name].append(rel_path) - -if invalid: - print("VALIDATION FAILED!! Please fix the paths listed below:\n\n") - - for name, invalid_paths in invalid.items(): - print("Template Name:", name) - for path in invalid_paths: - print("-", path) - print() -else: - print("Success!") + compute_config_path = root_path / rel_path + if not compute_config_path.exists(): + invalid_paths[template_name].append(rel_path) + else: + try: + # Assert that the yaml file is properly formatted. 
+ with open(compute_config_path, "r") as f: + yaml.safe_load(f) + except yaml.parser.ParserError as e: + invalid_yamls[template_name].append(str(e)) + + return all_missing_providers + + +if __name__ == "__main__": + root_path = get_root_path() + templates_catalog_path = root_path / "doc/source/templates/templates.yaml" + + with open(templates_catalog_path, "r") as f: + templates = yaml.safe_load(f) + + invalid_paths = collections.defaultdict(list) + invalid_yamls = collections.defaultdict(list) + + all_missing_fields = validate_templates_yaml_schema(templates) + validate_template_paths(templates, invalid_paths) + validate_cluster_envs(templates, invalid_paths, invalid_yamls) + all_missing_providers = validate_compute_configs( + templates, invalid_paths, invalid_yamls + ) + + # ======= Print an informative error message. ======== + if any([all_missing_fields, all_missing_providers, invalid_paths, invalid_yamls]): + msg = "TEMPLATES VALIDATION FAILED!! Please fix the issues listed below:\n\n" + + if all_missing_fields: + msg += "Please supply missing fields in `templates.yaml`:\n" + for template_name, missing_fields in all_missing_fields.items(): + msg += f"- {template_name}: {missing_fields}\n" + + if all_missing_providers: + msg += ( + "\nPlease supply paths to compute configs for these cloud providers " + "in `templates.yaml`:\n" + ) + for template_name, missing_providers in all_missing_providers.items(): + msg += f"- {template_name}: {missing_providers}\n" + + if invalid_paths: + msg += "\nPlease fix invalid paths in `templates.yaml`:\n" + for template_name, invalid_paths_for_template in invalid_paths.items(): + msg += f"- {template_name}:\n" + msg += "\n".join([f"\t- {path}" for path in invalid_paths_for_template]) + msg += "\n" + + if invalid_yamls: + msg += "\nPlease fix invalid configuration yamls:\n" + for template_name, invalid_yamls_per_template in invalid_yamls.items(): + msg += f"- {template_name}:\n\n" + msg += "\n\n".join( + f"{i + 1}. 
{invalid_yaml}" + for i, invalid_yaml in enumerate(invalid_yamls_per_template) + ) + msg += "\n\n" + + raise ValueError(msg) + else: + print("Success!") diff --git a/doc/source/train/api/api.rst b/doc/source/train/api/api.rst index 2230170fc993..36bb7a31ca2c 100644 --- a/doc/source/train/api/api.rst +++ b/doc/source/train/api/api.rst @@ -150,15 +150,25 @@ LightGBM ~train.lightgbm.LightGBMCheckpoint -HuggingFace -~~~~~~~~~~~ +Hugging Face +~~~~~~~~~~~~ + +Transformers +************ + +.. autosummary:: + :toctree: doc/ + + ~train.hf_transformers.TransformersTrainer + ~train.hf_transformers.TransformersCheckpoint + +Accelerate +********** .. autosummary:: :toctree: doc/ - ~train.huggingface.HuggingFaceTrainer - ~train.huggingface.HuggingFaceCheckpoint - ~train.huggingface.accelerate.AccelerateTrainer + ~train.hf_accelerate.AccelerateTrainer Scikit-Learn ~~~~~~~~~~~~ @@ -219,7 +229,7 @@ Restoration API for Built-in Trainers .. autosummary:: - train.huggingface.HuggingFaceTrainer.restore + train.hf_transformers.TransformersTrainer.restore .. note:: @@ -232,4 +242,4 @@ Restoration API for Built-in Trainers .. seealso:: - See :ref:`train-restore-faq` for more details on when and how trainer restore should be used. + See :ref:`train-restore-guide` for more details on when and how trainer restore should be used. diff --git a/doc/source/train/config_guide.rst b/doc/source/train/config_guide.rst index b2a010024808..e419949ace34 100644 --- a/doc/source/train/config_guide.rst +++ b/doc/source/train/config_guide.rst @@ -7,36 +7,51 @@ The following overviews how to configure scale-out, run options, and fault-toler For more details on how to configure data ingest, also refer to :ref:`air-ingest`. Scaling Configurations in Train (``ScalingConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------------------------------- The scaling configuration specifies distributed training properties like the number of workers or the resources per worker. 
The properties of the scaling configuration are :ref:`tunable `. -:class:`ScalingConfig API reference ` - .. literalinclude:: doc_code/key_concepts.py :language: python :start-after: __scaling_config_start__ :end-before: __scaling_config_end__ +.. seealso:: + + See the :class:`~ray.air.ScalingConfig` API reference. + +.. _train-run-config: Run Configuration in Train (``RunConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------------ -The run configuration specifies distributed training properties like the number of workers or the -resources per worker. +``RunConfig`` is a configuration object used in Ray Train to define the experiment +spec that corresponds to a call to ``trainer.fit()``. -The properties of the run configuration are :ref:`not tunable `. +It includes settings such as the experiment name, storage path for results, +stopping conditions, custom callbacks, checkpoint configuration, verbosity level, +and logging options. + +Many of these settings are configured through other config objects and passed through +the ``RunConfig``. The following sub-sections contain descriptions of these configs. -:class:`RunConfig API reference ` +The properties of the run configuration are :ref:`not tunable `. .. literalinclude:: doc_code/key_concepts.py :language: python :start-after: __run_config_start__ :end-before: __run_config_end__ +.. seealso:: + + See the :class:`~ray.air.RunConfig` API reference. + + See :ref:`tune-storage-options` for storage configuration examples (related to ``storage_path``). + + Failure configurations in Train (``FailureConfig``) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -45,30 +60,15 @@ The failure configuration specifies how training failures should be dealt with. As part of the RunConfig, the properties of the failure configuration are :ref:`not tunable `. -:class:`FailureConfig API reference ` .. 
literalinclude:: doc_code/key_concepts.py :language: python :start-after: __failure_config_start__ :end-before: __failure_config_end__ -.. _train-config-sync: - -Sync configurations in Train (``SyncConfig``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. seealso:: -The sync configuration specifies how to synchronize checkpoints between the -Ray cluster and remote storage. - -As part of the RunConfig, the properties of the sync configuration -are :ref:`not tunable `. - -:class:`SyncConfig API reference ` - -.. literalinclude:: doc_code/key_concepts.py - :language: python - :start-after: __sync_config_start__ - :end-before: __sync_config_end__ + See the :class:`~ray.air.FailureConfig` API reference. Checkpoint configurations in Train (``CheckpointConfig``) @@ -80,10 +80,46 @@ and how many checkpoints to keep. As part of the RunConfig, the properties of the checkpoint configuration are :ref:`not tunable `. -:class:`CheckpointConfig API reference ` - .. literalinclude:: doc_code/key_concepts.py :language: python :start-after: __checkpoint_config_start__ :end-before: __checkpoint_config_end__ +Trainers of certain frameworks including :class:`~ray.train.xgboost.XGBoostTrainer`, +:class:`~ray.train.lightgbm.LightGBMTrainer`, and :class:`~ray.train.hf_transformers.TransformersTrainer` +implement checkpointing out of the box. For these trainers, checkpointing can be +enabled by setting the checkpoint frequency within the :class:`~ray.air.CheckpointConfig`. + +.. literalinclude:: doc_code/key_concepts.py + :language: python + :start-after: __checkpoint_config_ckpt_freq_start__ + :end-before: __checkpoint_config_ckpt_freq_end__ + +.. warning:: + + ``checkpoint_frequency`` and other parameters do *not* work for trainers + that accept a custom training loop such as :class:`~ray.train.torch.TorchTrainer`, + since checkpointing is fully user-controlled. + +.. seealso:: + + See the :class:`~ray.air.CheckpointConfig` API reference. 
+ + +Synchronization configurations in Train (``tune.SyncConfig``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``tune.SyncConfig`` specifies how synchronization of results +and checkpoints should happen in a distributed Ray cluster. + +As part of the RunConfig, the properties of the sync configuration +are :ref:`not tunable `. + +.. note:: + + This configuration is mostly relevant to running multiple Train runs with + Ray Tune. See :ref:`tune-storage-options` for a guide on using the ``SyncConfig``. + +.. seealso:: + + See the :class:`~ray.tune.syncer.SyncConfig` API reference. diff --git a/doc/source/train/dl_guide.rst b/doc/source/train/dl_guide.rst index d9af4f66ae79..79d9d4ebdcd2 100644 --- a/doc/source/train/dl_guide.rst +++ b/doc/source/train/dl_guide.rst @@ -51,128 +51,130 @@ Updating your training function First, you'll want to update your training function to support distributed training. -.. tabbed:: PyTorch +.. tab-set:: - Ray Train will set up your distributed process group for you and also provides utility methods - to automatically prepare your model and data for distributed training. + .. tab-item:: PyTorch - .. note:: - Ray Train will still work even if you don't use the :func:`ray.train.torch.prepare_model` - and :func:`ray.train.torch.prepare_data_loader` utilities below, - and instead handle the logic directly inside your training function. + Ray Train will set up your distributed process group for you and also provides utility methods + to automatically prepare your model and data for distributed training. - First, use the :func:`~ray.train.torch.prepare_model` function to automatically move your model to the right device and wrap it in - ``DistributedDataParallel``: + .. note:: + Ray Train will still work even if you don't use the :func:`ray.train.torch.prepare_model` + and :func:`ray.train.torch.prepare_data_loader` utilities below, + and instead handle the logic directly inside your training function. - ..
code-block:: diff + First, use the :func:`~ray.train.torch.prepare_model` function to automatically move your model to the right device and wrap it in + ``DistributedDataParallel``: - import torch - from torch.nn.parallel import DistributedDataParallel - +from ray.air import session - +from ray import train - +import ray.train.torch + .. code-block:: diff + import torch + from torch.nn.parallel import DistributedDataParallel + +from ray.air import session + +from ray import train + +import ray.train.torch - def train_func(): - - device = torch.device(f"cuda:{session.get_local_rank()}" if - - torch.cuda.is_available() else "cpu") - - torch.cuda.set_device(device) - # Create model. - model = NeuralNetwork() + def train_func(): + - device = torch.device(f"cuda:{session.get_local_rank()}" if + - torch.cuda.is_available() else "cpu") + - torch.cuda.set_device(device) - - model = model.to(device) - - model = DistributedDataParallel(model, - - device_ids=[session.get_local_rank()] if torch.cuda.is_available() else None) + # Create model. + model = NeuralNetwork() - + model = train.torch.prepare_model(model) + - model = model.to(device) + - model = DistributedDataParallel(model, + - device_ids=[session.get_local_rank()] if torch.cuda.is_available() else None) - ... - + + model = train.torch.prepare_model(model) + ... - Then, use the ``prepare_data_loader`` function to automatically add a ``DistributedSampler`` to your ``DataLoader`` - and move the batches to the right device. This step is not necessary if you are passing in Ray Datasets to your Trainer - (see :ref:`train-datasets`): - .. code-block:: diff - import torch - from torch.utils.data import DataLoader, DistributedSampler - +from ray.air import session - +from ray import train - +import ray.train.torch + Then, use the ``prepare_data_loader`` function to automatically add a ``DistributedSampler`` to your ``DataLoader`` + and move the batches to the right device. 
This step is not necessary if you are passing in Ray Data to your Trainer + (see :ref:`train-datasets`): + .. code-block:: diff - def train_func(): - - device = torch.device(f"cuda:{session.get_local_rank()}" if - - torch.cuda.is_available() else "cpu") - - torch.cuda.set_device(device) + import torch + from torch.utils.data import DataLoader, DistributedSampler + +from ray.air import session + +from ray import train + +import ray.train.torch - ... - - data_loader = DataLoader(my_dataset, batch_size=worker_batch_size, sampler=DistributedSampler(dataset)) + def train_func(): + - device = torch.device(f"cuda:{session.get_local_rank()}" if + - torch.cuda.is_available() else "cpu") + - torch.cuda.set_device(device) - + data_loader = DataLoader(my_dataset, batch_size=worker_batch_size) - + data_loader = train.torch.prepare_data_loader(data_loader) + ... - for X, y in data_loader: - - X = X.to_device(device) - - y = y.to_device(device) + - data_loader = DataLoader(my_dataset, batch_size=worker_batch_size, sampler=DistributedSampler(dataset)) - .. tip:: - Keep in mind that ``DataLoader`` takes in a ``batch_size`` which is the batch size for each worker. - The global batch size can be calculated from the worker batch size (and vice-versa) with the following equation: + + data_loader = DataLoader(my_dataset, batch_size=worker_batch_size) + + data_loader = train.torch.prepare_data_loader(data_loader) - .. code-block:: python - - global_batch_size = worker_batch_size * session.get_world_size() + for X, y in data_loader: + - X = X.to_device(device) + - y = y.to_device(device) + + .. tip:: + Keep in mind that ``DataLoader`` takes in a ``batch_size`` which is the batch size for each worker. + The global batch size can be calculated from the worker batch size (and vice-versa) with the following equation: + + .. code-block:: python -.. tabbed:: TensorFlow + global_batch_size = worker_batch_size * session.get_world_size() - .. 
note:: - The current TensorFlow implementation supports - ``MultiWorkerMirroredStrategy`` (and ``MirroredStrategy``). If there are - other strategies you wish to see supported by Ray Train, please let us know - by submitting a `feature request on GitHub `_. + .. tab-item:: TensorFlow - These instructions closely follow TensorFlow's `Multi-worker training - with Keras `_ - tutorial. One key difference is that Ray Train will handle the environment - variable set up for you. + .. note:: + The current TensorFlow implementation supports + ``MultiWorkerMirroredStrategy`` (and ``MirroredStrategy``). If there are + other strategies you wish to see supported by Ray Train, please let us know + by submitting a `feature request on GitHub `_. - **Step 1:** Wrap your model in ``MultiWorkerMirroredStrategy``. + These instructions closely follow TensorFlow's `Multi-worker training + with Keras `_ + tutorial. One key difference is that Ray Train will handle the environment + variable set up for you. - The `MultiWorkerMirroredStrategy `_ - enables synchronous distributed training. The ``Model`` *must* be built and - compiled within the scope of the strategy. + **Step 1:** Wrap your model in ``MultiWorkerMirroredStrategy``. - .. code-block:: python + The `MultiWorkerMirroredStrategy `_ + enables synchronous distributed training. The ``Model`` *must* be built and + compiled within the scope of the strategy. + + .. code-block:: python - with tf.distribute.MultiWorkerMirroredStrategy().scope(): - model = ... # build model - model.compile() + with tf.distribute.MultiWorkerMirroredStrategy().scope(): + model = ... # build model + model.compile() - **Step 2:** Update your ``Dataset`` batch size to the *global* batch - size. + **Step 2:** Update your ``Dataset`` batch size to the *global* batch + size. - The `batch `_ - will be split evenly across worker processes, so ``batch_size`` should be - set appropriately. 
+ The `batch `_ + will be split evenly across worker processes, so ``batch_size`` should be + set appropriately. - .. code-block:: diff + .. code-block:: diff - -batch_size = worker_batch_size - +batch_size = worker_batch_size * session.get_world_size() + -batch_size = worker_batch_size + +batch_size = worker_batch_size * session.get_world_size() -.. tabbed:: Horovod + .. tab-item:: Horovod - If you have a training function that already runs with the `Horovod Ray - Executor `_, - you should not need to make any additional changes! + If you have a training function that already runs with the `Horovod Ray + Executor `_, + you should not need to make any additional changes! - To onboard onto Horovod, please visit the `Horovod guide - `_. + To onboard onto Horovod, please visit the `Horovod guide + `_. Creating a Ray Train Trainer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -181,96 +183,100 @@ Creating a Ray Train Trainer execute training. You can create a simple ``Trainer`` for the backend of choice with one of the following: -.. tabbed:: PyTorch +.. tab-set:: - .. code-block:: python + .. tab-item:: PyTorch - from ray.air import ScalingConfig - from ray.train.torch import TorchTrainer - # For GPU Training, set `use_gpu` to True. - use_gpu = False - trainer = TorchTrainer( - train_func, - scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) - ) + .. code-block:: python + + from ray.air import ScalingConfig + from ray.train.torch import TorchTrainer + # For GPU Training, set `use_gpu` to True. + use_gpu = False + trainer = TorchTrainer( + train_func, + scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) + ) -.. tabbed:: TensorFlow + .. tab-item:: TensorFlow - .. warning:: - Ray will not automatically set any environment variables or configuration - related to local parallelism / threading - :ref:`aside from "OMP_NUM_THREADS" `. - If you desire greater control over TensorFlow threading, use - the ``tf.config.threading`` module (eg. 
- ``tf.config.threading.set_inter_op_parallelism_threads(num_cpus)``) - at the beginning of your ``train_loop_per_worker`` function. + .. warning:: + Ray will not automatically set any environment variables or configuration + related to local parallelism / threading + :ref:`aside from "OMP_NUM_THREADS" `. + If you desire greater control over TensorFlow threading, use + the ``tf.config.threading`` module (eg. + ``tf.config.threading.set_inter_op_parallelism_threads(num_cpus)``) + at the beginning of your ``train_loop_per_worker`` function. - .. code-block:: python + .. code-block:: python - from ray.air import ScalingConfig - from ray.train.tensorflow import TensorflowTrainer - # For GPU Training, set `use_gpu` to True. - use_gpu = False - trainer = TensorflowTrainer( - train_func, - scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) - ) + from ray.air import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + # For GPU Training, set `use_gpu` to True. + use_gpu = False + trainer = TensorflowTrainer( + train_func, + scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) + ) -.. tabbed:: Horovod + .. tab-item:: Horovod - .. code-block:: python + .. code-block:: python - from ray.air import ScalingConfig - from ray.train.horovod import HorovodTrainer - # For GPU Training, set `use_gpu` to True. - use_gpu = False - trainer = HorovodTrainer( - train_func, - scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) - ) + from ray.air import ScalingConfig + from ray.train.horovod import HorovodTrainer + # For GPU Training, set `use_gpu` to True. + use_gpu = False + trainer = HorovodTrainer( + train_func, + scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=2) + ) To customize the backend setup, you can use the :ref:`framework-specific config objects `. -.. tabbed:: PyTorch +.. tab-set:: - .. code-block:: python + .. tab-item:: PyTorch - from ray.air import ScalingConfig - from ray.train.torch import TorchTrainer, TorchConfig + .. 
code-block:: python - trainer = TorchTrainer( - train_func, - torch_backend=TorchConfig(...), - scaling_config=ScalingConfig(num_workers=2), - ) + from ray.air import ScalingConfig + from ray.train.torch import TorchTrainer, TorchConfig + trainer = TorchTrainer( + train_func, + torch_backend=TorchConfig(...), + scaling_config=ScalingConfig(num_workers=2), + ) -.. tabbed:: TensorFlow - .. code-block:: python + .. tab-item:: TensorFlow - from ray.air import ScalingConfig - from ray.train.tensorflow import TensorflowTrainer, TensorflowConfig + .. code-block:: python - trainer = TensorflowTrainer( - train_func, - tensorflow_backend=TensorflowConfig(...), - scaling_config=ScalingConfig(num_workers=2), - ) + from ray.air import ScalingConfig + from ray.train.tensorflow import TensorflowTrainer, TensorflowConfig -.. tabbed:: Horovod + trainer = TensorflowTrainer( + train_func, + tensorflow_backend=TensorflowConfig(...), + scaling_config=ScalingConfig(num_workers=2), + ) - .. code-block:: python + .. tab-item:: Horovod - from ray.air import ScalingConfig - from ray.train.horovod import HorovodTrainer, HorovodConfig + .. code-block:: python - trainer = HorovodTrainer( - train_func, - tensorflow_backend=HorovodConfig(...), - scaling_config=ScalingConfig(num_workers=2), - ) + from ray.air import ScalingConfig + from ray.train.horovod import HorovodTrainer, HorovodConfig + + trainer = HorovodTrainer( + train_func, + tensorflow_backend=HorovodConfig(...), + scaling_config=ScalingConfig(num_workers=2), + ) For more configurability, please reference the :py:class:`~ray.train.data_parallel_trainer.DataParallelTrainer` API. @@ -408,13 +414,13 @@ of the :py:class:`~ray.air.result.Result` object returned by ``Trainer.fit()``. .. 
_train-datasets: -Distributed Data Ingest with Ray Datasets and Ray Train +Distributed Data Ingest with Ray Data and Ray Train ------------------------------------------------------- -:ref:`Ray Datasets ` are the recommended way to work with large datasets in Ray Train. Datasets provides automatic loading, sharding, and pipelined ingest (optional) of Data across multiple Train workers. +:ref:`Ray Data ` is the recommended way to work with large datasets in Ray Train. Ray Data provides automatic loading, sharding, and streamed ingest of Data across multiple Train workers. To get started, pass in one or more datasets under the ``datasets`` keyword argument for Trainer (e.g., ``Trainer(datasets={...})``). -Here's a simple code overview of the Datasets integration: +Here's a simple code overview of the Ray Data integration: .. code-block:: python @@ -504,6 +510,8 @@ The following figure shows how these two sessions look like in a Data Parallel t .. https://docs.google.com/drawings/d/1g0pv8gqgG29aPEPTcd4BC0LaRNbW1sAkv3H6W1TCp0c/edit +.. _train-dl-saving-checkpoints: + Saving checkpoints ++++++++++++++++++ @@ -518,102 +526,104 @@ attribute. Concrete examples are provided to demonstrate how checkpoints (model weights but not models) are saved appropriately in distributed training. -.. tabbed:: PyTorch - - .. 
code-block:: python - :emphasize-lines: 36, 37, 38, 39, 40, 41 - - import ray.train.torch - from ray.air import session, Checkpoint, ScalingConfig - from ray.train.torch import TorchTrainer - - import torch - import torch.nn as nn - from torch.optim import Adam - import numpy as np - - def train_func(config): - n = 100 - # create a toy dataset - # data : X - dim = (n, 4) - # target : Y - dim = (n, 1) - X = torch.Tensor(np.random.normal(0, 1, size=(n, 4))) - Y = torch.Tensor(np.random.uniform(0, 1, size=(n, 1))) - # toy neural network : 1-layer - # wrap the model in DDP - model = ray.train.torch.prepare_model(nn.Linear(4, 1)) - criterion = nn.MSELoss() - - optimizer = Adam(model.parameters(), lr=3e-4) - for epoch in range(config["num_epochs"]): - y = model.forward(X) - # compute loss - loss = criterion(y, Y) - # back-propagate loss - optimizer.zero_grad() - loss.backward() - optimizer.step() - state_dict = model.state_dict() - checkpoint = Checkpoint.from_dict( - dict(epoch=epoch, model_weights=state_dict) - ) - session.report({}, checkpoint=checkpoint) - - trainer = TorchTrainer( - train_func, - train_loop_config={"num_epochs": 5}, - scaling_config=ScalingConfig(num_workers=2), - ) - result = trainer.fit() - - print(result.checkpoint.to_dict()) - # {'epoch': 4, 'model_weights': OrderedDict([('bias', tensor([-0.1215])), ('weight', tensor([[0.3253, 0.1979, 0.4525, 0.2850]]))]), '_timestamp': 1656107095, '_preprocessor': None, '_current_checkpoint_id': 4} - - -.. tabbed:: TensorFlow - - .. 
code-block:: python - :emphasize-lines: 23 - - from ray.air import session, Checkpoint, ScalingConfig - from ray.train.tensorflow import TensorflowTrainer - - import numpy as np - - def train_func(config): - import tensorflow as tf - n = 100 - # create a toy dataset - # data : X - dim = (n, 4) - # target : Y - dim = (n, 1) - X = np.random.normal(0, 1, size=(n, 4)) - Y = np.random.uniform(0, 1, size=(n, 1)) - - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - with strategy.scope(): +.. tab-set:: + + .. tab-item:: PyTorch + + .. code-block:: python + :emphasize-lines: 36, 37, 38, 39, 40, 41 + + import ray.train.torch + from ray.air import session, Checkpoint, ScalingConfig + from ray.train.torch import TorchTrainer + + import torch + import torch.nn as nn + from torch.optim import Adam + import numpy as np + + def train_func(config): + n = 100 + # create a toy dataset + # data : X - dim = (n, 4) + # target : Y - dim = (n, 1) + X = torch.Tensor(np.random.normal(0, 1, size=(n, 4))) + Y = torch.Tensor(np.random.uniform(0, 1, size=(n, 1))) # toy neural network : 1-layer - model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="linear", input_shape=(4,))]) - model.compile(optimizer="Adam", loss="mean_squared_error", metrics=["mse"]) - - for epoch in range(config["num_epochs"]): - model.fit(X, Y, batch_size=20) - checkpoint = Checkpoint.from_dict( - dict(epoch=epoch, model_weights=model.get_weights()) - ) - session.report({}, checkpoint=checkpoint) - - trainer = TensorflowTrainer( - train_func, - train_loop_config={"num_epochs": 5}, - scaling_config=ScalingConfig(num_workers=2), - ) - result = trainer.fit() - - print(result.checkpoint.to_dict()) - # {'epoch': 4, 'model_weights': [array([[-0.31858477], - # [ 0.03747174], - # [ 0.28266194], - # [ 0.8626015 ]], dtype=float32), array([0.02230084], dtype=float32)], '_timestamp': 1656107383, '_preprocessor': None, '_current_checkpoint_id': 4} + # wrap the model in DDP + model = 
ray.train.torch.prepare_model(nn.Linear(4, 1)) + criterion = nn.MSELoss() + + optimizer = Adam(model.parameters(), lr=3e-4) + for epoch in range(config["num_epochs"]): + y = model.forward(X) + # compute loss + loss = criterion(y, Y) + # back-propagate loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + state_dict = model.state_dict() + checkpoint = Checkpoint.from_dict( + dict(epoch=epoch, model_weights=state_dict) + ) + session.report({}, checkpoint=checkpoint) + + trainer = TorchTrainer( + train_func, + train_loop_config={"num_epochs": 5}, + scaling_config=ScalingConfig(num_workers=2), + ) + result = trainer.fit() + + print(result.checkpoint.to_dict()) + # {'epoch': 4, 'model_weights': OrderedDict([('bias', tensor([-0.1215])), ('weight', tensor([[0.3253, 0.1979, 0.4525, 0.2850]]))]), '_timestamp': 1656107095, '_preprocessor': None, '_current_checkpoint_id': 4} + + + .. tab-item:: TensorFlow + + .. code-block:: python + :emphasize-lines: 23 + + from ray.air import session, Checkpoint, ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + import numpy as np + + def train_func(config): + import tensorflow as tf + n = 100 + # create a toy dataset + # data : X - dim = (n, 4) + # target : Y - dim = (n, 1) + X = np.random.normal(0, 1, size=(n, 4)) + Y = np.random.uniform(0, 1, size=(n, 1)) + + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + with strategy.scope(): + # toy neural network : 1-layer + model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="linear", input_shape=(4,))]) + model.compile(optimizer="Adam", loss="mean_squared_error", metrics=["mse"]) + + for epoch in range(config["num_epochs"]): + model.fit(X, Y, batch_size=20) + checkpoint = Checkpoint.from_dict( + dict(epoch=epoch, model_weights=model.get_weights()) + ) + session.report({}, checkpoint=checkpoint) + + trainer = TensorflowTrainer( + train_func, + train_loop_config={"num_epochs": 5}, + scaling_config=ScalingConfig(num_workers=2), + ) 
+ result = trainer.fit() + + print(result.checkpoint.to_dict()) + # {'epoch': 4, 'model_weights': [array([[-0.31858477], + # [ 0.03747174], + # [ 0.28266194], + # [ 0.8626015 ]], dtype=float32), array([0.02230084], dtype=float32)], '_timestamp': 1656107383, '_preprocessor': None, '_current_checkpoint_id': 4} By default, checkpoints will be persisted to local disk in the :ref:`log @@ -688,6 +698,8 @@ You may also config ``CheckpointConfig`` to keep the "N best" checkpoints persis # ('local_path', '/home/ubuntu/ray_results/TorchTrainer_2022-06-24_21-34-49/TorchTrainer_7988b_00000_0_2022-06-24_21-34-49/checkpoint_000002') +.. _train-dl-loading-checkpoints: + Loading checkpoints +++++++++++++++++++ @@ -699,141 +711,143 @@ Checkpoints can be loaded into the training function in 2 steps: 2. The checkpoint to start training with can be bootstrapped by passing in a :py:class:`~ray.air.checkpoint.Checkpoint` to ``Trainer`` as the ``resume_from_checkpoint`` argument. -.. tabbed:: PyTorch - - .. code-block:: python - :emphasize-lines: 23, 25, 26, 29, 30, 31, 35 - - import ray.train.torch - from ray.air import session, Checkpoint, ScalingConfig - from ray.train.torch import TorchTrainer - - import torch - import torch.nn as nn - from torch.optim import Adam - import numpy as np - - def train_func(config): - n = 100 - # create a toy dataset - # data : X - dim = (n, 4) - # target : Y - dim = (n, 1) - X = torch.Tensor(np.random.normal(0, 1, size=(n, 4))) - Y = torch.Tensor(np.random.uniform(0, 1, size=(n, 1))) - - # toy neural network : 1-layer - model = nn.Linear(4, 1) - criterion = nn.MSELoss() - optimizer = Adam(model.parameters(), lr=3e-4) - start_epoch = 0 - - checkpoint = session.get_checkpoint() - if checkpoint: - # assume that we have run the session.report() example - # and successfully save some model weights - checkpoint_dict = checkpoint.to_dict() - model.load_state_dict(checkpoint_dict.get("model_weights")) - start_epoch = checkpoint_dict.get("epoch", -1) + 1 - - # 
wrap the model in DDP - model = ray.train.torch.prepare_model(model) - for epoch in range(start_epoch, config["num_epochs"]): - y = model.forward(X) - # compute loss - loss = criterion(y, Y) - # back-propagate loss - optimizer.zero_grad() - loss.backward() - optimizer.step() - state_dict = model.state_dict() - checkpoint = Checkpoint.from_dict( - dict(epoch=epoch, model_weights=state_dict) - ) - session.report({}, checkpoint=checkpoint) - - trainer = TorchTrainer( - train_func, - train_loop_config={"num_epochs": 2}, - scaling_config=ScalingConfig(num_workers=2), - ) - # save a checkpoint - result = trainer.fit() - - # load checkpoint - trainer = TorchTrainer( - train_func, - train_loop_config={"num_epochs": 4}, - scaling_config=ScalingConfig(num_workers=2), - resume_from_checkpoint=result.checkpoint, - ) - result = trainer.fit() - - print(result.checkpoint.to_dict()) - # {'epoch': 3, 'model_weights': OrderedDict([('bias', tensor([0.0902])), ('weight', tensor([[-0.1549, -0.0861, 0.4353, -0.4116]]))]), '_timestamp': 1656108265, '_preprocessor': None, '_current_checkpoint_id': 2} - -.. tabbed:: TensorFlow - - .. code-block:: python - :emphasize-lines: 15, 21, 22, 25, 26, 27, 30 - - from ray.air import session, Checkpoint, ScalingConfig - from ray.train.tensorflow import TensorflowTrainer - - import numpy as np - - def train_func(config): - import tensorflow as tf - n = 100 - # create a toy dataset - # data : X - dim = (n, 4) - # target : Y - dim = (n, 1) - X = np.random.normal(0, 1, size=(n, 4)) - Y = np.random.uniform(0, 1, size=(n, 1)) - - start_epoch = 0 - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() - - with strategy.scope(): +.. tab-set:: + + .. tab-item:: PyTorch + + .. 
code-block:: python + :emphasize-lines: 23, 25, 26, 29, 30, 31, 35 + + import ray.train.torch + from ray.air import session, Checkpoint, ScalingConfig + from ray.train.torch import TorchTrainer + + import torch + import torch.nn as nn + from torch.optim import Adam + import numpy as np + + def train_func(config): + n = 100 + # create a toy dataset + # data : X - dim = (n, 4) + # target : Y - dim = (n, 1) + X = torch.Tensor(np.random.normal(0, 1, size=(n, 4))) + Y = torch.Tensor(np.random.uniform(0, 1, size=(n, 1))) + # toy neural network : 1-layer - model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="linear", input_shape=(4,))]) + model = nn.Linear(4, 1) + criterion = nn.MSELoss() + optimizer = Adam(model.parameters(), lr=3e-4) + start_epoch = 0 + checkpoint = session.get_checkpoint() if checkpoint: # assume that we have run the session.report() example # and successfully save some model weights checkpoint_dict = checkpoint.to_dict() - model.set_weights(checkpoint_dict.get("model_weights")) + model.load_state_dict(checkpoint_dict.get("model_weights")) start_epoch = checkpoint_dict.get("epoch", -1) + 1 - model.compile(optimizer="Adam", loss="mean_squared_error", metrics=["mse"]) - - for epoch in range(start_epoch, config["num_epochs"]): - model.fit(X, Y, batch_size=20) - checkpoint = Checkpoint.from_dict( - dict(epoch=epoch, model_weights=model.get_weights()) - ) - session.report({}, checkpoint=checkpoint) - - trainer = TensorflowTrainer( - train_func, - train_loop_config={"num_epochs": 2}, - scaling_config=ScalingConfig(num_workers=2), - ) - # save a checkpoint - result = trainer.fit() - - # load a checkpoint - trainer = TensorflowTrainer( - train_func, - train_loop_config={"num_epochs": 5}, - scaling_config=ScalingConfig(num_workers=2), - resume_from_checkpoint=result.checkpoint, - ) - result = trainer.fit() - - print(result.checkpoint.to_dict()) - # {'epoch': 4, 'model_weights': [array([[-0.70056134], - # [-0.8839263 ], - # [-1.0043601 ], - # 
[-0.61634773]], dtype=float32), array([0.01889327], dtype=float32)], '_timestamp': 1656108446, '_preprocessor': None, '_current_checkpoint_id': 3} + + # wrap the model in DDP + model = ray.train.torch.prepare_model(model) + for epoch in range(start_epoch, config["num_epochs"]): + y = model.forward(X) + # compute loss + loss = criterion(y, Y) + # back-propagate loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + state_dict = model.state_dict() + checkpoint = Checkpoint.from_dict( + dict(epoch=epoch, model_weights=state_dict) + ) + session.report({}, checkpoint=checkpoint) + + trainer = TorchTrainer( + train_func, + train_loop_config={"num_epochs": 2}, + scaling_config=ScalingConfig(num_workers=2), + ) + # save a checkpoint + result = trainer.fit() + + # load checkpoint + trainer = TorchTrainer( + train_func, + train_loop_config={"num_epochs": 4}, + scaling_config=ScalingConfig(num_workers=2), + resume_from_checkpoint=result.checkpoint, + ) + result = trainer.fit() + + print(result.checkpoint.to_dict()) + # {'epoch': 3, 'model_weights': OrderedDict([('bias', tensor([0.0902])), ('weight', tensor([[-0.1549, -0.0861, 0.4353, -0.4116]]))]), '_timestamp': 1656108265, '_preprocessor': None, '_current_checkpoint_id': 2} + + .. tab-item:: TensorFlow + + .. 
code-block:: python + :emphasize-lines: 15, 21, 22, 25, 26, 27, 30 + + from ray.air import session, Checkpoint, ScalingConfig + from ray.train.tensorflow import TensorflowTrainer + + import numpy as np + + def train_func(config): + import tensorflow as tf + n = 100 + # create a toy dataset + # data : X - dim = (n, 4) + # target : Y - dim = (n, 1) + X = np.random.normal(0, 1, size=(n, 4)) + Y = np.random.uniform(0, 1, size=(n, 1)) + + start_epoch = 0 + strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() + + with strategy.scope(): + # toy neural network : 1-layer + model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="linear", input_shape=(4,))]) + checkpoint = session.get_checkpoint() + if checkpoint: + # assume that we have run the session.report() example + # and successfully save some model weights + checkpoint_dict = checkpoint.to_dict() + model.set_weights(checkpoint_dict.get("model_weights")) + start_epoch = checkpoint_dict.get("epoch", -1) + 1 + model.compile(optimizer="Adam", loss="mean_squared_error", metrics=["mse"]) + + for epoch in range(start_epoch, config["num_epochs"]): + model.fit(X, Y, batch_size=20) + checkpoint = Checkpoint.from_dict( + dict(epoch=epoch, model_weights=model.get_weights()) + ) + session.report({}, checkpoint=checkpoint) + + trainer = TensorflowTrainer( + train_func, + train_loop_config={"num_epochs": 2}, + scaling_config=ScalingConfig(num_workers=2), + ) + # save a checkpoint + result = trainer.fit() + + # load a checkpoint + trainer = TensorflowTrainer( + train_func, + train_loop_config={"num_epochs": 5}, + scaling_config=ScalingConfig(num_workers=2), + resume_from_checkpoint=result.checkpoint, + ) + result = trainer.fit() + + print(result.checkpoint.to_dict()) + # {'epoch': 4, 'model_weights': [array([[-0.70056134], + # [-0.8839263 ], + # [-1.0043601 ], + # [-0.61634773]], dtype=float32), array([0.01889327], dtype=float32)], '_timestamp': 1656108446, '_preprocessor': None, '_current_checkpoint_id': 3} 
.. _train-callbacks: @@ -917,20 +931,22 @@ You may also want to collect metrics from multiple workers. While Ray Train curr worker, you can use third-party libraries or distributed primitives of your machine learning framework to report metrics from multiple workers. -.. tabbed:: PyTorch +.. tab-set:: - Ray Train natively supports `TorchMetrics `_, which provides a collection of machine learning metrics for distributed, scalable PyTorch models. + .. tab-item:: PyTorch - Here is an example of reporting both the aggregated R2 score and mean train and validation loss from all workers. + Ray Train natively supports `TorchMetrics `_, which provides a collection of machine learning metrics for distributed, scalable PyTorch models. - .. literalinclude:: doc_code/torchmetrics_example.py - :language: python - :start-after: __start__ + Here is an example of reporting both the aggregated R2 score and mean train and validation loss from all workers. -.. tabbed:: TensorFlow + .. literalinclude:: doc_code/torchmetrics_example.py + :language: python + :start-after: __start__ - TensorFlow Keras automatically aggregates metrics from all workers. If you wish to have more - control over that, consider implementing a `custom training loop `_. + .. tab-item:: TensorFlow + + TensorFlow Keras automatically aggregates metrics from all workers. If you wish to have more + control over that, consider implementing a `custom training loop `_. .. Running on the cloud .. -------------------- @@ -945,25 +961,124 @@ metrics from multiple workers. .. _train-fault-tolerance: -Fault Tolerance & Elastic Training ----------------------------------- +Fault Tolerance +--------------- + +Automatically Recover from Train Worker Failures +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Ray Train has built-in fault tolerance to recover from worker failures (i.e. ``RayActorError``\s). When a failure is detected, the workers will be shut -down and new workers will be added in. 
The training function will be -restarted, but progress from the previous execution can be resumed through -checkpointing. +down and new workers will be added in. -.. warning:: In order to retain progress when recovery, your training function - **must** implement logic for both saving *and* loading :ref:`checkpoints - `. +.. note:: Elastic Training is not yet supported. + +The training function will be restarted, but progress from the previous execution can +be resumed through checkpointing. + +.. tip:: + In order to retain progress upon recovery, your training function + **must** implement logic for both :ref:`saving ` + *and* :ref:`loading checkpoints `. Each instance of recovery from a worker failure is considered a retry. The number of retries is configurable through the ``max_failures`` attribute of the -``failure_config`` argument set in the ``run_config`` argument passed to the -``Trainer``. +:class:`~ray.air.FailureConfig` argument set in the :class:`~ray.air.RunConfig` +passed to the ``Trainer``: -.. note:: Elastic Training is not yet supported. +.. literalinclude:: doc_code/key_concepts.py + :language: python + :start-after: __failure_config_start__ + :end-before: __failure_config_end__ + +.. _train-restore-guide: + +Restore a Ray Train Experiment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +At the experiment level, :ref:`Trainer restoration ` +allows you to resume a previously interrupted experiment from where it left off. + +A Train experiment may be interrupted due to one of the following reasons: + +- The experiment was manually interrupted (e.g., Ctrl+C, or pre-empted head node instance). +- The head node crashed (e.g., OOM or some other runtime error). +- The entire cluster went down (e.g., network error affecting all nodes). + +Trainer restoration is possible for all of Ray Train's built-in trainers, +but we use ``TorchTrainer`` in the examples for demonstration. +We also use ``Trainer`` to refer to methods that are shared across all +built-in trainers.
+ +Let's say your initial Train experiment is configured as follows. +The actual training loop is just for demonstration purposes: the important detail is that +:ref:`saving ` *and* :ref:`loading checkpoints ` +has been implemented. + +.. literalinclude:: doc_code/dl_guide.py + :language: python + :start-after: __ft_initial_run_start__ + :end-before: __ft_initial_run_end__ + +The results and checkpoints of the experiment are saved to the path configured by :class:`~ray.air.config.RunConfig`. +If the experiment has been interrupted due to one of the reasons listed above, use this path to resume: + +.. literalinclude:: doc_code/dl_guide.py + :language: python + :start-after: __ft_restored_run_start__ + :end-before: __ft_restored_run_end__ + +.. tip:: + + You can also restore from a remote path (e.g., from an experiment directory stored in a s3 bucket). + + .. literalinclude:: doc_code/dl_guide.py + :language: python + :dedent: + :start-after: __ft_restore_from_cloud_initial_start__ + :end-before: __ft_restore_from_cloud_initial_end__ + + .. literalinclude:: doc_code/dl_guide.py + :language: python + :dedent: + :start-after: __ft_restore_from_cloud_restored_start__ + :end-before: __ft_restore_from_cloud_restored_end__ + +.. note:: + + Different trainers may allow more parameters to be optionally re-specified on restore. + Only **datasets** are required to be re-specified on restore, if they were supplied originally. + + See :ref:`train-framework-specific-restore` for more details. + + +Auto-resume ++++++++++++ + +Adding the branching logic below will allow you to run the same script after the interrupt, +picking up training from where you left on the previous run. Notice that we use the +:meth:`Trainer.can_restore ` utility method +to determine the existence and validity of the given experiment directory. + +.. literalinclude:: doc_code/dl_guide.py + :language: python + :start-after: __ft_autoresume_start__ + :end-before: __ft_autoresume_end__ + +.. 
seealso:: + + See the :meth:`BaseTrainer.restore ` docstring + for a full example. + +.. note:: + + `Trainer.restore` is different from + :class:`Trainer(..., resume_from_checkpoint=...) `. + `resume_from_checkpoint` is meant to be used to start a *new* Train experiment, + which writes results to a new directory and starts over from iteration 0. + + `Trainer.restore` is used to continue an existing experiment, where + new results will continue to be appended to existing logs. .. Running on pre-emptible machines .. -------------------------------- @@ -1074,40 +1189,42 @@ Automatic Mixed Precision Automatic mixed precision (AMP) lets you train your models faster by using a lower precision datatype for operations like linear layers and convolutions. -.. tabbed:: PyTorch +.. tab-set:: - You can train your Torch model with AMP by: + .. tab-item:: PyTorch - 1. Adding :func:`ray.train.torch.accelerate` with ``amp=True`` to the top of your training function. - 2. Wrapping your optimizer with :func:`ray.train.torch.prepare_optimizer`. - 3. Replacing your backward call with :func:`ray.train.torch.backward`. + You can train your Torch model with AMP by: - .. code-block:: diff + 1. Adding :func:`ray.train.torch.accelerate` with ``amp=True`` to the top of your training function. + 2. Wrapping your optimizer with :func:`ray.train.torch.prepare_optimizer`. + 3. Replacing your backward call with :func:`ray.train.torch.backward`. - def train_func(): - + train.torch.accelerate(amp=True) + .. 
code-block:: diff - model = NeuralNetwork() - model = train.torch.prepare_model(model) + def train_func(): + + train.torch.accelerate(amp=True) - data_loader = DataLoader(my_dataset, batch_size=worker_batch_size) - data_loader = train.torch.prepare_data_loader(data_loader) + model = NeuralNetwork() + model = train.torch.prepare_model(model) - optimizer = torch.optim.SGD(model.parameters(), lr=0.001) - + optimizer = train.torch.prepare_optimizer(optimizer) + data_loader = DataLoader(my_dataset, batch_size=worker_batch_size) + data_loader = train.torch.prepare_data_loader(data_loader) - model.train() - for epoch in range(90): - for images, targets in dataloader: - optimizer.zero_grad() + optimizer = torch.optim.SGD(model.parameters(), lr=0.001) + + optimizer = train.torch.prepare_optimizer(optimizer) - outputs = model(images) - loss = torch.nn.functional.cross_entropy(outputs, targets) + model.train() + for epoch in range(90): + for images, targets in dataloader: + optimizer.zero_grad() - - loss.backward() - + train.torch.backward(loss) - optimizer.step() - ... + outputs = model(images) + loss = torch.nn.functional.cross_entropy(outputs, targets) + + - loss.backward() + + train.torch.backward(loss) + optimizer.step() + ... .. note:: The performance of AMP varies based on GPU architecture, model type, @@ -1119,25 +1236,27 @@ precision datatype for operations like linear layers and convolutions. Reproducibility --------------- -.. tabbed:: PyTorch +.. tab-set:: + + .. tab-item:: PyTorch - To limit sources of nondeterministic behavior, add - :func:`ray.train.torch.enable_reproducibility` to the top of your training - function. + To limit sources of nondeterministic behavior, add + :func:`ray.train.torch.enable_reproducibility` to the top of your training + function. - .. code-block:: diff + .. 
code-block:: diff - def train_func(): - + train.torch.enable_reproducibility() + def train_func(): + + train.torch.enable_reproducibility() - model = NeuralNetwork() - model = train.torch.prepare_model(model) + model = NeuralNetwork() + model = train.torch.prepare_model(model) - ... + ... - .. warning:: :func:`ray.train.torch.enable_reproducibility` can't guarantee - completely reproducible results across executions. To learn more, read - the `PyTorch notes on randomness `_. + .. warning:: :func:`ray.train.torch.enable_reproducibility` can't guarantee + completely reproducible results across executions. To learn more, read + the `PyTorch notes on randomness `_. .. import ray diff --git a/doc/source/train/doc_code/dl_guide.py b/doc/source/train/doc_code/dl_guide.py new file mode 100644 index 000000000000..67f74dee5cb3 --- /dev/null +++ b/doc/source/train/doc_code/dl_guide.py @@ -0,0 +1,98 @@ +# flake8: noqa + +MOCK = True + +# __ft_initial_run_start__ +from typing import Dict, Optional + +import ray +from ray import air +from ray.air import session +from ray.train.torch import TorchCheckpoint, TorchTrainer + + +def get_datasets() -> Dict[str, ray.data.Dataset]: + return {"train": ray.data.from_items([{"x": i, "y": 2 * i} for i in range(10)])} + + +def train_loop_per_worker(config: dict): + from torchvision.models import resnet18 + + # Checkpoint loading + checkpoint: Optional[TorchCheckpoint] = session.get_checkpoint() + model = checkpoint.get_model() if checkpoint else resnet18() + ray.train.torch.prepare_model(model) + + train_ds = session.get_dataset_shard("train") + + for epoch in range(5): + # Do some training... 
+ + # Checkpoint saving + session.report( + {"epoch": epoch}, + checkpoint=TorchCheckpoint.from_model(model), + ) + + +trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + datasets=get_datasets(), + scaling_config=air.ScalingConfig(num_workers=2), + run_config=air.RunConfig( + storage_path="~/ray_results", + name="dl_trainer_restore", + ), +) +result = trainer.fit() +# __ft_initial_run_end__ + +# __ft_restored_run_start__ +from ray.train.torch import TorchTrainer + +restored_trainer = TorchTrainer.restore( + path="~/ray_results/dl_trainer_restore", + datasets=get_datasets(), +) +# __ft_restored_run_end__ + + +if not MOCK: + # __ft_restore_from_cloud_initial_start__ + original_trainer = TorchTrainer( + # ... + run_config=air.RunConfig( + # Configure cloud storage + storage_path="s3://results-bucket", + name="dl_trainer_restore", + ), + ) + result = trainer.fit() + # __ft_restore_from_cloud_initial_end__ + + # __ft_restore_from_cloud_restored_start__ + restored_trainer = TorchTrainer.restore( + "s3://results-bucket/dl_trainer_restore", + datasets=get_datasets(), + ) + # __ft_restore_from_cloud_restored_end__ + + +# __ft_autoresume_start__ +if TorchTrainer.can_restore("~/ray_results/dl_restore_autoresume"): + trainer = TorchTrainer.restore( + "~/ray_results/dl_restore_autoresume", + datasets=get_datasets(), + ) + result = trainer.fit() +else: + trainer = TorchTrainer( + train_loop_per_worker=train_loop_per_worker, + datasets=get_datasets(), + scaling_config=air.ScalingConfig(num_workers=2), + run_config=air.RunConfig( + storage_path="~/ray_results", name="dl_restore_autoresume" + ), + ) +result = trainer.fit() +# __ft_autoresume_end__ diff --git a/doc/source/train/doc_code/key_concepts.py b/doc/source/train/doc_code/key_concepts.py index b7845f80d530..9d4704f36d7d 100644 --- a/doc/source/train/doc_code/key_concepts.py +++ b/doc/source/train/doc_code/key_concepts.py @@ -98,14 +98,20 @@ def train_fn(config): # __run_config_start__ from ray.air import 
RunConfig +from ray.air.integrations.wandb import WandbLoggerCallback run_config = RunConfig( # Name of the training run (directory name). name="my_train_run", - # Directory to store results in (will be storage_path/name). + # The experiment results will be saved to: storage_path/name storage_path="~/ray_results", + # storage_path="s3://my_bucket/tune_results", # Low training verbosity. verbose=1, + # Custom and built-in callbacks + callbacks=[WandbLoggerCallback()], + # Stopping criteria + stop={"training_iteration": 10}, ) # __run_config_end__ @@ -120,26 +126,37 @@ def train_fn(config): ) # __failure_config_end__ -# __sync_config_start__ -from ray.air import RunConfig -from ray.tune import SyncConfig +# __checkpoint_config_start__ +from ray.air import RunConfig, CheckpointConfig run_config = RunConfig( + checkpoint_config=CheckpointConfig( + # Only keep the 2 *best* checkpoints and delete the others. + num_to_keep=2, + # *Best* checkpoints are determined by these params: + checkpoint_score_attribute="mean_accuracy", + checkpoint_score_order="max", + ), # This will store checkpoints on S3. - storage_path="s3://remote-bucket/location" + storage_path="s3://remote-bucket/location", ) -# __sync_config_end__ +# __checkpoint_config_end__ -# __checkpoint_config_start__ +# __checkpoint_config_ckpt_freq_start__ from ray.air import RunConfig, CheckpointConfig run_config = RunConfig( checkpoint_config=CheckpointConfig( - # Only keep this many checkpoints. - num_to_keep=2 + # Checkpoint every iteration. + checkpoint_frequency=1, + # Only keep the latest checkpoint and delete the others. 
+ num_to_keep=1, ) ) -# __checkpoint_config_end__ + +# from ray.train.xgboost import XGBoostTrainer +# trainer = XGBoostTrainer(..., run_config=run_config) +# __checkpoint_config_ckpt_freq_end__ # __results_start__ diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index c1ad510ebaa5..5ce1ac51e5f5 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -31,109 +31,106 @@ and use cases. You can filter these examples by the following categories: Distributed Training Examples using Ray Train --------------------------------------------- -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/pytorch_logo.png - - +++ - .. link-button:: torch_fashion_mnist_ex - :type: ref - :text: PyTorch Fashion MNIST Training Example - :classes: btn-link btn-block stretched-link trainTorchFashionMnist - - --- - :img-top: /images/hugging.png - - +++ - .. link-button:: train_transformers_example - :type: ref - :text: Transformers with PyTorch Training Example - :classes: btn-link btn-block stretched-link trainTransformers - - --- - :img-top: /images/tf_logo.png - - +++ - .. link-button:: tensorflow_mnist_example - :type: ref - :text: TensorFlow MNIST Training Example - :classes: btn-link btn-block stretched-link trainTensorflowMnist - - --- - :img-top: /images/horovod.png - - +++ - .. link-button:: horovod_example - :type: ref - :text: End-to-end Horovod Training Example - :classes: btn-link btn-block stretched-link trainHorovod +.. grid:: 1 2 3 3 + :gutter: 1 + :class-container: container pb-4 + + .. grid-item-card:: + :img-top: /images/pytorch_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: torch_fashion_mnist_ex + + PyTorch Fashion MNIST Training Example + + .. grid-item-card:: + :img-top: /images/hugging.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: train_transformers_example + + Transformers with PyTorch Training Example + + .. grid-item-card:: + :img-top: /images/tf_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tensorflow_mnist_example + + TensorFlow MNIST Training Example + + .. grid-item-card:: + :img-top: /images/horovod.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: horovod_example + + End-to-end Horovod Training Example + + .. grid-item-card:: + :img-top: /images/pytorch_lightning_small.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: lightning_mnist_example + + End-to-end PyTorch Lightning Training Example + + .. grid-item-card:: + :img-top: /images/pytorch_lightning_small.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: lightning_advanced_example + + Use LightningTrainer with Ray Data and Batch Predictor - --- - :img-top: /images/pytorch_lightning_small.png - - +++ - .. link-button:: lightning_mnist_example - :type: ref - :text: End-to-end PyTorch Lightning Training Example - :classes: btn-link btn-block stretched-link trainLightning - - --- - :img-top: /images/pytorch_lightning_small.png - - +++ - .. link-button:: lightning_advanced_example - :type: ref - :text: Use LightningTrainer with Ray Data and Batch Predictor - :classes: btn-link btn-block stretched-link trainLightning + .. grid-item-card:: + :img-top: /images/pytorch_lightning_small.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: dolly_lightning_fsdp_finetuning + + Fine-tune LLM with AIR LightningTrainer and FSDP Ray Train Examples Using Loggers & Callbacks -------------------------------------------- -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - --- - :img-top: /images/mlflow.png +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 - +++ - .. 
link-button:: train_mlflow_example - :type: ref - :text: Logging Training Runs with MLflow - :classes: btn-link btn-block stretched-link trainMlflow + .. grid-item-card:: + :img-top: /images/mlflow.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: train_mlflow_example + + Logging Training Runs with MLflow Ray Train & Tune Integration Examples ------------------------------------- -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png + .. button-ref:: tune_train_tf_example - +++ - .. link-button:: tune_train_tf_example - :type: ref - :text: End-to-end Example for Tuning a TensorFlow Model - :classes: btn-link btn-block stretched-link trainTuneTensorflow + End-to-end Example for Tuning a TensorFlow Model - --- - :img-top: /images/tune.png + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - +++ - .. link-button:: tune_train_torch_example - :type: ref - :text: End-to-end Example for Tuning a PyTorch Model with PBT - :classes: btn-link btn-block stretched-link trainTunePyTorch + .. button-ref:: tune_train_torch_example + + End-to-end Example for Tuning a PyTorch Model with PBT .. TODO implement these examples! @@ -152,16 +149,15 @@ Ray Train & Tune Integration Examples Ray Train Benchmarks -------------------- -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - --- - :img-top: /ray-overview/images/ray_svg_logo.svg +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: train_benchmark - +++ - .. link-button:: train_benchmark - :type: ref - :text: Benchmark example for the PyTorch data transfer auto pipeline - :classes: btn-link btn-block stretched-link trainBenchmark + Benchmark example for the PyTorch data transfer auto pipeline diff --git a/doc/source/train/examples/lightning/lightning_cola_advanced.ipynb b/doc/source/train/examples/lightning/lightning_cola_advanced.ipynb index 2c6b85e8dc40..d95cbe9b2195 100644 --- a/doc/source/train/examples/lightning/lightning_cola_advanced.ipynb +++ b/doc/source/train/examples/lightning/lightning_cola_advanced.ipynb @@ -11,7 +11,7 @@ "\n", ":::{note}\n", "\n", - "This is an advanced example for {class}`LightningTrainer `, which demonstrates how to use LightningTrainer with `Ray Dataset` and `Batch Predictor`. \n", + "This is an advanced example for {class}`LightningTrainer `, which demonstrates how to use LightningTrainer with {ref}`Dataset ` and {ref}`Batch Predictor `. \n", "\n", "If you just want to quickly convert your existing PyTorch Lightning scripts into Ray AIR, you can refer to this starter example:\n", "{ref}`Train a Pytorch Lightning Image Classifier `.\n", @@ -20,7 +20,7 @@ "\n", "In this demo, we will introduce how to finetune a text classifier on [CoLA(The Corpus of Linguistic Acceptability)](https://nyu-mll.github.io/CoLA/) datasets with pretrained BERT. 
\n", "In particular, we will:\n", - "- Create Ray Datasets from the original CoLA dataset.\n", + "- Create Ray Data from the original CoLA dataset.\n", "- Define a preprocessor to tokenize the sentences.\n", "- Finetune a BERT model using LightningTrainer.\n", "- Construct a BatchPredictor with the checkpoint and preprocessor.\n", @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 5, "metadata": { "tags": [ "remove-cell" @@ -40,9 +40,32 @@ "SMOKE_TEST = True" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following line in order to install all the necessary dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install numpy datasets \"transformers>=4.19.1\" \"pytorch_lightning>=1.6.5\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start by importing the needed libraries:" + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,28 +84,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. Pre-process CoLA Dataset\n", + "## Pre-process CoLA Dataset\n", "\n", - "CoLA is a binary sentence classification task with 10.6K training examples. First, we download the dataset and metrics using the HuggingFace API, and create Ray Datasets for each split accordingly." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Reusing dataset glue (/home/ray/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n", - "100%|██████████| 3/3 [00:00<00:00, 948.44it/s]\n" - ] - } - ], - "source": [ - "dataset = load_dataset(\"glue\", \"cola\")\n", - "metric = load_metric(\"glue\", \"cola\")" + "CoLA is a binary sentence classification task with 10.6K training examples. First, we download the dataset and metrics using the HuggingFace API, and create Ray Data for each split accordingly." ] }, { @@ -91,6 +95,9 @@ "metadata": {}, "outputs": [], "source": [ + "dataset = load_dataset(\"glue\", \"cola\")\n", + "metric = load_metric(\"glue\", \"cola\")\n", + "\n", "ray_datasets = ray.data.from_huggingface(dataset)" ] }, @@ -104,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -136,14 +143,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Define a PyTorch Lightning Model\n", + "## Define a PyTorch Lightning Model\n", "\n", "You don't have to make any change of your `LightningModule` definition. Just copy and paste your code here:" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -202,16 +209,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Finetune the model with LightningTrainer\n", + "## Configure your LightningTrainer\n", "\n", "Define a LightningTrainer with necessary configurations, including hyper-parameters, checkpointing and compute resources settings. 
\n", "\n", - "You may find the API of {class}`LightningConfigBuilder ` useful.\n" + "You may find the API of {class}`LightningConfigBuilder ` and the discussion {ref}`here ` useful.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -225,8 +233,31 @@ " .trainer(max_epochs=5, accelerator=\"gpu\")\n", " .checkpointing(save_on_train_epoch_end=False)\n", " .build()\n", - ")\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + ":::{note}\n", + "Note that the `lightning_config` is created on the head node and will be passed to the worker nodes later. Be aware that the environment variables and hardware settings may differ between the head node and worker nodes.\n", + ":::\n", "\n", + ":::{note}\n", + "{meth}`LightningConfigBuilder.checkpointing() ` creates a [ModelCheckpoint](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html#lightning.pytorch.callbacks.ModelCheckpoint) callback. This callback defines the checkpoint frequency and saves checkpoint files in Lightning style. 
\n", + "\n", + "If you want to save AIR checkpoints for Batch Prediction, please also provide an AIR {class}`CheckpointConfig `.\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ "# Save AIR checkpoints according to the performance on validation set\n", "run_config = RunConfig(\n", " name=\"ptl-sent-classification\",\n", @@ -237,7 +268,7 @@ " ),\n", ")\n", "\n", - "# Scale the training workload across 4 GPUs\n", + "# Scale the DDP training workload across 4 GPUs\n", "# You can change this config based on your compute resources.\n", "scaling_config = ScalingConfig(\n", " num_workers=4, use_gpu=True, resources_per_worker={\"CPU\": 1, \"GPU\": 1}\n", @@ -246,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": { "tags": [ "remove-cell" @@ -272,16 +303,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Fine-tune the model with LightningTrainer\n", + "\n", "Train the model with the configuration we specified above. \n", "\n", "To feed data into LightningTrainer, we need to configure the following arguments:\n", "\n", - "- datasets: A dictionary of the input Ray datasets, with special keys \"train\" and \"val\".\n", - "- datasets_iter_config: The argument list of {meth}`iter_torch_batches() `. It defines the way we iterate dataset shards for each worker.\n", - "- preprocessor: The preprocessor that will be applied to the input dataset.\n", + "- `datasets`: A dictionary of the input Ray datasets, with special keys \"train\" and \"val\".\n", + "- `datasets_iter_config`: The argument list of {meth}`iter_torch_batches() `. 
It defines the way we iterate dataset shards for each worker.\n", + "- `preprocessor`: The preprocessor that will be applied to the input dataset.\n", "\n", ":::{note}\n", - "Note that we are using Ray Dataset for data ingestion for faster preprocessing here, but you can also continue to use the native `PyTorch DataLoader` or `LightningDataModule`. See {ref}`this example `. \n", + "Note that we are using Dataset for data ingestion for faster preprocessing here, but you can also continue to use the native `PyTorch DataLoader` or `LightningDataModule`. See {ref}`this example `. \n", "\n", ":::\n", "\n", @@ -291,9 +324,1002 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "
    \n", + "
    \n", + "

    Tune Status

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Current time:2023-04-24 10:42:50
    Running for: 00:06:26.94
    Memory: 23.8/186.6 GiB
    \n", + "
    \n", + "
    \n", + "
    \n", + "

    System Info

    \n", + " Using FIFO scheduling algorithm.
    Logical resource usage: 0/48 CPUs, 0/4 GPUs (0.0/1.0 accelerator_type:T4)\n", + "
    \n", + " \n", + "
    \n", + "
    \n", + "
    \n", + "

    Trial Status

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name status loc iter total time (s) train_loss matthews_correlation epoch
    LightningTrainer_87ecf_00000TERMINATED10.0.60.127:67819 5 376.028 0.0119807 0.589931 4
    \n", + "
    \n", + "
    \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(pid=67819) /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", + "(pid=67819) from pandas import MultiIndex, Int64Index\n", + "(LightningTrainer pid=67819) 2023-04-24 10:36:31,679\tINFO backend_executor.py:128 -- Starting distributed worker processes: ['68396 (10.0.60.127)', '68397 (10.0.60.127)', '68398 (10.0.60.127)', '68399 (10.0.60.127)']\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:36:32,731\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f9443dd2a6dc49029ef7fb4d7a596729", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=67819) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]\n", + "(LightningTrainer pid=67819) 2023-04-24 10:36:34,052\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "(LightningTrainer pid=67819) 2023-04-24 10:36:34,053\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1.\n", + "(RayTrainWorker pid=68396) /home/ray/anaconda3/lib/python3.9/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", + "(RayTrainWorker pid=68396) from pandas import MultiIndex, Int64Index\n", + "Downloading: 0%| | 0.00/416M [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:36:59,629\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:36:59,629\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "70151d1b6133418fb5bf5e39b0089dd6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] [repeated 3x across cluster]\n", + "(RayTrainWorker pid=68399) 2023-04-24 10:36:59,628\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False) [repeated 3x across cluster]\n", + "(RayTrainWorker pid=68399) 2023-04-24 10:36:59,629\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1. [repeated 3x across cluster]\n", + "(RayTrainWorker pid=68398) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. 
If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:37:27.091660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "(RayTrainWorker pid=68396) To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "(RayTrainWorker pid=68399) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) [repeated 3x across cluster]\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:37:27.373013: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:37:28.763569: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:37:28.763761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n", + "(RayTrainWorker pid=68396) 2023-04-24 10:37:28.763770: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:38:01,220\tINFO streaming_executor.py:87 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:38:01,221\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:38:01,221\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "50090e60317342e8a2fa5747b2dfc7dd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 
[00:00\n", + "

    Trial Progress

    \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name _report_on date done epoch experiment_taghostname iterations_since_restore matthews_correlationnode_ip pidshould_checkpoint step time_since_restore time_this_iter_s time_total_s timestamp train_loss training_iterationtrial_id
    LightningTrainer_87ecf_00000validation_end2023-04-24_10-42-46True 4 0ip-10-0-60-127 5 0.58993110.0.60.12767819True 670 376.028 70.6609 376.028 1682358165 0.0119807 587ecf_00000
    \n", + "
    \n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=68398) 2023-04-24 10:39:03,705\tINFO streaming_executor.py:87 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:39:03,706\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False) [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:39:03,706\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1. [repeated 4x across cluster]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "125ccea4d26e48c0bf4e45610f9ae64a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:40:09,873\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False) [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:40:09,873\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1. 
[repeated 4x across cluster]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "db4c22b67b844a6d8ff3e1882540bce4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:41:18,552\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False) [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:41:18,552\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1. [repeated 4x across cluster]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ccc3d13c44b344e8891a81794fd17ffe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00 TaskPoolMapOperator[BatchMapper] -> AllToAllOperator[RandomizeBlockOrder] [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:42:29,325\tINFO streaming_executor.py:88 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False) [repeated 4x across cluster]\n", + "(RayTrainWorker pid=68398) 2023-04-24 10:42:29,325\tINFO streaming_executor.py:90 -- Tip: To enable per-operator progress reporting, set RAY_DATA_VERBOSE_PROGRESS=1. 
[repeated 4x across cluster]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "55f6f7e8333341d1b57a890809bc90ad", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=68398) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00`. \n", + "\n", + ":::" + ] + }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": { "tags": [] }, @@ -317,13 +1354,13 @@ "data": { "text/plain": [ "Result(\n", - " metrics={'_report_on': 'validation_end', 'train_loss': 0.05989973247051239, 'matthews_correlation': 0.5175218541439164, 'epoch': 4, 'step': 670, 'should_checkpoint': True, 'done': True, 'trial_id': '5ae4c_00000', 'experiment_tag': '0'},\n", - " path='/home/ray/ray_results/ptl-sent-classification/LightningTrainer_5ae4c_00000_0_2023-04-05_12-45-05',\n", - " checkpoint=LightningCheckpoint(local_path=/home/ray/ray_results/ptl-sent-classification/LightningTrainer_5ae4c_00000_0_2023-04-05_12-45-05/checkpoint_000004)\n", + " metrics={'_report_on': 'validation_end', 'train_loss': 0.011980690062046051, 'matthews_correlation': 0.5899314497879129, 'epoch': 4, 'step': 670, 'should_checkpoint': True, 'done': True, 'trial_id': '87ecf_00000', 'experiment_tag': '0'},\n", + " path='/home/ray/ray_results/ptl-sent-classification/LightningTrainer_87ecf_00000_0_2023-04-24_10-36-23',\n", + " checkpoint=LightningCheckpoint(local_path=/home/ray/ray_results/ptl-sent-classification/LightningTrainer_87ecf_00000_0_2023-04-24_10-36-23/checkpoint_000004)\n", ")" ] }, - "execution_count": 11, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -337,7 +1374,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. 
Do Batch Inference with a Saved Checkpoint" + "## Do Batch Inference with a Saved Checkpoint" ] }, { @@ -352,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 17, "metadata": { "tags": [] }, @@ -365,7 +1402,8 @@ "checkpoint = result.checkpoint\n", "\n", "# You can also load a checkpoint from disk:\n", - "# checkpoint = LightningCheckpoint.from_directory(\"YOUR_CHECKPOINT_DIR\")\n", + "# YOUR_CHECKPOINT_DIR = result.checkpoint.path\n", + "# checkpoint = LightningCheckpoint.from_directory(YOUR_CHECKPOINT_DIR)\n", "\n", "batch_predictor = BatchPredictor(\n", " checkpoint=checkpoint,\n", @@ -373,17 +1411,8 @@ " use_gpu=True,\n", " model_class=SentimentModel,\n", " preprocessor=preprocessor,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ + ")\n", + "\n", "# Use 2 GPUs for batch inference\n", "predictions = batch_predictor.predict(\n", " ray_datasets[\"validation\"],\n", @@ -406,34 +1435,65 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": { "tags": [] }, + "outputs": [], + "source": [ + "# Internally, BatchPredictor calls forward() method of the LightningModule.\n", + "# Convert the logits tensor into labels with argmax.\n", + "def argmax(batch):\n", + " batch[\"predictions\"] = batch[\"predictions\"].apply(lambda x: np.argmax(x))\n", + " return batch\n", + "\n", + "\n", + "results = predictions.map_batches(argmax, batch_format=\"pandas\").to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'matthews_correlation': 0.5175218541439164}\n" + " predictions label\n", + "0 1 1\n", + "1 1 1\n", + "2 0 1\n", + "3 1 1\n", + "4 0 0\n", + "5 1 0\n", + "6 1 0\n", + "7 1 1\n", + "8 1 1\n", + "9 1 1\n", + "\n", + "{'matthews_correlation': 0.5899314497879129}\n" ] } ], "source": [ - "# Internally, BatchPredictor 
calls forward() method of the LightningModule.\n", - "# Convert the logits tensor into labels with argmax.\n", - "def argmax(batch):\n", - " batch[\"predictions\"] = batch[\"predictions\"].apply(lambda x: np.argmax(x))\n", - " return batch\n", - "\n", - "\n", - "results = predictions.map_batches(argmax).to_pandas()\n", - "\n", "matthews_corr = metric.compute(\n", " predictions=results[\"predictions\"], references=results[\"label\"]\n", ")\n", + "print(results.head(10))\n", "print(matthews_corr)" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What's next?\n", + "\n", + "- {ref}`Fine-tune a Large Language Model with LightningTrainer and FSDP `\n", + "- {ref}`Hyperparameter searching with LightningTrainer + Ray Tune. `" + ] } ], "metadata": { diff --git a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb index 043dba3a08ac..fed176064fdd 100644 --- a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb +++ b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb @@ -14,20 +14,16 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "tags": [ - "remove-cell" - ] - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "SMOKE_TEST = True" + "!pip install \"torchmetrics>=0.9\" \"pytorch_lightning>=1.6\" " ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -123,7 +119,9 @@ " nn.ReLU(),\n", " )\n", " self.lr = lr\n", - " self.accuracy = Accuracy()\n", + " self.accuracy = Accuracy(task=\"multiclass\", num_classes=10)\n", + " 
self.eval_loss = []\n", + " self.eval_accuracy = []\n", "\n", " def forward(self, x):\n", " x = x.view(-1, 28 * 28)\n", @@ -140,6 +138,8 @@ " def validation_step(self, val_batch, batch_idx):\n", " loss, acc = self._shared_eval(val_batch)\n", " self.log(\"val_accuracy\", acc)\n", + " self.eval_loss.append(loss)\n", + " self.eval_accuracy.append(acc)\n", " return {\"val_loss\": loss, \"val_accuracy\": acc}\n", "\n", " def test_step(self, test_batch, batch_idx):\n", @@ -154,11 +154,13 @@ " acc = self.accuracy(logits, y)\n", " return loss, acc\n", "\n", - " def validation_epoch_end(self, outputs):\n", - " avg_loss = torch.stack([x[\"val_loss\"] for x in outputs]).mean()\n", - " avg_acc = torch.stack([x[\"val_accuracy\"] for x in outputs]).mean()\n", + " def on_validation_epoch_end(self):\n", + " avg_loss = torch.stack(self.eval_loss).mean()\n", + " avg_acc = torch.stack(self.eval_accuracy).mean()\n", " self.log(\"val_loss\", avg_loss, sync_dist=True)\n", " self.log(\"val_accuracy\", avg_acc, sync_dist=True)\n", + " self.eval_loss.clear()\n", + " self.eval_accuracy.clear()\n", "\n", " def configure_optimizers(self):\n", " optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)\n", @@ -177,6 +179,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "(lightning-config-builder-intro)=\n", + "\n", "## Define the Cofigurations for AIR LightningTrainer\n", "\n", "The {meth}`LightningConfigBuilder ` class stores all the parameters involved in training a PyTorch Lightning module. 
It takes the same parameter lists as those in PyTorch Lightning.\n", @@ -192,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -205,7 +209,7 @@ ")\n", "\n", "\n", - "def build_lightning_config_from_existing_code():\n", + "def build_lightning_config_from_existing_code(use_gpu):\n", " # Create a config builder to encapsulate all required parameters.\n", " # Note that model instantiation and fitting will occur later in the LightingTrainer,\n", " # rather than in the config builder.\n", @@ -232,7 +236,7 @@ " # )\n", " config_builder.trainer(\n", " max_epochs=10,\n", - " accelerator=\"cpu\",\n", + " accelerator=\"gpu\" if use_gpu else \"cpu\",\n", " log_every_n_steps=100,\n", " logger=CSVLogger(\"logs\"),\n", " )\n", @@ -259,34 +263,23 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "lightning_config = (\n", - " LightningConfigBuilder()\n", - " .module(MNISTClassifier, lr=1e-3, feature_dim=128)\n", - " .trainer(\n", - " max_epochs=10,\n", - " accelerator=\"cpu\",\n", - " log_every_n_steps=100,\n", - " logger=CSVLogger(\"logs\"),\n", - " )\n", - " .fit_params(datamodule=datamodule)\n", - " .checkpointing(monitor=\"val_accuracy\", mode=\"max\", save_top_k=3)\n", - " .build()\n", - ")" + "# Set it to False if you want to run without GPUs\n", + "use_gpu = True" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "scaling_config = ScalingConfig(\n", - " num_workers=4, use_gpu=True, resources_per_worker={\"CPU\": 1, \"GPU\": 1}\n", - ")\n", + "lightning_config = build_lightning_config_from_existing_code(use_gpu=use_gpu)\n", + "\n", + "scaling_config = ScalingConfig(num_workers=4, use_gpu=use_gpu)\n", "\n", "run_config = RunConfig(\n", " name=\"ptl-mnist-example\",\n", @@ -296,31 +289,8 @@ " checkpoint_score_attribute=\"val_accuracy\",\n", " 
checkpoint_score_order=\"max\",\n", " ),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [], - "source": [ - "if SMOKE_TEST:\n", - " scaling_config = ScalingConfig(\n", - " num_workers=4, use_gpu=False, resources_per_worker={\"CPU\": 1}\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ + ")\n", + "\n", "trainer = LightningTrainer(\n", " lightning_config=lightning_config,\n", " scaling_config=scaling_config,\n", @@ -337,9 +307,21 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "find: ‘.git’: No such file or directory\n", + "2023-04-28 09:30:43,657\tINFO worker.py:1432 -- Connecting to existing Ray cluster at address: 10.0.12.241:6379...\n", + "2023-04-28 09:30:43,665\tINFO worker.py:1607 -- Connected to Ray cluster. View the dashboard at https://console.anyscale-staging.com/api/v2/sessions/ses_vhpce9uvpnmhikmask3c5db399/services?redirect_to=dashboard \n", + "2023-04-28 09:30:43,671\tINFO packaging.py:347 -- Pushing file package 'gcs://_ray_pkg_c896ee9346ecab5d19a2dbcff95e2084.zip' (0.07MiB) to Ray cluster...\n", + "2023-04-28 09:30:43,672\tINFO packaging.py:360 -- Successfully pushed file package 'gcs://_ray_pkg_c896ee9346ecab5d19a2dbcff95e2084.zip'.\n", + "2023-04-28 09:30:43,725\tINFO tune.py:221 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.\n" + ] + }, { "data": { "text/html": [ @@ -349,16 +331,16 @@ "

    Tune Status

    \n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
    Current time:2023-03-23 17:06:23
    Running for: 00:00:42.86
    Memory: 5.2/62.0 GiB
    Current time:2023-04-28 09:31:32
    Running for: 00:00:48.90
    Memory: 16.9/186.6 GiB
    \n", "
    \n", "
    \n", "
    \n", "

    System Info

    \n", - " Using FIFO scheduling algorithm.
    Logical resource usage: 0/16 CPUs, 0/0 GPUs\n", + " Using FIFO scheduling algorithm.
    Logical resource usage: 0/48 CPUs, 0/4 GPUs (0.0/1.0 accelerator_type:T4)\n", "
    \n", " \n", "
    \n", @@ -367,10 +349,10 @@ "

    Trial Status

    \n", " \n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
    Trial name status loc iter total time (s) train_loss val_accuracy val_loss
    Trial name status loc iter total time (s) train_loss val_accuracy val_loss
    LightningTrainer_9cfa6_00000TERMINATED10.0.61.115:358929 10 32.1313 0.0822004 0.969926 -12.5678
    LightningTrainer_0593e_00000TERMINATED10.0.12.241:56808 10 33.056 0.0840481 0.970436 -12.5445
    \n", "
    \n", @@ -417,39 +399,123 @@ "name": "stderr", "output_type": "stream", "text": [ - "(pid=358929) /home/ray/anaconda3/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "(pid=358929) from pandas import MultiIndex, Int64Index\n", - "(RayTrainWorker pid=359239) 2023-03-23 17:05:52,362\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", - "(RayTrainWorker pid=359241) /home/ray/anaconda3/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "(RayTrainWorker pid=359241) from pandas import MultiIndex, Int64Index\n", - "(RayTrainWorker pid=359242) /home/ray/anaconda3/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "(RayTrainWorker pid=359242) from pandas import MultiIndex, Int64Index\n", - "(RayTrainWorker pid=359239) /home/ray/anaconda3/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n", - "(RayTrainWorker pid=359239) from pandas import MultiIndex, Int64Index\n", - "(RayTrainWorker pid=359240) /home/ray/anaconda3/lib/python3.8/site-packages/xgboost/compat.py:31: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. 
Use pandas.Index with the appropriate dtype instead.\n", - "(RayTrainWorker pid=359240) from pandas import MultiIndex, Int64Index\n", - "(RayTrainWorker pid=359239) GPU available: False, used: False\n", - "(RayTrainWorker pid=359239) TPU available: False, using: 0 TPU cores\n", - "(RayTrainWorker pid=359239) IPU available: False, using: 0 IPUs\n", - "(RayTrainWorker pid=359239) HPU available: False, using: 0 HPUs\n", - "(RayTrainWorker pid=359241) Missing logger folder: logs/lightning_logs\n", - "(RayTrainWorker pid=359242) Missing logger folder: logs/lightning_logs\n", - "(RayTrainWorker pid=359239) Missing logger folder: logs/lightning_logs\n", - "(RayTrainWorker pid=359239) \n", - "(RayTrainWorker pid=359239) | Name | Type | Params\n", - "(RayTrainWorker pid=359239) -------------------------------------------------\n", - "(RayTrainWorker pid=359239) 0 | linear_relu_stack | Sequential | 101 K \n", - "(RayTrainWorker pid=359239) 1 | accuracy | Accuracy | 0 \n", - "(RayTrainWorker pid=359239) -------------------------------------------------\n", - "(RayTrainWorker pid=359239) 101 K Trainable params\n", - "(RayTrainWorker pid=359239) 0 Non-trainable params\n", - "(RayTrainWorker pid=359239) 101 K Total params\n", - "(RayTrainWorker pid=359239) 0.407 Total estimated model params size (MB)\n", - "(RayTrainWorker pid=359240) Missing logger folder: logs/lightning_logs\n", - "(RayTrainWorker pid=359241) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. 
(function operator())\n", - "(RayTrainWorker pid=359239) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "(RayTrainWorker pid=359242) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n", - "(RayTrainWorker pid=359240) [W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())\n" + "(pid=56808) /mnt/cluster_storage/pypi/lib/python3.9/site-packages/neptune/common/warnings.py:62: NeptuneDeprecationWarning: You're importing the Neptune client library via the deprecated `neptune.new` module, which will be removed in a future release. 
Import directly from `neptune` instead.\n", + "(pid=56808) warnings.warn(\n", + "(LightningTrainer pid=56808) 2023-04-28 09:31:00,123\tINFO backend_executor.py:128 -- Starting distributed worker processes: ['57429 (10.0.12.241)', '57430 (10.0.12.241)', '57431 (10.0.12.241)', '57432 (10.0.12.241)']\n", + "(RayTrainWorker pid=57429) 2023-04-28 09:31:01,088\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", + "(RayTrainWorker pid=57431) warnings.warn(\n", + "(RayTrainWorker pid=57431) warnings.warn(\n", + "(RayTrainWorker pid=57429) /mnt/cluster_storage/pypi/lib/python3.9/site-packages/neptune/common/warnings.py:62: NeptuneDeprecationWarning: You're importing the Neptune client library via the deprecated `neptune.new` module, which will be removed in a future release. Import directly from `neptune` instead.\n", + "(RayTrainWorker pid=57429) warnings.warn(\n", + "(RayTrainWorker pid=57429) GPU available: True, used: True\n", + "(RayTrainWorker pid=57429) TPU available: False, using: 0 TPU cores\n", + "(RayTrainWorker pid=57429) IPU available: False, using: 0 IPUs\n", + "(RayTrainWorker pid=57429) HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=57432) Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/9912422 [00:00Trial Progress\n", " \n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
    Trial name _report_on date done epoch experiment_taghostname iterations_since_restorenode_ip pidshould_checkpoint step time_since_restore time_this_iter_s time_total_s timestamp train_loss training_iterationtrial_id val_accuracy val_loss
    Trial name _report_on date done epoch experiment_taghostname iterations_since_restorenode_ip pidshould_checkpoint step time_since_restore time_this_iter_s time_total_s timestamp train_loss training_iterationtrial_id val_accuracy val_loss
    LightningTrainer_9cfa6_00000train_epoch_end2023-03-23_17-06-20True 9 0ip-10-0-61-115 1010.0.61.115358929True 1080 32.1313 2.26905 32.1313 1679616380 0.0822004 109cfa6_00000 0.969926 -12.5678
    LightningTrainer_0593e_00000train_epoch_end2023-04-28_09-31-29True 9 0ip-10-0-12-241 1010.0.12.24156808True 1080 33.056 1.58153 33.056 1682699489 0.0840481 100593e_00000 0.970436 -12.5445
    \n", "
    \n", @@ -491,27 +557,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-03-23 17:06:23,588\tINFO tune.py:817 -- Total run time: 42.87 seconds (42.86 seconds for the tuning loop).\n" + "2023-04-28 09:31:32,674\tINFO tune.py:1010 -- Total run time: 48.95 seconds (48.90 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Validation Accuracy: 0.9699258804321289\n" + "Validation Accuracy: 0.9704360961914062\n" ] }, { "data": { "text/plain": [ "Result(\n", - " metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.08220043778419495, 'val_accuracy': 0.9699258804321289, 'val_loss': -12.567845344543457, 'epoch': 9, 'step': 1080, 'should_checkpoint': True, 'done': True, 'trial_id': '9cfa6_00000', 'experiment_tag': '0'},\n", - " log_dir=PosixPath('/tmp/ray_results/ptl-mnist-example/LightningTrainer_9cfa6_00000_0_2023-03-23_17-05-40'),\n", - " checkpoint=LightningCheckpoint(local_path=/tmp/ray_results/ptl-mnist-example/LightningTrainer_9cfa6_00000_0_2023-03-23_17-05-40/checkpoint_000009)\n", + " metrics={'_report_on': 'train_epoch_end', 'train_loss': 0.0840480849146843, 'val_accuracy': 0.9704360961914062, 'val_loss': -12.544519424438477, 'epoch': 9, 'step': 1080, 'should_checkpoint': True, 'done': True, 'trial_id': '0593e_00000', 'experiment_tag': '0'},\n", + " path='/tmp/ray_results/ptl-mnist-example/LightningTrainer_0593e_00000_0_2023-04-28_09-30-46',\n", + " checkpoint=LightningCheckpoint(local_path=/tmp/ray_results/ptl-mnist-example/LightningTrainer_0593e_00000_0_2023-04-28_09-30-46/checkpoint_000009)\n", ")" ] }, - "execution_count": 57, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -533,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 9, "metadata": { "tags": [] }, @@ -545,7 +611,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 10, "metadata": { "tags": [] }, @@ -554,18 +620,21 @@ "name": "stderr", "output_type": 
"stream", "text": [ - "/home/ray/anaconda3/lib/python3.8/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.\n", + "/home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:92: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.\n", " rank_zero_warn(\n", - "GPU available: False, used: False\n", + "GPU available: True, used: False\n", "TPU available: False, using: 0 TPU cores\n", "IPU available: False, using: 0 IPUs\n", - "HPU available: False, using: 0 HPUs\n" + "HPU available: False, using: 0 HPUs\n", + "/home/ray/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1814: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=4)`.\n", + " rank_zero_warn(\n", + "Missing logger folder: /home/ray/default/doc/source/train/examples/lightning/lightning_logs\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fe27955de52247bfadf2a4320af1cf44", + "model_id": "c3034eb12cf846b0aff76f28c348be06", "version_major": 2, "version_minor": 0 }, @@ -576,13 +645,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-04-28 09:31:33.611773: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-04-28 09:31:33.762802: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. 
You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-04-28 09:31:34.628099: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n", + "2023-04-28 09:31:34.628189: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64\n", + "2023-04-28 09:31:34.628194: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" + ] + }, { "data": { "text/html": [ "
    ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
            "┃        Test metric               DataLoader 0        ┃\n",
            "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
    -       "│       test_accuracy           0.9742000102996826     │\n",
    +       "│       test_accuracy           0.9735999703407288     │\n",
            "└───────────────────────────┴───────────────────────────┘\n",
            "
    \n" ], @@ -590,7 +671,7 @@ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1m Test metric \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m DataLoader 0 \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│\u001b[36m \u001b[0m\u001b[36m test_accuracy \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m 0.9742000102996826 \u001b[0m\u001b[35m \u001b[0m│\n", + "│\u001b[36m \u001b[0m\u001b[36m test_accuracy \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m 0.9735999703407288 \u001b[0m\u001b[35m \u001b[0m│\n", "└───────────────────────────┴───────────────────────────┘\n" ] }, @@ -614,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 11, "metadata": { "tags": [] }, @@ -623,7 +704,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.9742\n" + "Accuracy: 0.9736\n" ] } ], @@ -631,7 +712,7 @@ "from ray.train.lightning import LightningPredictor\n", "\n", "predictor = LightningPredictor.from_checkpoint(\n", - " checkpoint, MNISTClassifier, use_gpu=False\n", + " checkpoint, MNISTClassifier, use_gpu=use_gpu\n", ")\n", "\n", "\n", @@ -658,9 +739,10 @@ "metadata": {}, "source": [ "## What's next?\n", - "- Use Ray Dataset for more efficient data preprocessing.\n", - "- Use {class}`BatchPredictor ` for large-scale distributed inference.\n", - "- Find the best hyperparameter settings with Ray Tune." + "\n", + "- {ref}`Use LightningTrainer with Ray Data and Batch Predictor `\n", + "- {ref}`Fine-tune a Large Language Model with LightningTrainer and FSDP `\n", + "- {ref}`Hyperparameter searching with LightningTrainer + Ray Tune. 
`" ] } ], diff --git a/doc/source/train/faq.rst b/doc/source/train/faq.rst index 737cf16f81e0..320aa4610d20 100644 --- a/doc/source/train/faq.rst +++ b/doc/source/train/faq.rst @@ -27,109 +27,6 @@ you can initialize the ``Trainer`` with ``resources_per_worker`` specified in `` currently assume each worker is allocated exactly 1 GPU. The partial GPU and multi GPU use-cases can still be run with Ray Train today without these functions. -.. _train-restore-faq: - -How do I restore a Ray Train experiment? ----------------------------------------- - -A Train experiment may be interrupted due to one of the following reasons: - -- The experiment was manually interrupted (e.g., Ctrl+C, or pre-empted head node instance). -- The head node crashed (e.g., OOM or some other runtime error). -- The entire cluster went down (e.g., network error affecting all nodes). - -In these cases, a Trainer :ref:`can be restored ` for the experiment to resume. - -Since this is applicable to all of Ray Train's built-in trainers, -we'll use `FrameworkTrainer` to refer to a generic trainer for the remainder of this answer. - -To restore an experiment, first find the experiment directory that your previous -run was saved to. If you saved locally, this will look like ``{storage_path}/{name}``, -where ``storage_path`` may be ``~/ray_results``, and ``name`` is something -like ``FrameworkTrainer_2023-xxx``. - -Note that these are the same parameters that you pass through :class:`~ray.air.RunConfig`. - -.. code-block:: python - - datasets = {"train": ray.data.from_items([{"x": i, "y": 2 * i} for i in range(10)])} - - restored_trainer = FrameworkTrainer.restore( - path="~/ray_results/FrameworkTrainer_2023-02-15_00-46-58", - datasets=datasets, - ) - -It's also possible to restore from a remote path (e.g., from an experiment directory -stored in a s3 bucket). - -.. 
code-block:: python - - datasets = {"train": ray.data.from_items([{"x": i, "y": 2 * i} for i in range(10)])} - - restored_trainer = FrameworkTrainer.restore( - path="s3://results-bucket/FrameworkTrainer_2023-02-15_00-46-58", - datasets=datasets, - ) - -.. note:: - - `FrameworkTrainer.restore` may allow more parameters to be re-specified depending - on which trainer you're using. See :ref:`train-framework-specific-restore` for more details. - - -Single Script for Automatic Restoration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Adding the branching logic below will allow you to run the same script after the interrupt, -picking up training from where you left on the previous run. Notice that we use the -:meth:`FrameworkTrainer.can_restore ` utility method -to determine the existence/validity of the given experiment directory. - -.. code-block:: python - - # run_train_experiment.py - - # Load datasets, define a preprocessor, etc. - # datasets = { ... } - # preprocessor = ... - - experiment_name = "train_experiment" - experiment_dir = f"~/ray_results/{experiment_name}" - - if FrameworkTrainer.can_restore(experiment_dir): - trainer = FrameworkTrainer.restore( - experiment_dir, - datasets=datasets, - ) - else: - trainer = FrameworkTrainer( - datasets=datasets, - preprocessor=preprocessor, - scaling_config=air.ScalingConfig(num_workers=2, use_gpu=False), - run_config=air.RunConfig( - name=experiment_name, - storage_path="~/ray_results", - failure_config=air.FailureConfig(max_failures=3), - stop={"training_iteration": 10}, - ), - ) - -.. seealso:: - - See the :meth:`BaseTrainer.restore ` docstring - for a full example. - -.. note:: - - `FrameworkTrainer.restore` is different from - :class:`FrameworkTrainer(..., resume_from_checkpoint=...) `. - `resume_from_checkpoint` is meant to be used to start a *new* Train experiment, - which writes results to a new directory and starts over from iteration 0. 
- - `FrameworkTrainer.restore` is used to continue an existing experiment, where - new results will continue to be appended to existing logs. - - My multi-node PyTorch GPU training is hanging or giving me obscure NCCL errors. What do I do? --------------------------------------------------------------------------------------------- diff --git a/doc/source/train/gbdt.rst b/doc/source/train/gbdt.rst index a4f1aec6cb68..d470ff4c49af 100644 --- a/doc/source/train/gbdt.rst +++ b/doc/source/train/gbdt.rst @@ -12,29 +12,57 @@ Just as in the original `xgboost.train() `__ functions, the training parameters are passed as the ``params`` dictionary. -.. tabbed:: XGBoost +.. tab-set:: - Run ``pip install -U xgboost_ray``. + .. tab-item:: XGBoost - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgboost_start__ - :end-before: __xgboost_end__ + Run ``pip install -U xgboost_ray``. -.. tabbed:: LightGBM + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgboost_start__ + :end-before: __xgboost_end__ - Run ``pip install -U lightgbm_ray``. + .. tab-item:: LightGBM - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __lightgbm_start__ - :end-before: __lightgbm_end__ + Run ``pip install -U lightgbm_ray``. + + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lightgbm_start__ + :end-before: __lightgbm_end__ Ray-specific params are passed in through the trainer constructors. +Saving and Loading XGBoost and LightGBM Checkpoints +--------------------------------------------------- + +When a new tree is trained on every boosting round, +it's possible to save a checkpoint to snapshot the training progress so far. +:class:`~ray.train.xgboost.XGBoostTrainer` and :class:`~ray.train.lightgbm.LightGBMTrainer` +both implement checkpointing out of the box. 
+ +The only required change is to configure :class:`~ray.air.CheckpointConfig` to set +the checkpointing frequency. For example, the following configuration will +save a checkpoint on every boosting round and will only keep the latest checkpoint: + +.. literalinclude:: doc_code/key_concepts.py + :language: python + :start-after: __checkpoint_config_ckpt_freq_start__ + :end-before: __checkpoint_config_ckpt_freq_end__ + +.. tip:: + + Once checkpointing is enabled, you can follow :ref:`this guide ` + to enable fault tolerance. + + See the :ref:`Trainer restore API reference ` for more details. + + How to scale out training? -------------------------- + The benefit of using Ray AIR is that you can seamlessly scale up your training by adjusting the :class:`ScalingConfig `. @@ -49,48 +77,49 @@ adjusting the :class:`ScalingConfig `. Here are some examples for common use-cases: +.. tab-set:: -.. tabbed:: Multi-node CPU + .. tab-item:: Multi-node CPU - Setup: 4 nodes with 8 CPUs each. + Setup: 4 nodes with 8 CPUs each. - Use-case: To utilize all resources in multi-node training. + Use-case: To utilize all resources in multi-node training. - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __scaling_cpu_start__ - :end-before: __scaling_cpu_end__ + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __scaling_cpu_start__ + :end-before: __scaling_cpu_end__ - Note that we pass 0 CPUs for the trainer resources, so that all resources can - be allocated to the actual distributed training workers. + Note that we pass 0 CPUs for the trainer resources, so that all resources can + be allocated to the actual distributed training workers. -.. tabbed:: Single-node multi-GPU + .. tab-item:: Single-node multi-GPU - Setup: 1 node with 8 CPUs and 4 GPUs. + Setup: 1 node with 8 CPUs and 4 GPUs. - Use-case: If you have a single node with multiple GPUs, you need to use - distributed training to leverage all GPUs. 
+ Use-case: If you have a single node with multiple GPUs, you need to use + distributed training to leverage all GPUs. - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __scaling_gpu_start__ - :end-before: __scaling_gpu_end__ + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __scaling_gpu_start__ + :end-before: __scaling_gpu_end__ -.. tabbed:: Multi-node multi-GPU + .. tab-item:: Multi-node multi-GPU - Setup: 4 node with 8 CPUs and 4 GPUs each. + Setup: 4 nodes with 8 CPUs and 4 GPUs each. - Use-case: If you have a multiple nodes with multiple GPUs, you need to - schedule one worker per GPU. + Use-case: If you have multiple nodes with multiple GPUs, you need to + schedule one worker per GPU. - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __scaling_gpumulti_start__ - :end-before: __scaling_gpumulti_end__ + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __scaling_gpumulti_start__ + :end-before: __scaling_gpumulti_end__ - Note that you just have to adjust the number of workers - everything else - will be handled by Ray automatically. + Note that you just have to adjust the number of workers - everything else + will be handled by Ray automatically. How many remote actors should I use? diff --git a/doc/source/train/getting-started.rst b/doc/source/train/getting-started.rst index 113eb6ff607e..a7105abed9db 100644 --- a/doc/source/train/getting-started.rst +++ b/doc/source/train/getting-started.rst @@ -6,186 +6,188 @@ Getting Started with Distributed Model Training in Ray Train Ray Train offers multiple ``Trainers`` which implement scalable model training for different machine learning frameworks. Here are examples for some of the commonly used trainers: -.. tabbed:: XGBoost +.. tab-set:: - In this example we will train a model using distributed XGBoost. + .. 
tab-item:: XGBoost - First, we load the dataset from S3 using Ray Datasets and split it into a - train and validation dataset. + In this example we will train a model using distributed XGBoost. - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgb_detail_intro_start__ - :end-before: __xgb_detail_intro_end__ + First, we load the dataset from S3 using Ray Data and split it into a + train and validation dataset. - In the :class:`ScalingConfig `, - we configure the number of workers to use: + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_intro_start__ + :end-before: __xgb_detail_intro_end__ - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgb_detail_scaling_start__ - :end-before: __xgb_detail_scaling_end__ + In the :class:`ScalingConfig `, + we configure the number of workers to use: - We then instantiate our XGBoostTrainer by passing in: + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_scaling_start__ + :end-before: __xgb_detail_scaling_end__ - - The aforementioned ``ScalingConfig``. - - The ``label_column`` refers to the column name containing the labels in the Ray Dataset - - The ``params`` are `XGBoost training parameters `__ + We then instantiate our XGBoostTrainer by passing in: - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgb_detail_training_start__ - :end-before: __xgb_detail_training_end__ + - The aforementioned ``ScalingConfig``. + - The ``label_column`` refers to the column name containing the labels in the Dataset + - The ``params`` are `XGBoost training parameters `__ - Lastly, we call ``trainer.fit()`` to kick off training and obtain the results. + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_training_start__ + :end-before: __xgb_detail_training_end__ - .. 
literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgb_detail_fit_start__ - :end-before: __xgb_detail_fit_end__ + Lastly, we call ``trainer.fit()`` to kick off training and obtain the results. -.. tabbed:: LightGBM + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_fit_start__ + :end-before: __xgb_detail_fit_end__ - In this example we will train a model using distributed LightGBM. + .. tab-item:: LightGBM - First, we load the dataset from S3 using Ray Datasets and split it into a - train and validation dataset. + In this example we will train a model using distributed LightGBM. - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __lgbm_detail_intro_start__ - :end-before: __lgbm_detail_intro_end__ + First, we load the dataset from S3 using Ray Data and split it into a + train and validation dataset. - In the :class:`ScalingConfig `, - we configure the number of workers to use: + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_intro_start__ + :end-before: __lgbm_detail_intro_end__ - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgb_detail_scaling_start__ - :end-before: __xgb_detail_scaling_end__ + In the :class:`ScalingConfig `, + we configure the number of workers to use: - We then instantiate our LightGBMTrainer by passing in: + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgb_detail_scaling_start__ + :end-before: __xgb_detail_scaling_end__ - - The aforementioned ``ScalingConfig`` - - The ``label_column`` refers to the column name containing the labels in the Ray Dataset - - The ``params`` are core `LightGBM training parameters `__ + We then instantiate our LightGBMTrainer by passing in: - .. 
literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __lgbm_detail_training_start__ - :end-before: __lgbm_detail_training_end__ + - The aforementioned ``ScalingConfig`` + - The ``label_column`` refers to the column name containing the labels in the Dataset + - The ``params`` are core `LightGBM training parameters `__ - And lastly we call ``trainer.fit()`` to kick off training and obtain the results. + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_training_start__ + :end-before: __lgbm_detail_training_end__ - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __lgbm_detail_fit_start__ - :end-before: __lgbm_detail_fit_end__ + And lastly we call ``trainer.fit()`` to kick off training and obtain the results. -.. tabbed:: PyTorch + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lgbm_detail_fit_start__ + :end-before: __lgbm_detail_fit_end__ - This example shows how you can use Ray Train with PyTorch. + .. tab-item:: PyTorch - First, set up your dataset and model. + This example shows how you can use Ray Train with PyTorch. - .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py - :language: python - :start-after: __torch_setup_begin__ - :end-before: __torch_setup_end__ + First, set up your dataset and model. + .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py + :language: python + :start-after: __torch_setup_begin__ + :end-before: __torch_setup_end__ - Now define your single-worker PyTorch training function. - .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py - :language: python - :start-after: __torch_single_begin__ - :end-before: __torch_single_end__ + Now define your single-worker PyTorch training function. - This training function can be executed with: + .. 
literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py + :language: python + :start-after: __torch_single_begin__ + :end-before: __torch_single_end__ - .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py - :language: python - :start-after: __torch_single_run_begin__ - :end-before: __torch_single_run_end__ - :dedent: + This training function can be executed with: - Now let's convert this to a distributed multi-worker training function! + .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py + :language: python + :start-after: __torch_single_run_begin__ + :end-before: __torch_single_run_end__ + :dedent: - All you have to do is use the ``ray.train.torch.prepare_model`` and - ``ray.train.torch.prepare_data_loader`` utility functions to - easily setup your model & data for distributed training. - This will automatically wrap your model with ``DistributedDataParallel`` - and place it on the right device, and add ``DistributedSampler`` to your DataLoaders. + Now let's convert this to a distributed multi-worker training function! - .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py - :language: python - :start-after: __torch_distributed_begin__ - :end-before: __torch_distributed_end__ + All you have to do is use the ``ray.train.torch.prepare_model`` and + ``ray.train.torch.prepare_data_loader`` utility functions to + easily setup your model & data for distributed training. + This will automatically wrap your model with ``DistributedDataParallel`` + and place it on the right device, and add ``DistributedSampler`` to your DataLoaders. - Then, instantiate a ``TorchTrainer`` - with 4 workers, and use it to run the new training function! + .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py + :language: python + :start-after: __torch_distributed_begin__ + :end-before: __torch_distributed_end__ - .. 
literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py - :language: python - :start-after: __torch_trainer_begin__ - :end-before: __torch_trainer_end__ - :dedent: + Then, instantiate a ``TorchTrainer`` + with 4 workers, and use it to run the new training function! - See :ref:`train-porting-code` for a more comprehensive example. + .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_quick_start.py + :language: python + :start-after: __torch_trainer_begin__ + :end-before: __torch_trainer_end__ + :dedent: -.. tabbed:: TensorFlow + See :ref:`train-porting-code` for a more comprehensive example. - This example shows how you can use Ray Train to set up `Multi-worker training - with Keras `_. + .. tab-item:: TensorFlow - First, set up your dataset and model. + This example shows how you can use Ray Train to set up `Multi-worker training + with Keras `_. - .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py - :language: python - :start-after: __tf_setup_begin__ - :end-before: __tf_setup_end__ + First, set up your dataset and model. - Now define your single-worker TensorFlow training function. + .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py + :language: python + :start-after: __tf_setup_begin__ + :end-before: __tf_setup_end__ - .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py - :language: python - :start-after: __tf_single_begin__ - :end-before: __tf_single_end__ + Now define your single-worker TensorFlow training function. - This training function can be executed with: + .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py + :language: python + :start-after: __tf_single_begin__ + :end-before: __tf_single_end__ - .. 
literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py - :language: python - :start-after: __tf_single_run_begin__ - :end-before: __tf_single_run_end__ - :dedent: + This training function can be executed with: - Now let's convert this to a distributed multi-worker training function! - All you need to do is: + .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py + :language: python + :start-after: __tf_single_run_begin__ + :end-before: __tf_single_run_end__ + :dedent: - 1. Set the per-worker batch size - each worker will process the same size - batch as in the single-worker code. - 2. Choose your TensorFlow distributed training strategy. In this example - we use the ``MultiWorkerMirroredStrategy``. + Now let's convert this to a distributed multi-worker training function! + All you need to do is: - .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py - :language: python - :start-after: __tf_distributed_begin__ - :end-before: __tf_distributed_end__ + 1. Set the per-worker batch size - each worker will process the same size + batch as in the single-worker code. + 2. Choose your TensorFlow distributed training strategy. In this example + we use the ``MultiWorkerMirroredStrategy``. - Then, instantiate a ``TensorflowTrainer`` with 4 workers, - and use it to run the new training function! + .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py + :language: python + :start-after: __tf_distributed_begin__ + :end-before: __tf_distributed_end__ - .. literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py - :language: python - :start-after: __tf_trainer_begin__ - :end-before: __tf_trainer_end__ - :dedent: + Then, instantiate a ``TensorflowTrainer`` with 4 workers, + and use it to run the new training function! - See :ref:`train-porting-code` for a more comprehensive example. + .. 
literalinclude:: /../../python/ray/train/examples/tf/tensorflow_quick_start.py + :language: python + :start-after: __tf_trainer_begin__ + :end-before: __tf_trainer_end__ + :dedent: + + See :ref:`train-porting-code` for a more comprehensive example. Next Steps ---------- -* To check how your application is doing, you can use the :ref:`Ray dashboard`. \ No newline at end of file +* To check how your application is doing, you can use the :ref:`Ray dashboard`. diff --git a/doc/source/train/key-concepts.rst b/doc/source/train/key-concepts.rst index c477099d2104..61ac57ac16e5 100644 --- a/doc/source/train/key-concepts.rst +++ b/doc/source/train/key-concepts.rst @@ -28,44 +28,46 @@ Deep Learning, Tree-Based, and other Trainers There are three categories of built-in Trainers: -.. tabbed:: Deep Learning Trainers +.. tab-set:: - Ray Train supports the following deep learning trainers: + .. tab-item:: Deep Learning Trainers - - :class:`TorchTrainer ` - - :class:`TensorflowTrainer ` - - :class:`HorovodTrainer ` + Ray Train supports the following deep learning trainers: - For these trainers, you usually define your own training function that loads the model - and executes single-worker training steps. Refer to the following guides for more details: + - :class:`TorchTrainer ` + - :class:`TensorflowTrainer ` + - :class:`HorovodTrainer ` - - :ref:`Deep learning user guide ` - - :ref:`Quick overview of deep-learning trainers in the Ray AIR documentation ` + For these trainers, you usually define your own training function that loads the model + and executes single-worker training steps. Refer to the following guides for more details: -.. tabbed:: Tree-Based Trainers + - :ref:`Deep learning user guide ` + - :ref:`Quick overview of deep-learning trainers in the Ray AIR documentation ` - Tree-based trainers utilize gradient-based decision trees for training. The most popular libraries - for this are XGBoost and LightGBM. + .. 
tab-item:: Tree-Based Trainers - - :class:`XGBoostTrainer ` - - :class:`LightGBMTrainer ` + Tree-based trainers utilize gradient-based decision trees for training. The most popular libraries + for this are XGBoost and LightGBM. - For these trainers, you just pass a dataset and parameters. The training loop is configured - automatically. + - :class:`XGBoostTrainer ` + - :class:`LightGBMTrainer ` - - :ref:`XGBoost/LightGBM user guide ` - - :ref:`Quick overview of tree-based trainers in the Ray AIR documentation ` + For these trainers, you just pass a dataset and parameters. The training loop is configured + automatically. + - :ref:`XGBoost/LightGBM user guide ` + - :ref:`Quick overview of tree-based trainers in the Ray AIR documentation ` -.. tabbed:: Other Trainers - Some trainers don't fit into the other two categories, such as: + .. tab-item:: Other Trainers - - :class:`HuggingFaceTrainer ` for NLP - - :class:`RLTrainer ` for reinforcement learning - - :class:`SklearnTrainer ` for (non-distributed) training of sklearn models. + Some trainers don't fit into the other two categories, such as: - - :ref:`Other trainers in the Ray AIR documentation ` + - :class:`TransformersTrainer ` for NLP + - :class:`RLTrainer ` for reinforcement learning + - :class:`SklearnTrainer ` for (non-distributed) training of sklearn models. + + - :ref:`Other trainers in the Ray AIR documentation ` .. _train-key-concepts-config: @@ -113,7 +115,7 @@ Each Trainer has a respective Predictor implementation that is compatible with i A predictor can be passed into a :class:`BatchPredictor ` is used to scale up prediction over a Ray cluster. -It takes a Ray Dataset as input. +It takes a Dataset as input. .. 
dropdown:: Example: Batch prediction with :class:`XGBoostPredictor ` diff --git a/doc/source/train/train.rst b/doc/source/train/train.rst index c79ab6ea278a..c2d7e1b11223 100644 --- a/doc/source/train/train.rst +++ b/doc/source/train/train.rst @@ -37,7 +37,7 @@ There are three broad categories of Trainers that Train offers: **Batteries included**: Train is part of :ref:`Ray AIR ` and seamlessly operates in the Ray ecosystem. -* Use :ref:`Ray Datasets ` with Train to load and process datasets both small and large. +* Use :ref:`Ray Data ` with Train to load and process datasets both small and large. * Use :ref:`Ray Tune ` with Train to sweep parameter grids and leverage cutting edge hyperparameter search algorithms. * Leverage the :ref:`Ray cluster launcher ` to launch autoscaling or spot instance clusters on any cloud. @@ -45,37 +45,38 @@ There are three broad categories of Trainers that Train offers: Quick Start to Distributed Training with Ray Train -------------------------------------------------- -.. tabbed:: XGBoost +.. tab-set:: - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __xgboost_start__ - :end-before: __xgboost_end__ + .. tab-item:: XGBoost -.. tabbed:: LightGBM + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __xgboost_start__ + :end-before: __xgboost_end__ - .. literalinclude:: doc_code/gbdt_user_guide.py - :language: python - :start-after: __lightgbm_start__ - :end-before: __lightgbm_end__ + .. tab-item:: LightGBM -.. tabbed:: Pytorch + .. literalinclude:: doc_code/gbdt_user_guide.py + :language: python + :start-after: __lightgbm_start__ + :end-before: __lightgbm_end__ - .. literalinclude:: /ray-air/doc_code/torch_trainer.py - :language: python + .. tab-item:: Pytorch -.. tabbed:: Tensorflow + .. literalinclude:: /ray-air/doc_code/torch_trainer.py + :language: python - .. 
literalinclude:: /ray-air/doc_code/tf_starter.py - :language: python - :start-after: __air_tf_train_start__ - :end-before: __air_tf_train_end__ + .. tab-item:: Tensorflow -.. tabbed:: Horovod + .. literalinclude:: /ray-air/doc_code/tf_starter.py + :language: python + :start-after: __air_tf_train_start__ + :end-before: __air_tf_train_end__ - .. literalinclude:: /ray-air/doc_code/hvd_trainer.py - :language: python + .. tab-item:: Horovod + .. literalinclude:: /ray-air/doc_code/hvd_trainer.py + :language: python .. _train-framework-catalog: @@ -108,9 +109,9 @@ classes that ship out of the box with Train: * - :class:`SklearnTrainer ` - :class:`SklearnCheckpoint ` - :class:`SklearnPredictor ` - * - :class:`HuggingFaceTrainer ` - - :class:`HuggingFaceCheckpoint ` - - :class:`HuggingFacePredictor ` + * - :class:`TransformersTrainer ` + - :class:`TransformersCheckpoint ` + - :class:`TransformersPredictor ` * - :class:`RLTrainer ` - :class:`RLCheckpoint ` - :class:`RLPredictor ` diff --git a/doc/source/train/user-guides.rst b/doc/source/train/user-guides.rst index 67d6cca88930..be20df04242a 100644 --- a/doc/source/train/user-guides.rst +++ b/doc/source/train/user-guides.rst @@ -3,45 +3,39 @@ Ray Train User Guides ===================== -.. panels:: - :container: container pb-4 full-width - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: config_guide - :type: ref - :text: Configurations User Guide - :classes: btn-link btn-block stretched-link - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: dl_guide - :type: ref - :text: Deep Learning User Guide - :classes: btn-link btn-block stretched-link - - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. 
link-button:: gbdt - :type: ref - :text: XGBoost / LightGBM User Guide - :classes: btn-link btn-block stretched-link - - --- - :img-top: /ray-overview/images/ray_svg_logo.svg - - +++ - .. link-button:: architecture - :type: ref - :text: Ray Train Architecture - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: config_guide + + Configurations User Guide + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: dl_guide + + Deep Learning User Guide + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: gbdt + + XGBoost / LightGBM User Guide + + .. grid-item-card:: + :img-top: /ray-overview/images/ray_svg_logo.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: architecture + + Ray Train Architecture diff --git a/doc/source/tune/api/cli.rst b/doc/source/tune/api/cli.rst index b3d69a3f66f6..8e7ecc5bcac9 100644 --- a/doc/source/tune/api/cli.rst +++ b/doc/source/tune/api/cli.rst @@ -2,11 +2,6 @@ Tune CLI (Experimental) ======================= ``tune`` has an easy-to-use command line interface (CLI) to manage and monitor your experiments on Ray. -To do this, verify that you have the ``tabulate`` library installed: - -.. 
code-block:: bash - - $ pip install tabulate Here is an example command line call: diff --git a/doc/source/tune/api/env.rst b/doc/source/tune/api/env.rst index 8ff39bd73bae..c6846107484e 100644 --- a/doc/source/tune/api/env.rst +++ b/doc/source/tune/api/env.rst @@ -21,6 +21,7 @@ These are the environment variables Ray Tune currently considers: * **TUNE_DISABLE_DATED_SUBDIR**: Ray Tune automatically adds a date string to experiment directories when the name is not specified explicitly or the trainable isn't passed as a string. Setting this environment variable to ``1`` disables adding these date strings. +* **TUNE_NEW_EXECUTION**: Disable :ref:`Ray Tune's new execution engine `. * **TUNE_DISABLE_STRICT_METRIC_CHECKING**: When you report metrics to Tune via ``session.report()`` and passed a ``metric`` parameter to ``Tuner()``, a scheduler, or a search algorithm, Tune will error @@ -89,6 +90,8 @@ These are the environment variables Ray Tune currently considers: repeatedly every this amount of seconds. Defaults to 60 (seconds). * **TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S**: Threshold for throwing a warning if the experiment state is synced multiple times in that many seconds. Defaults to 30 (seconds). +* **TUNE_WARN_SLOW_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S**: Threshold for throwing a warning if the experiment state syncing + takes longer than this time in seconds. Defaults to 30 (seconds). * **TUNE_STATE_REFRESH_PERIOD**: Frequency of updating the resource tracking from Ray. Defaults to 10 (seconds). * **TUNE_RESTORE_RETRY_NUM**: The number of retries that are done before a particular trial's restore is determined unsuccessful. After that, the trial is not restored to its previous checkpoint but rather from scratch. 
diff --git a/doc/source/tune/examples/experiment-tracking.rst b/doc/source/tune/examples/experiment-tracking.rst index 3fbbb8157cbf..2a14d75b2301 100644 --- a/doc/source/tune/examples/experiment-tracking.rst +++ b/doc/source/tune/examples/experiment-tracking.rst @@ -6,42 +6,39 @@ such as CometML, or Weights & Biases. If you're interested in learning how to use Ray Tune with Tensorboard, you can find more information in our :ref:`Guide to logging and outputs `. -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/aim_logo.png - - +++ - .. link-button:: tune-aim-ref - :type: ref - :text: Using Aim with Ray Tune For Experiment Management - :classes: btn-link btn-block stretched-link - --- - :img-top: /images/comet_logo_full.png - - +++ - .. link-button:: tune-comet-ref - :type: ref - :text: Using Comet with Ray Tune For Experiment Management - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/wandb_logo.png - - +++ - .. link-button:: tune-wandb-ref - :type: ref - :text: Tracking Your Experiment Process Weights & Biases - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/mlflow.png - - +++ - .. link-button:: tune-mlflow-ref - :type: ref - :text: Using MLflow Tracking & AutoLogging with Tune - :classes: btn-link btn-block stretched-link \ No newline at end of file +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + + .. grid-item-card:: + :img-top: /images/aim_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-aim-ref + + Using Aim with Ray Tune For Experiment Management + + .. grid-item-card:: + :img-top: /images/comet_logo_full.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-comet-ref + + Using Comet with Ray Tune For Experiment Management + + .. 
grid-item-card:: + :img-top: /images/wandb_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-wandb-ref + + Tracking Your Experiment Process Weights & Biases + + .. grid-item-card:: + :img-top: /images/mlflow.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-mlflow-ref + + Using MLflow Tracking & AutoLogging with Tune diff --git a/doc/source/tune/examples/hpo-frameworks.rst b/doc/source/tune/examples/hpo-frameworks.rst index 6541964d2b83..cd66e61d01b2 100644 --- a/doc/source/tune/examples/hpo-frameworks.rst +++ b/doc/source/tune/examples/hpo-frameworks.rst @@ -5,116 +5,102 @@ Tune integrates with a wide variety of hyperparameter optimization frameworks and their respective search algorithms. Here you can find detailed examples on each of our integrations: -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: ../images/ax.png - - +++ - .. link-button:: ax_example - :type: ref - :text: How To Use Tune With Ax - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/dragonfly.png - - +++ - .. link-button:: dragonfly_example - :type: ref - :text: How To Use Tune With Dragonfly - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/skopt.png - - +++ - .. link-button:: skopt_example - :type: ref - :text: How To Use Tune With Scikit-Optimize - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/hyperopt.png - - +++ - .. link-button:: hyperopt_example - :type: ref - :text: How To Use Tune With HyperOpt - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/bayesopt.png - - +++ - .. link-button:: bayesopt_example - :type: ref - :text: How To Use Tune With BayesOpt - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/flaml.png - - +++ - .. 
link-button:: flaml_example - :type: ref - :text: How To Use Tune With BlendSearch and CFO - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/bohb.png - - +++ - .. link-button:: bohb_example - :type: ref - :text: How To Use Tune With TuneBOHB - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/nevergrad.png - - +++ - .. link-button:: nevergrad_example - :type: ref - :text: How To Use Tune With Nevergrad - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/optuna.png - - +++ - .. link-button:: optuna_example - :type: ref - :text: How To Use Tune With Optuna - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/zoopt.png - - +++ - .. link-button:: zoopt_example - :type: ref - :text: How To Use Tune With ZOOpt - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/sigopt.png - - +++ - .. link-button:: sigopt_example - :type: ref - :text: How To Use Tune With SigOpt - :classes: btn-link btn-block stretched-link - - --- - :img-top: ../images/hebo.png - - +++ - .. link-button:: hebo_example - :type: ref - :text: How To Use Tune With HEBO - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + .. grid-item-card:: + :img-top: ../images/ax.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ax_example + + How To Use Tune With Ax + + .. grid-item-card:: + :img-top: ../images/dragonfly.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: dragonfly_example + + How To Use Tune With Dragonfly + + .. grid-item-card:: + :img-top: ../images/skopt.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: skopt_example + + How To Use Tune With Scikit-Optimize + + .. grid-item-card:: + :img-top: ../images/hyperopt.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: hyperopt_example + + How To Use Tune With HyperOpt + + .. grid-item-card:: + :img-top: ../images/bayesopt.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: bayesopt_example + + How To Use Tune With BayesOpt + + .. grid-item-card:: + :img-top: ../images/flaml.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: flaml_example + + How To Use Tune With BlendSearch and CFO + + .. grid-item-card:: + :img-top: ../images/bohb.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: bohb_example + + How To Use Tune With TuneBOHB + + .. grid-item-card:: + :img-top: ../images/nevergrad.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: nevergrad_example + + How To Use Tune With Nevergrad + + .. grid-item-card:: + :img-top: ../images/optuna.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: optuna_example + + How To Use Tune With Optuna + + .. grid-item-card:: + :img-top: ../images/zoopt.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: zoopt_example + + How To Use Tune With ZOOpt + + .. grid-item-card:: + :img-top: ../images/sigopt.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: sigopt_example + + How To Use Tune With SigOpt + + .. grid-item-card:: + :img-top: ../images/hebo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: hebo_example + + How To Use Tune With HEBO diff --git a/doc/source/tune/examples/ml-frameworks.rst b/doc/source/tune/examples/ml-frameworks.rst index a284be504916..441c8286693f 100644 --- a/doc/source/tune/examples/ml-frameworks.rst +++ b/doc/source/tune/examples/ml-frameworks.rst @@ -5,107 +5,94 @@ Ray Tune integrates with many popular machine learning frameworks. Here you find a few practical examples showing you how to tune your models. 
At the end of these guides you will often find links to even more examples. -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-75 d-block mx-auto - - --- - :img-top: /images/tune-sklearn.png - - +++ - .. link-button:: tune-sklearn - :type: ref - :text: How To Use Tune's Scikit-Learn Adapters? - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/keras.png - - +++ - .. link-button:: tune-mnist-keras - :type: ref - :text: How To Use Tune With Keras & TF Models - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/pytorch_logo.png - - +++ - .. link-button:: tune-pytorch-cifar-ref - :type: ref - :text: How To Use Tune With PyTorch Models - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/pytorch_lightning_small.png - - +++ - .. link-button:: tune-pytorch-lightning-ref - :type: ref - :text: How To Tune PyTorch Lightning Models - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/mxnet_logo.png - - +++ - .. link-button:: tune-mxnet-example - :type: ref - :text: How To Tune MXNet Models - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/serve.svg - - +++ - .. link-button:: tune-serve-integration-mnist - :type: ref - :text: Model Selection & Serving With Ray Serve - :classes: btn-link btn-block stretched-link - - --- - :img-top: /rllib/images/rllib-logo.png - - +++ - .. link-button:: tune-rllib-example - :type: ref - :text: Tuning RL Experiments With Ray Tune & Ray Serve - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/xgboost_logo.png - - +++ - .. link-button:: tune-xgboost-ref - :type: ref - :text: A Guide To Tuning XGBoost Parameters With Tune - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/lightgbm_logo.png - - +++ - .. 
link-button:: tune-lightgbm-example - :type: ref - :text: A Guide To Tuning LightGBM Parameters With Tune - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/horovod.png - - +++ - .. link-button:: tune-horovod-example - :type: ref - :text: A Guide To Tuning Horovod Parameters With Tune - :classes: btn-link btn-block stretched-link - - --- - :img-top: /images/hugging.png - - +++ - .. link-button:: tune-huggingface-example - :type: ref - :text: A Guide To Tuning Huggingface Transformers With Tune - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + .. grid-item-card:: + :img-top: /images/tune-sklearn.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-sklearn + + How To Use Tune's Scikit-Learn Adapters? + + .. grid-item-card:: + :img-top: /images/keras.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-mnist-keras + + How To Use Tune With Keras & TF Models + + .. grid-item-card:: + :img-top: /images/pytorch_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-pytorch-cifar-ref + + How To Use Tune With PyTorch Models + + .. grid-item-card:: + :img-top: /images/pytorch_lightning_small.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-pytorch-lightning-ref + + How To Tune PyTorch Lightning Models + + .. grid-item-card:: + :img-top: /images/mxnet_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-mxnet-example + + How To Tune MXNet Models + + .. grid-item-card:: + :img-top: /images/serve.svg + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-serve-integration-mnist + + Model Selection & Serving With Ray Serve + + .. grid-item-card:: + :img-top: /rllib/images/rllib-logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: tune-rllib-example + + Tuning RL Experiments With Ray Tune & Ray Serve + + .. grid-item-card:: + :img-top: /images/xgboost_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-xgboost-ref + + A Guide To Tuning XGBoost Parameters With Tune + + .. grid-item-card:: + :img-top: /images/lightgbm_logo.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-lightgbm-example + + A Guide To Tuning LightGBM Parameters With Tune + + .. grid-item-card:: + :img-top: /images/horovod.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-horovod-example + + A Guide To Tuning Horovod Parameters With Tune + + .. grid-item-card:: + :img-top: /images/hugging.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-huggingface-example + + A Guide To Tuning Huggingface Transformers With Tune diff --git a/doc/source/tune/examples/optuna_example.ipynb b/doc/source/tune/examples/optuna_example.ipynb index d020da9bbd5f..2b2d7f36fd53 100644 --- a/doc/source/tune/examples/optuna_example.ipynb +++ b/doc/source/tune/examples/optuna_example.ipynb @@ -264,7 +264,7 @@ "id": "4287fa79", "metadata": {}, "source": [ - "We also constrain the the number of concurrent trials to `4` with a `ConcurrencyLimiter`." + "We also constrain the number of concurrent trials to `4` with a `ConcurrencyLimiter`." ] }, { diff --git a/doc/source/tune/examples/sigopt_example.ipynb b/doc/source/tune/examples/sigopt_example.ipynb index 4d884dbb745c..ac478b0a3547 100644 --- a/doc/source/tune/examples/sigopt_example.ipynb +++ b/doc/source/tune/examples/sigopt_example.ipynb @@ -375,7 +375,7 @@ "id": "3d8441a6", "metadata": {}, "source": [ - "And here are they hyperparameters found to minimize the the objective on average." + "And here are they hyperparameters found to minimize the objective on average." 
] }, { diff --git a/doc/source/tune/examples/skopt_example.ipynb b/doc/source/tune/examples/skopt_example.ipynb index 40a5f97ffd33..84e8a7741460 100644 --- a/doc/source/tune/examples/skopt_example.ipynb +++ b/doc/source/tune/examples/skopt_example.ipynb @@ -225,7 +225,7 @@ "id": "2892b243", "metadata": {}, "source": [ - "The search algorithm is instantiated from the `SkOptSearch` class. We also constrain the the number of concurrent trials to `4` with a `ConcurrencyLimiter`." + "The search algorithm is instantiated from the `SkOptSearch` class. We also constrain the number of concurrent trials to `4` with a `ConcurrencyLimiter`." ] }, { diff --git a/doc/source/tune/examples/tune-pytorch-lightning.ipynb b/doc/source/tune/examples/tune-pytorch-lightning.ipynb index c8083b5ead01..5b364a4497c8 100644 --- a/doc/source/tune/examples/tune-pytorch-lightning.ipynb +++ b/doc/source/tune/examples/tune-pytorch-lightning.ipynb @@ -582,6 +582,7 @@ "\n", "- {ref}`Use LightningTrainer for Image Classification `.\n", "- {ref}`Use LightningTrainer with Ray Data and Batch Predictor `\n", + "- {ref}`Fine-tune a Large Language Model with LightningTrainer and FSDP `\n", "- {doc}`/tune/examples/includes/mlflow_ptl_example`: Example for using [MLflow](https://github.com/mlflow/mlflow/)\n", " and [Pytorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning) with Ray Tune.\n", "- {doc}`/tune/examples/includes/mnist_ptl_mini`:\n", @@ -607,7 +608,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.15" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/doc/source/tune/examples/tune-xgboost.ipynb b/doc/source/tune/examples/tune-xgboost.ipynb index cde9b8ce92e0..08f98ef1933e 100644 --- a/doc/source/tune/examples/tune-xgboost.ipynb +++ b/doc/source/tune/examples/tune-xgboost.ipynb @@ -1,1256 +1,1256 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "edce67b9", - "metadata": {}, - "source": [ - "# Tuning XGBoost 
hyperparameters with Ray Tune\n", - "\n", - "(tune-xgboost-ref)=\n", - "\n", - "XGBoost is currently one of the most popular machine learning algorithms. It performs\n", - "very well on a large selection of tasks, and was the key to success in many Kaggle\n", - "competitions.\n", - "\n", - "```{image} /images/xgboost_logo.png\n", - ":align: center\n", - ":alt: XGBoost\n", - ":target: https://xgboost.readthedocs.io/en/latest/\n", - ":width: 200px\n", - "```\n", - "\n", - "This tutorial will give you a quick introduction to XGBoost, show you how\n", - "to train an XGBoost model, and then guide you on how to optimize XGBoost\n", - "parameters using Tune to get the best performance. We tackle the following topics:\n", - "\n", - "```{contents}\n", - ":depth: 2\n", - "```\n", - "\n", - ":::{note}\n", - "To run this tutorial, you will need to install the following:\n", - "\n", - "```bash\n", - "$ pip install xgboost\n", - "```\n", - ":::\n", - "\n", - "## What is XGBoost\n", - "\n", - "XGBoost is an acronym for e**X**treme **G**radient **Boost**ing. Internally,\n", - "XGBoost uses [decision trees](https://en.wikipedia.org/wiki/Decision_tree). Instead\n", - "of training just one large decision tree, XGBoost and other related algorithms train\n", - "many small decision trees. The intuition behind this is that even though single\n", - "decision trees can be inaccurate and suffer from high variance,\n", - "combining the output of a large number of these weak learners can actually lead to\n", - "strong learner, resulting in better predictions and less variance.\n", - "\n", - ":::{figure} /images/tune-xgboost-ensemble.svg\n", - ":alt: Single vs. ensemble learning\n", - "\n", - "A single decision tree (left) might be able to get to an accuracy of 70%\n", - "for a binary classification task. 
By combining the output of several small\n", - "decision trees, an ensemble learner (right) might end up with a higher accuracy\n", - "of 90%.\n", - ":::\n", - "\n", - "Boosting algorithms start with a single small decision tree and evaluate how well\n", - "it predicts the given examples. When building the next tree, those samples that have\n", - "been misclassified before have a higher chance of being used to generate the tree.\n", - "This is useful because it avoids overfitting to samples that can be easily classified\n", - "and instead tries to come up with models that are able to classify hard examples, too.\n", - "Please see [here for a more thorough introduction to bagging and boosting algorithms](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205).\n", - "\n", - "There are many boosting algorithms. In their core, they are all very similar. XGBoost\n", - "uses second-level derivatives to find splits that maximize the *gain* (the inverse of\n", - "the *loss*) - hence the name. In practice, there really is no drawback in using\n", - "XGBoost over other boosting algorithms - in fact, it usually shows the best performance.\n", - "\n", - "## Training a simple XGBoost classifier\n", - "\n", - "Let's first see how a simple XGBoost classifier can be trained. We'll use the\n", - "`breast_cancer`-Dataset included in the `sklearn` dataset collection. This is\n", - "a binary classification dataset. 
Given 30 different input features, our task is to\n", - "learn to identify subjects with breast cancer and those without.\n", - "\n", - "Here is the full code to train a simple XGBoost model:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "77b3c71c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.9650\n" - ] - } - ], - "source": [ - "import sklearn.datasets\n", - "import sklearn.metrics\n", - "from sklearn.model_selection import train_test_split\n", - "import xgboost as xgb\n", - "\n", - "\n", - "def train_breast_cancer(config):\n", - " # Load dataset\n", - " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", - " # Split into train and test set\n", - " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", - " # Build input matrices for XGBoost\n", - " train_set = xgb.DMatrix(train_x, label=train_y)\n", - " test_set = xgb.DMatrix(test_x, label=test_y)\n", - " # Train the classifier\n", - " results = {}\n", - " bst = xgb.train(\n", - " config,\n", - " train_set,\n", - " evals=[(test_set, \"eval\")],\n", - " evals_result=results,\n", - " verbose_eval=False,\n", - " )\n", - " return results\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " results = train_breast_cancer(\n", - " {\"objective\": \"binary:logistic\", \"eval_metric\": [\"logloss\", \"error\"]}\n", - " )\n", - " accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", - " print(f\"Accuracy: {accuracy:.4f}\")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ec2a13f8", - "metadata": {}, - "source": [ - "As you can see, the code is quite simple. First, the dataset is loaded and split\n", - "into a `test` and `train` set. The XGBoost model is trained with `xgb.train()`.\n", - "XGBoost automatically evaluates metrics we specified on the test set. 
In our case\n", - "it calculates the *logloss* and the prediction *error*, which is the percentage of\n", - "misclassified examples. To calculate the accuracy, we just have to subtract the error\n", - "from `1.0`. Even in this simple example, most runs result\n", - "in a good accuracy of over `0.90`.\n", - "\n", - "Maybe you have noticed the `config` parameter we pass to the XGBoost algorithm. This\n", - "is a {class}`dict` in which you can specify parameters for the XGBoost algorithm. In this\n", - "simple example, the only parameters we passed are the `objective` and `eval_metric` parameters.\n", - "The value `binary:logistic` tells XGBoost that we aim to train a logistic regression model for\n", - "a binary classification task. You can find an overview over all valid objectives\n", - "[here in the XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters).\n", - "\n", - "## XGBoost Hyperparameters\n", - "\n", - "Even with the default settings, XGBoost was able to get to a good accuracy on the\n", - "breast cancer dataset. However, as in many machine learning algorithms, there are\n", - "many knobs to tune which might lead to even better performance. Let's explore some of\n", - "them below.\n", - "\n", - "### Maximum tree depth\n", - "\n", - "Remember that XGBoost internally uses many decision tree models to come up with\n", - "predictions. When training a decision tree, we need to tell the algorithm how\n", - "large the tree may get. The parameter for this is called the tree *depth*.\n", - "\n", - ":::{figure} /images/tune-xgboost-depth.svg\n", - ":align: center\n", - ":alt: Decision tree depth\n", - "\n", - "In this image, the left tree has a depth of 2, and the right tree a depth of 3.\n", - "Note that with each level, $2^{(d-1)}$ splits are added, where *d* is the depth\n", - "of the tree.\n", - ":::\n", - "\n", - "Tree depth is a property that concerns the model complexity. 
If you only allow short\n", - "trees, the models are likely not very precise - they underfit the data. If you allow\n", - "very large trees, the single models are likely to overfit to the data. In practice,\n", - "a number between `2` and `6` is often a good starting point for this parameter.\n", - "\n", - "XGBoost's default value is `3`.\n", - "\n", - "### Minimum child weight\n", - "\n", - "When a decision tree creates new leaves, it splits up the remaining data at one node\n", - "into two groups. If there are only few samples in one of these groups, it often\n", - "doesn't make sense to split it further. One of the reasons for this is that the\n", - "model is harder to train when we have fewer samples.\n", - "\n", - ":::{figure} /images/tune-xgboost-weight.svg\n", - ":align: center\n", - ":alt: Minimum child weight\n", - "\n", - "In this example, we start with 100 examples. At the first node, they are split\n", - "into 4 and 96 samples, respectively. In the next step, our model might find\n", - "that it doesn't make sense to split the 4 examples more. It thus only continues\n", - "to add leaves on the right side.\n", - ":::\n", - "\n", - "The parameter used by the model to decide if it makes sense to split a node is called\n", - "the *minimum child weight*. In the case of linear regression, this is just the absolute\n", - "number of nodes requried in each child. In other objectives, this value is determined\n", - "using the weights of the examples, hence the name.\n", - "\n", - "The larger the value, the more constrained the trees are and the less deep they will be.\n", - "This parameter thus also affects the model complexity. Values can range between 0\n", - "and infinity and are dependent on the sample size. For our ca. 
500 examples in the\n", - "breast cancer dataset, values between `0` and `10` should be sensible.\n", - "\n", - "XGBoost's default value is `1`.\n", - "\n", - "### Subsample size\n", - "\n", - "Each decision tree we add is trained on a subsample of the total training dataset.\n", - "The probabilities for the samples are weighted according to the XGBoost algorithm,\n", - "but we can decide on which fraction of the samples we want to train each decision\n", - "tree on.\n", - "\n", - "Setting this value to `0.7` would mean that we randomly sample `70%` of the\n", - "training dataset before each training iteration.\n", - "\n", - "XGBoost's default value is `1`.\n", - "\n", - "### Learning rate / Eta\n", - "\n", - "Remember that XGBoost sequentially trains many decision trees, and that later trees\n", - "are more likely trained on data that has been misclassified by prior trees. In effect\n", - "this means that earlier trees make decisions for easy samples (i.e. those samples that\n", - "can easily be classified) and later trees make decisions for harder samples. It is then\n", - "sensible to assume that the later trees are less accurate than earlier trees.\n", - "\n", - "To address this fact, XGBoost uses a parameter called *Eta*, which is sometimes called\n", - "the *learning rate*. 
Don't confuse this with learning rates from gradient descent!\n", - "The original [paper on stochastic gradient boosting](https://jerryfriedman.su.domains/ftp/stobst.pdf)\n", - "introduces this parameter like so:\n", - "\n", - "$$\n", - "F_m(x) = F_{m-1}(x) + \\eta \\cdot \\gamma_{lm} \\textbf{1}(x \\in R_{lm})\n", - "$$\n", - "\n", - "This is just a complicated way to say that when we train we new decision tree,\n", - "represented by $\\gamma_{lm} \\textbf{1}(x \\in R_{lm})$, we want to dampen\n", - "its effect on the previous prediction $F_{m-1}(x)$ with a factor\n", - "$\\eta$.\n", - "\n", - "Typical values for this parameter are between `0.01` and `` 0.3` ``.\n", - "\n", - "XGBoost's default value is `0.3`.\n", - "\n", - "### Number of boost rounds\n", - "\n", - "Lastly, we can decide on how many boosting rounds we perform, which means how\n", - "many decision trees we ultimately train. When we do heavy subsampling or use small\n", - "learning rate, it might make sense to increase the number of boosting rounds.\n", - "\n", - "XGBoost's default value is `10`.\n", - "\n", - "### Putting it together\n", - "\n", - "Let's see how this looks like in code! We just need to adjust our `config` dict:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "35073e88", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.9790\n" - ] - } - ], - "source": [ - "if __name__ == \"__main__\":\n", - " config = {\n", - " \"objective\": \"binary:logistic\",\n", - " \"eval_metric\": [\"logloss\", \"error\"],\n", - " \"max_depth\": 2,\n", - " \"min_child_weight\": 0,\n", - " \"subsample\": 0.8,\n", - " \"eta\": 0.2,\n", - " }\n", - " results = train_breast_cancer(config)\n", - " accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", - " print(f\"Accuracy: {accuracy:.4f}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "69cf0c13", - "metadata": {}, - "source": [ - "The rest stays the same. 
Please note that we do not adjust the `num_boost_rounds` here.\n", - "The result should also show a high accuracy of over 90%.\n", - "\n", - "## Tuning the configuration parameters\n", - "\n", - "XGBoosts default parameters already lead to a good accuracy, and even our guesses in the\n", - "last section should result in accuracies well above 90%. However, our guesses were\n", - "just that: guesses. Often we do not know what combination of parameters would actually\n", - "lead to the best results on a machine learning task.\n", - "\n", - "Unfortunately, there are infinitely many combinations of hyperparameters we could try\n", - "out. Should we combine `max_depth=3` with `subsample=0.8` or with `subsample=0.9`?\n", - "What about the other parameters?\n", - "\n", - "This is where hyperparameter tuning comes into play. By using tuning libraries such as\n", - "Ray Tune we can try out combinations of hyperparameters. Using sophisticated search\n", - "strategies, these parameters can be selected so that they are likely to lead to good\n", - "results (avoiding an expensive *exhaustive search*). Also, trials that do not perform\n", - "well can be preemptively stopped to reduce waste of computing resources. Lastly, Ray Tune\n", - "also takes care of training these runs in parallel, greatly increasing search speed.\n", - "\n", - "Let's start with a basic example on how to use Tune for this. We just need to make\n", - "a few changes to our code-block:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ff856a82", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-22 15:52:52,004\tINFO services.py:1483 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8268\u001b[39m\u001b[22m\n", - "2022-07-22 15:52:55,858\tWARNING function_trainable.py:619 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. 
To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n" - ] - }, - { - "data": { - "text/html": [ - "== Status ==
    Current time: 2022-07-22 15:53:04 (running for 00:00:07.77)
    Memory usage on this node: 10.5/16.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.57 GiB heap, 0.0/2.0 GiB objects
    Result logdir: /Users/kai/ray_results/train_breast_cancer_2022-07-22_15-52-48
    Number of trials: 10/10 (10 TERMINATED)
    \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
    Trial name status loc eta max_depth min_child_weight subsample acc iter total time (s)
    train_breast_cancer_f8669_00000TERMINATED127.0.0.1:488520.0069356 5 3 0.8235040.944056 1 0.0316169
    train_breast_cancer_f8669_00001TERMINATED127.0.0.1:488570.00145619 6 3 0.8329470.958042 1 0.0328588
    train_breast_cancer_f8669_00002TERMINATED127.0.0.1:488580.00108208 7 3 0.9873190.944056 1 0.0319381
    train_breast_cancer_f8669_00003TERMINATED127.0.0.1:488590.00530429 8 2 0.6156910.923077 1 0.028388
    train_breast_cancer_f8669_00004TERMINATED127.0.0.1:488600.000721843 8 1 0.6509730.958042 1 0.0299618
    train_breast_cancer_f8669_00005TERMINATED127.0.0.1:488610.0074509 1 1 0.7383410.874126 1 0.0193682
    train_breast_cancer_f8669_00006TERMINATED127.0.0.1:488620.0879882 8 2 0.6715760.944056 1 0.0267372
    train_breast_cancer_f8669_00007TERMINATED127.0.0.1:488630.0765404 7 2 0.7081570.965035 1 0.0276129
    train_breast_cancer_f8669_00008TERMINATED127.0.0.1:488640.000627649 6 1 0.81121 0.951049 1 0.0310998
    train_breast_cancer_f8669_00009TERMINATED127.0.0.1:488650.000383711 2 3 0.9905790.93007 1 0.0274954


    " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-22 15:52:57,385\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/kai/coding/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/kai/coding/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result for train_breast_cancer_f8669_00000:\n", - " date: 2022-07-22_15-53-00\n", - " done: true\n", - " experiment_id: 07d10c5f31e74133b53272b7ccf9c528\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.9440559440559441\n", - " node_ip: 127.0.0.1\n", - " pid: 48852\n", - " time_since_restore: 0.031616926193237305\n", - " time_this_iter_s: 0.031616926193237305\n", - " time_total_s: 0.031616926193237305\n", - " timestamp: 1658501580\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00000\n", - " warmup_time: 0.0027849674224853516\n", - " \n", - "Result for train_breast_cancer_f8669_00009:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: bc0d5dd2d079432b859faac8a18928f0\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.9300699300699301\n", - " node_ip: 127.0.0.1\n", - " pid: 48865\n", - " time_since_restore: 0.027495384216308594\n", - " time_this_iter_s: 0.027495384216308594\n", - " time_total_s: 0.027495384216308594\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00009\n", - " warmup_time: 0.005235910415649414\n", - " \n", - "Result for train_breast_cancer_f8669_00001:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: 4b10d350d4374a0d9e7d0c3b1d4e3203\n", - " 
hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.958041958041958\n", - " node_ip: 127.0.0.1\n", - " pid: 48857\n", - " time_since_restore: 0.032858848571777344\n", - " time_this_iter_s: 0.032858848571777344\n", - " time_total_s: 0.032858848571777344\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00001\n", - " warmup_time: 0.004731178283691406\n", - " \n", - "Result for train_breast_cancer_f8669_00008:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: 91c25cbbeb6f409d93e1d6537cb8e1ee\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.951048951048951\n", - " node_ip: 127.0.0.1\n", - " pid: 48864\n", - " time_since_restore: 0.031099796295166016\n", - " time_this_iter_s: 0.031099796295166016\n", - " time_total_s: 0.031099796295166016\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00008\n", - " warmup_time: 0.003270864486694336\n", - " \n", - "Result for train_breast_cancer_f8669_00005:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: d225b0fb59e14da7adba952456ccf1d5\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.8741258741258742\n", - " node_ip: 127.0.0.1\n", - " pid: 48861\n", - " time_since_restore: 0.01936817169189453\n", - " time_this_iter_s: 0.01936817169189453\n", - " time_total_s: 0.01936817169189453\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00005\n", - " warmup_time: 0.003901958465576172\n", - " \n", - "Result for train_breast_cancer_f8669_00004:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: 322484af6ea5422f8aaf8ff6a91af4f7\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " 
mean_accuracy: 0.958041958041958\n", - " node_ip: 127.0.0.1\n", - " pid: 48860\n", - " time_since_restore: 0.029961824417114258\n", - " time_this_iter_s: 0.029961824417114258\n", - " time_total_s: 0.029961824417114258\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00004\n", - " warmup_time: 0.003547191619873047\n", - " \n", - "Result for train_breast_cancer_f8669_00002:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: 3f588954160b42ce8ce200f68127ebcd\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.9440559440559441\n", - " node_ip: 127.0.0.1\n", - " pid: 48858\n", - " time_since_restore: 0.03193807601928711\n", - " time_this_iter_s: 0.03193807601928711\n", - " time_total_s: 0.03193807601928711\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00002\n", - " warmup_time: 0.003523111343383789\n", - " \n", - "Result for train_breast_cancer_f8669_00003:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: a39ea777ce2d4ebca51b3d7a4179dae5\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.9230769230769231\n", - " node_ip: 127.0.0.1\n", - " pid: 48859\n", - " time_since_restore: 0.028388023376464844\n", - " time_this_iter_s: 0.028388023376464844\n", - " time_total_s: 0.028388023376464844\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00003\n", - " warmup_time: 0.0035560131072998047\n", - " \n", - "Result for train_breast_cancer_f8669_00006:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: f97c6b9674854f8d89ec26ba58cc1618\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.9440559440559441\n", - " node_ip: 127.0.0.1\n", - " pid: 
48862\n", - " time_since_restore: 0.026737213134765625\n", - " time_this_iter_s: 0.026737213134765625\n", - " time_total_s: 0.026737213134765625\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00006\n", - " warmup_time: 0.003425121307373047\n", - " \n", - "Result for train_breast_cancer_f8669_00007:\n", - " date: 2022-07-22_15-53-04\n", - " done: true\n", - " experiment_id: ff172037065a4d55998ed72f51bdc5df\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " mean_accuracy: 0.965034965034965\n", - " node_ip: 127.0.0.1\n", - " pid: 48863\n", - " time_since_restore: 0.027612924575805664\n", - " time_this_iter_s: 0.027612924575805664\n", - " time_total_s: 0.027612924575805664\n", - " timestamp: 1658501584\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: f8669_00007\n", - " warmup_time: 0.0031311511993408203\n", - " \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-22 15:53:04,846\tINFO tune.py:738 -- Total run time: 8.99 seconds (7.74 seconds for the tuning loop).\n" - ] - } - ], - "source": [ - "import sklearn.datasets\n", - "import sklearn.metrics\n", - "\n", - "from ray import air, tune\n", - "from ray.air import session\n", - "\n", - "\n", - "def train_breast_cancer(config):\n", - " # Load dataset\n", - " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", - " # Split into train and test set\n", - " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", - " # Build input matrices for XGBoost\n", - " train_set = xgb.DMatrix(train_x, label=train_y)\n", - " test_set = xgb.DMatrix(test_x, label=test_y)\n", - " # Train the classifier\n", - " results = {}\n", - " xgb.train(\n", - " config,\n", - " train_set,\n", - " evals=[(test_set, \"eval\")],\n", - " evals_result=results,\n", - " verbose_eval=False,\n", - " )\n", - " # Return prediction 
accuracy\n", - " accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", - " session.report({\"mean_accuracy\": accuracy, \"done\": True})\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " config = {\n", - " \"objective\": \"binary:logistic\",\n", - " \"eval_metric\": [\"logloss\", \"error\"],\n", - " \"max_depth\": tune.randint(1, 9),\n", - " \"min_child_weight\": tune.choice([1, 2, 3]),\n", - " \"subsample\": tune.uniform(0.5, 1.0),\n", - " \"eta\": tune.loguniform(1e-4, 1e-1),\n", - " }\n", - " tuner = tune.Tuner(\n", - " train_breast_cancer,\n", - " tune_config=tune.TuneConfig(\n", - " num_samples=10,\n", - " ),\n", - " param_space=config,\n", - " )\n", - " results = tuner.fit()\n" - ] - }, - { - "cell_type": "markdown", - "id": "4999e858", - "metadata": {}, - "source": [ - "As you can see, the changes in the actual training function are minimal. Instead of\n", - "returning the accuracy value, we report it back to Tune using `session.report()`.\n", - "Our `config` dictionary only changed slightly. Instead of passing hard-coded\n", - "parameters, we tell Tune to choose values from a range of valid options. There are\n", - "a number of options we have here, all of which are explained in\n", - "{ref}`the Tune docs `.\n", - "\n", - "For a brief explanation, this is what they do:\n", - "\n", - "- `tune.randint(min, max)` chooses a random integer value between *min* and *max*.\n", - " Note that *max* is exclusive, so it will not be sampled.\n", - "- `tune.choice([a, b, c])` chooses one of the items of the list at random. Each item\n", - " has the same chance to be sampled.\n", - "- `tune.uniform(min, max)` samples a floating point number between *min* and *max*.\n", - " Note that *max* is exclusive here, too.\n", - "- `tune.loguniform(min, max, base=10)` samples a floating point number between *min* and *max*,\n", - " but applies a logarithmic transformation to these boundaries first. 
Thus, this makes\n", - " it easy to sample values from different orders of magnitude.\n", - "\n", - "The `num_samples=10` option we pass to the `TuneConfig()` means that we sample 10 different\n", - "hyperparameter configurations from this search space.\n", - "\n", - "The output of our training run coud look like this:\n", - "\n", - "```{code-block} bash\n", - ":emphasize-lines: 14\n", - "\n", - " Number of trials: 10/10 (10 TERMINATED)\n", - " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n", - " | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) |\n", - " |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------|\n", - " | train_breast_cancer_b63aa_00000 | TERMINATED | | 0.000117625 | 2 | 2 | 0.616347 | 0.916084 | 1 | 0.0306492 |\n", - " | train_breast_cancer_b63aa_00001 | TERMINATED | | 0.0382954 | 8 | 2 | 0.581549 | 0.937063 | 1 | 0.0357082 |\n", - " | train_breast_cancer_b63aa_00002 | TERMINATED | | 0.000217926 | 1 | 3 | 0.528428 | 0.874126 | 1 | 0.0264609 |\n", - " | train_breast_cancer_b63aa_00003 | TERMINATED | | 0.000120929 | 8 | 1 | 0.634508 | 0.958042 | 1 | 0.036406 |\n", - " | train_breast_cancer_b63aa_00004 | TERMINATED | | 0.00839715 | 5 | 1 | 0.730624 | 0.958042 | 1 | 0.0389378 |\n", - " | train_breast_cancer_b63aa_00005 | TERMINATED | | 0.000732948 | 8 | 2 | 0.915863 | 0.958042 | 1 | 0.0382841 |\n", - " | train_breast_cancer_b63aa_00006 | TERMINATED | | 0.000856226 | 4 | 1 | 0.645209 | 0.916084 | 1 | 0.0357089 |\n", - " | train_breast_cancer_b63aa_00007 | TERMINATED | | 0.00769908 | 7 | 1 | 0.729443 | 0.909091 | 1 | 0.0390737 |\n", - " | train_breast_cancer_b63aa_00008 | TERMINATED | | 0.00186339 | 5 | 3 | 0.595744 | 0.944056 | 1 | 0.0343912 |\n", - " | train_breast_cancer_b63aa_00009 
| TERMINATED | | 0.000950272 | 3 | 2 | 0.835504 | 0.965035 | 1 | 0.0348201 |\n", - " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n", - "```\n", - "\n", - "The best configuration we found used `eta=0.000950272`, `max_depth=3`,\n", - "`min_child_weight=2`, `subsample=0.835504` and reached an accuracy of\n", - "`0.965035`.\n", - "\n", - "## Early stopping\n", - "\n", - "Currently, Tune samples 10 different hyperparameter configurations and trains a full\n", - "XGBoost on all of them. In our small example, training is very fast. However,\n", - "if training takes longer, a significant amount of computer resources is spent on trials\n", - "that will eventually show a bad performance, e.g. a low accuracy. It would be good\n", - "if we could identify these trials early and stop them, so we don't waste any resources.\n", - "\n", - "This is where Tune's *Schedulers* shine. A Tune `TrialScheduler` is responsible\n", - "for starting and stopping trials. Tune implements a number of different schedulers, each\n", - "described {ref}`in the Tune documentation `.\n", - "For our example, we will use the `AsyncHyperBandScheduler` or `ASHAScheduler`.\n", - "\n", - "The basic idea of this scheduler: We sample a number of hyperparameter configurations.\n", - "Each of these configurations is trained for a specific number of iterations.\n", - "After these iterations, only the best performing hyperparameters are retained. These\n", - "are selected according to some loss metric, usually an evaluation loss. This cycle is\n", - "repeated until we end up with the best configuration.\n", - "\n", - "The `ASHAScheduler` needs to know three things:\n", - "\n", - "1. Which metric should be used to identify badly performing trials?\n", - "2. Should this metric be maximized or minimized?\n", - "3. 
How many iterations does each trial train for?\n", - "\n", - "There are more parameters, which are explained in the\n", - "{ref}`documentation `.\n", - "\n", - "Lastly, we have to report the loss metric to Tune. We do this with a `Callback` that\n", - "XGBoost accepts and calls after each evaluation round. Ray Tune comes\n", - "with {ref}`two XGBoost callbacks `\n", - "we can use for this. The `TuneReportCallback` just reports the evaluation\n", - "metrics back to Tune. The `TuneReportCheckpointCallback` also saves\n", - "checkpoints after each evaluation round. We will just use the latter in this\n", - "example so that we can retrieve the saved model later.\n", - "\n", - "These parameters from the `eval_metrics` configuration setting are then automatically\n", - "reported to Tune via the callback. Here, the raw error will be reported, not the accuracy.\n", - "To display the best reached accuracy, we will inverse it later.\n", - "\n", - "We will also load the best checkpointed model so that we can use it for predictions.\n", - "The best model is selected with respect to the `metric` and `mode` parameters we\n", - "pass to the `TunerConfig()`." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "d08b5b0a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "== Status ==
    Current time: 2022-07-22 16:56:01 (running for 00:00:10.38)
    Memory usage on this node: 10.3/16.0 GiB
    Using AsyncHyperBand: num_stopped=10\n", - "Bracket: Iter 8.000: -0.5107275277792991 | Iter 4.000: -0.5876629346317344 | Iter 2.000: -0.6544494184997531 | Iter 1.000: -0.6859214191253369
    Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.57 GiB heap, 0.0/2.0 GiB objects
    Current best trial: c28a3_00003 with eval-logloss=0.38665050018083796 and parameters={'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 2, 'min_child_weight': 3, 'subsample': 0.782626252548841, 'eta': 0.06385952388342125}
    Result logdir: /Users/kai/ray_results/train_breast_cancer_2022-07-22_16-55-50
    Number of trials: 10/10 (10 TERMINATED)
    \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
    Trial name status loc eta max_depth min_child_weight subsample iter total time (s) eval-logloss eval-error
    train_breast_cancer_c28a3_00000TERMINATED127.0.0.1:544160.0186954 2 2 0.516916 10 0.22218 0.571496 0.0629371
    train_breast_cancer_c28a3_00001TERMINATED127.0.0.1:544400.0304404 8 2 0.745969 2 0.135674 0.650353 0.0629371
    train_breast_cancer_c28a3_00002TERMINATED127.0.0.1:544410.0217157 8 3 0.764138 2 0.173076 0.658545 0.041958
    train_breast_cancer_c28a3_00003TERMINATED127.0.0.1:544420.0638595 2 3 0.782626 10 0.281865 0.386651 0.041958
    train_breast_cancer_c28a3_00004TERMINATED127.0.0.1:544430.00442794 7 2 0.792359 1 0.0270212 0.689577 0.0699301
    train_breast_cancer_c28a3_00005TERMINATED127.0.0.1:544440.00222624 3 1 0.536331 1 0.0238512 0.691446 0.0839161
    train_breast_cancer_c28a3_00006TERMINATED127.0.0.1:544450.000825129 1 1 0.82472 1 0.015312 0.692624 0.118881
    train_breast_cancer_c28a3_00007TERMINATED127.0.0.1:544460.000770826 7 2 0.947268 1 0.0175898 0.692598 0.132867
    train_breast_cancer_c28a3_00008TERMINATED127.0.0.1:544470.000429759 7 1 0.88524 1 0.0193739 0.692785 0.0559441
    train_breast_cancer_c28a3_00009TERMINATED127.0.0.1:544480.0149863 2 1 0.722738 1 0.0165932 0.682266 0.111888


    " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result for train_breast_cancer_c28a3_00000:\n", - " date: 2022-07-22_16-55-55\n", - " done: false\n", - " eval-error: 0.08391608391608392\n", - " eval-logloss: 0.6790360066440556\n", - " experiment_id: 2a3189442db341519836a07fb2d65dd9\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54416\n", - " time_since_restore: 0.01624011993408203\n", - " time_this_iter_s: 0.01624011993408203\n", - " time_total_s: 0.01624011993408203\n", - " timestamp: 1658505355\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00000\n", - " warmup_time: 0.0035409927368164062\n", - " \n", - "Result for train_breast_cancer_c28a3_00000:\n", - " date: 2022-07-22_16-55-56\n", - " done: true\n", - " eval-error: 0.06293706293706294\n", - " eval-logloss: 0.5714958122560194\n", - " experiment_id: 2a3189442db341519836a07fb2d65dd9\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 10\n", - " node_ip: 127.0.0.1\n", - " pid: 54416\n", - " time_since_restore: 0.22218012809753418\n", - " time_this_iter_s: 0.007044076919555664\n", - " time_total_s: 0.22218012809753418\n", - " timestamp: 1658505356\n", - " timesteps_since_restore: 0\n", - " training_iteration: 10\n", - " trial_id: c28a3_00000\n", - " warmup_time: 0.0035409927368164062\n", - " \n", - "Result for train_breast_cancer_c28a3_00003:\n", - " date: 2022-07-22_16-56-01\n", - " done: false\n", - " eval-error: 0.08391608391608392\n", - " eval-logloss: 0.6472820101918041\n", - " experiment_id: 7ff6133237404b4ea4755b9f8cd114f2\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54442\n", - " time_since_restore: 0.023206233978271484\n", - " time_this_iter_s: 0.023206233978271484\n", - " time_total_s: 
0.023206233978271484\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00003\n", - " warmup_time: 0.006722211837768555\n", - " \n", - "Result for train_breast_cancer_c28a3_00005:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.08391608391608392\n", - " eval-logloss: 0.6914464114429234\n", - " experiment_id: 344762ab6d574b63a9374e19526d0510\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54444\n", - " time_since_restore: 0.02385115623474121\n", - " time_this_iter_s: 0.02385115623474121\n", - " time_total_s: 0.02385115623474121\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00005\n", - " warmup_time: 0.008936882019042969\n", - " \n", - "Result for train_breast_cancer_c28a3_00009:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.11188811188811189\n", - " eval-logloss: 0.6822656309688008\n", - " experiment_id: 133901655fa64bf79f2dcc4e8e5e41b1\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54448\n", - " time_since_restore: 0.016593217849731445\n", - " time_this_iter_s: 0.016593217849731445\n", - " time_total_s: 0.016593217849731445\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00009\n", - " warmup_time: 0.004940032958984375\n", - " \n", - "Result for train_breast_cancer_c28a3_00007:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.13286713286713286\n", - " eval-logloss: 0.6925980357023386\n", - " experiment_id: b4331027cbaf442ab905b2e51797dbbd\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54446\n", - " time_since_restore: 0.017589807510375977\n", - " time_this_iter_s: 
0.017589807510375977\n", - " time_total_s: 0.017589807510375977\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00007\n", - " warmup_time: 0.003782033920288086\n", - " \n", - "Result for train_breast_cancer_c28a3_00006:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.11888111888111888\n", - " eval-logloss: 0.6926244418104212\n", - " experiment_id: d3906de5943a4e05a4cc782382f67d24\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54445\n", - " time_since_restore: 0.015311956405639648\n", - " time_this_iter_s: 0.015311956405639648\n", - " time_total_s: 0.015311956405639648\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00006\n", - " warmup_time: 0.005506038665771484\n", - " \n", - "Result for train_breast_cancer_c28a3_00002:\n", - " date: 2022-07-22_16-56-01\n", - " done: false\n", - " eval-error: 0.04895104895104895\n", - " eval-logloss: 0.6752762102580571\n", - " experiment_id: a3645fc2d43145d88a1f5b7cc94df703\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54441\n", - " time_since_restore: 0.027367830276489258\n", - " time_this_iter_s: 0.027367830276489258\n", - " time_total_s: 0.027367830276489258\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00002\n", - " warmup_time: 0.0062830448150634766\n", - " \n", - "Result for train_breast_cancer_c28a3_00001:\n", - " date: 2022-07-22_16-56-01\n", - " done: false\n", - " eval-error: 0.07692307692307693\n", - " eval-logloss: 0.6698804135089154\n", - " experiment_id: 85766fe4d9fa482a91e396a8fd509a19\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54440\n", - " time_since_restore: 
0.017169952392578125\n", - " time_this_iter_s: 0.017169952392578125\n", - " time_total_s: 0.017169952392578125\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00001\n", - " warmup_time: 0.006204843521118164\n", - " \n", - "Result for train_breast_cancer_c28a3_00008:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.05594405594405594\n", - " eval-logloss: 0.692784742458717\n", - " experiment_id: 2c7d8bc38ad04536b1dec76819a2b3bf\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 54447\n", - " time_since_restore: 0.01937389373779297\n", - " time_this_iter_s: 0.01937389373779297\n", - " time_total_s: 0.01937389373779297\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00008\n", - " warmup_time: 0.004342079162597656\n", - " \n", - "Result for train_breast_cancer_c28a3_00001:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.06293706293706294\n", - " eval-logloss: 0.6503534216980834\n", - " experiment_id: 85766fe4d9fa482a91e396a8fd509a19\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 2\n", - " node_ip: 127.0.0.1\n", - " pid: 54440\n", - " time_since_restore: 0.13567376136779785\n", - " time_this_iter_s: 0.11850380897521973\n", - " time_total_s: 0.13567376136779785\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 2\n", - " trial_id: c28a3_00001\n", - " warmup_time: 0.006204843521118164\n", - " \n", - "Result for train_breast_cancer_c28a3_00004:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.06993006993006994\n", - " eval-logloss: 0.689577207281873\n", - " experiment_id: ef4fdc645c444112985b4957ab8a84e9\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 1\n", - " node_ip: 127.0.0.1\n", - " pid: 
54443\n", - " time_since_restore: 0.027021169662475586\n", - " time_this_iter_s: 0.027021169662475586\n", - " time_total_s: 0.027021169662475586\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 1\n", - " trial_id: c28a3_00004\n", - " warmup_time: 0.0063669681549072266\n", - " \n", - "Result for train_breast_cancer_c28a3_00002:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.04195804195804196\n", - " eval-logloss: 0.658545415301423\n", - " experiment_id: a3645fc2d43145d88a1f5b7cc94df703\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 2\n", - " node_ip: 127.0.0.1\n", - " pid: 54441\n", - " time_since_restore: 0.17307591438293457\n", - " time_this_iter_s: 0.1457080841064453\n", - " time_total_s: 0.17307591438293457\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 2\n", - " trial_id: c28a3_00002\n", - " warmup_time: 0.0062830448150634766\n", - " \n", - "Result for train_breast_cancer_c28a3_00003:\n", - " date: 2022-07-22_16-56-01\n", - " done: true\n", - " eval-error: 0.04195804195804196\n", - " eval-logloss: 0.38665050018083796\n", - " experiment_id: 7ff6133237404b4ea4755b9f8cd114f2\n", - " hostname: Kais-MacBook-Pro.local\n", - " iterations_since_restore: 10\n", - " node_ip: 127.0.0.1\n", - " pid: 54442\n", - " time_since_restore: 0.28186488151550293\n", - " time_this_iter_s: 0.03063178062438965\n", - " time_total_s: 0.28186488151550293\n", - " timestamp: 1658505361\n", - " timesteps_since_restore: 0\n", - " training_iteration: 10\n", - " trial_id: c28a3_00003\n", - " warmup_time: 0.006722211837768555\n", - " \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-07-22 16:56:01,498\tINFO tune.py:738 -- Total run time: 10.53 seconds (10.37 seconds for the tuning loop).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best model parameters: {'objective': 'binary:logistic', 
'eval_metric': ['logloss', 'error'], 'max_depth': 2, 'min_child_weight': 3, 'subsample': 0.782626252548841, 'eta': 0.06385952388342125}\n", - "Best model total accuracy: 0.9580\n" - ] - } - ], - "source": [ - "import sklearn.datasets\n", - "import sklearn.metrics\n", - "import os\n", - "from ray.tune.schedulers import ASHAScheduler\n", - "from sklearn.model_selection import train_test_split\n", - "import xgboost as xgb\n", - "\n", - "from ray import air, tune\n", - "from ray.air import session\n", - "from ray.tune.integration.xgboost import TuneReportCheckpointCallback\n", - "\n", - "\n", - "def train_breast_cancer(config: dict):\n", - " # This is a simple training function to be passed into Tune\n", - " # Load dataset\n", - " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", - " # Split into train and test set\n", - " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", - " # Build input matrices for XGBoost\n", - " train_set = xgb.DMatrix(train_x, label=train_y)\n", - " test_set = xgb.DMatrix(test_x, label=test_y)\n", - " # Train the classifier, using the Tune callback\n", - " xgb.train(\n", - " config,\n", - " train_set,\n", - " evals=[(test_set, \"eval\")],\n", - " verbose_eval=False,\n", - " callbacks=[TuneReportCheckpointCallback(filename=\"model.xgb\")],\n", - " )\n", - "\n", - "\n", - "def get_best_model_checkpoint(results):\n", - " best_bst = xgb.Booster()\n", - " best_result = results.get_best_result()\n", - "\n", - " with best_result.checkpoint.as_directory() as best_checkpoint_dir:\n", - " best_bst.load_model(os.path.join(best_checkpoint_dir, \"model.xgb\"))\n", - " accuracy = 1.0 - best_result.metrics[\"eval-error\"]\n", - " print(f\"Best model parameters: {best_result.config}\")\n", - " print(f\"Best model total accuracy: {accuracy:.4f}\")\n", - " return best_bst\n", - "\n", - "\n", - "def tune_xgboost(smoke_test=False):\n", - " search_space = {\n", - " # You can mix constants with search space 
objects.\n", - " \"objective\": \"binary:logistic\",\n", - " \"eval_metric\": [\"logloss\", \"error\"],\n", - " \"max_depth\": tune.randint(1, 9),\n", - " \"min_child_weight\": tune.choice([1, 2, 3]),\n", - " \"subsample\": tune.uniform(0.5, 1.0),\n", - " \"eta\": tune.loguniform(1e-4, 1e-1),\n", - " }\n", - " # This will enable aggressive early stopping of bad trials.\n", - " scheduler = ASHAScheduler(\n", - " max_t=10, grace_period=1, reduction_factor=2 # 10 training iterations\n", - " )\n", - "\n", - " tuner = tune.Tuner(\n", - " train_breast_cancer,\n", - " tune_config=tune.TuneConfig(\n", - " metric=\"eval-logloss\",\n", - " mode=\"min\",\n", - " scheduler=scheduler,\n", - " num_samples=1 if smoke_test else 10,\n", - " ),\n", - " param_space=search_space,\n", - " )\n", - " results = tuner.fit()\n", - "\n", - " return results\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " import argparse\n", - "\n", - " parser = argparse.ArgumentParser()\n", - " parser.add_argument(\n", - " \"--smoke-test\", action=\"store_true\", help=\"Finish quickly for testing\"\n", - " )\n", - " args, _ = parser.parse_known_args()\n", - "\n", - " results = tune_xgboost(smoke_test=args.smoke_test)\n", - "\n", - " # Load the best model checkpoint.\n", - " best_bst = get_best_model_checkpoint(results)\n", - "\n", - " # You could now do further predictions with\n", - " # best_bst.predict(...)\n" - ] - }, - { - "cell_type": "markdown", - "id": "20732fe4", - "metadata": {}, - "source": [ - "The output of our run could look like this:\n", - "\n", - "```{code-block} bash\n", - ":emphasize-lines: 7\n", - "\n", - " Number of trials: 10/10 (10 TERMINATED)\n", - " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n", - " | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | iter | total time (s) | eval-logloss | eval-error |\n", - " 
|---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------|\n", - " | train_breast_cancer_ba275_00000 | TERMINATED | | 0.00205087 | 2 | 1 | 0.898391 | 10 | 0.380619 | 0.678039 | 0.090909 |\n", - " | train_breast_cancer_ba275_00001 | TERMINATED | | 0.000183834 | 4 | 3 | 0.924939 | 1 | 0.0228798 | 0.693009 | 0.111888 |\n", - " | train_breast_cancer_ba275_00002 | TERMINATED | | 0.0242721 | 7 | 2 | 0.501551 | 10 | 0.376154 | 0.54472 | 0.06993 |\n", - " | train_breast_cancer_ba275_00003 | TERMINATED | | 0.000449692 | 5 | 3 | 0.890212 | 1 | 0.0234981 | 0.692811 | 0.090909 |\n", - " | train_breast_cancer_ba275_00004 | TERMINATED | | 0.000376393 | 7 | 2 | 0.883609 | 1 | 0.0231569 | 0.692847 | 0.062937 |\n", - " | train_breast_cancer_ba275_00005 | TERMINATED | | 0.00231942 | 3 | 3 | 0.877464 | 2 | 0.104867 | 0.689541 | 0.083916 |\n", - " | train_breast_cancer_ba275_00006 | TERMINATED | | 0.000542326 | 1 | 2 | 0.578584 | 1 | 0.0213971 | 0.692765 | 0.083916 |\n", - " | train_breast_cancer_ba275_00007 | TERMINATED | | 0.0016801 | 1 | 2 | 0.975302 | 1 | 0.02226 | 0.691999 | 0.083916 |\n", - " | train_breast_cancer_ba275_00008 | TERMINATED | | 0.000595756 | 8 | 3 | 0.58429 | 1 | 0.0221152 | 0.692657 | 0.06993 |\n", - " | train_breast_cancer_ba275_00009 | TERMINATED | | 0.000357845 | 8 | 1 | 0.637776 | 1 | 0.022635 | 0.692859 | 0.090909 |\n", - " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n", - "\n", - "\n", - " Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5015513240240503, 'eta': 0.024272050872920895}\n", - " Best model total accuracy: 0.9301\n", - "```\n", - "\n", - "As you can see, most trials have been stopped only after a 
few iterations. Only the\n", - "two most promising trials were run for the full 10 iterations.\n", - "\n", - "You can also ensure that all available resources are being used as the scheduler\n", - "terminates trials, freeing them up. This can be done through the\n", - "`ResourceChangingScheduler`. An example of this can be found here:\n", - "{doc}`/tune/examples/includes/xgboost_dynamic_resources_example`.\n", - "\n", - "## Using fractional GPUs\n", - "\n", - "You can often accelerate your training by using GPUs in addition to CPUs. However,\n", - "you usually don't have as many GPUs as you have trials to run. For instance, if you\n", - "run 10 Tune trials in parallel, you usually don't have access to 10 separate GPUs.\n", - "\n", - "Tune supports *fractional GPUs*. This means that each task is assigned a fraction\n", - "of the GPU memory for training. For 10 tasks, this could look like this:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d1b20a3", - "metadata": {}, - "outputs": [], - "source": [ - "config = {\n", - " \"objective\": \"binary:logistic\",\n", - " \"eval_metric\": [\"logloss\", \"error\"],\n", - " \"tree_method\": \"gpu_hist\",\n", - " \"max_depth\": tune.randint(1, 9),\n", - " \"min_child_weight\": tune.choice([1, 2, 3]),\n", - " \"subsample\": tune.uniform(0.5, 1.0),\n", - " \"eta\": tune.loguniform(1e-4, 1e-1),\n", - "}\n", - "\n", - "tuner = tune.Tuner(\n", - " tune.with_resources(train_breast_cancer, resources={\"cpu\": 1, \"gpu\": 0.1}),\n", - " tune_config=tune.TuneConfig(\n", - " num_samples=10,\n", - " ),\n", - " param_space=config,\n", - ")\n", - "results = tuner.fit()\n" - ] - }, - { - "cell_type": "markdown", - "id": "ee131861", - "metadata": {}, - "source": [ - "Each task thus works with 10% of the available GPU memory. 
You also have to tell\n", - "XGBoost to use the `gpu_hist` tree method, so it knows it should use the GPU.\n", - "\n", - "## Conclusion\n", - "\n", - "You should now have a basic understanding on how to train XGBoost models and on how\n", - "to tune the hyperparameters to yield the best results. In our simple example,\n", - "Tuning the parameters didn't make a huge difference for the accuracy.\n", - "But in larger applications, intelligent hyperparameter tuning can make the\n", - "difference between a model that doesn't seem to learn at all, and a model\n", - "that outperforms all the other ones.\n", - "\n", - "## More XGBoost Examples\n", - "\n", - "- {doc}`/tune/examples/includes/xgboost_dynamic_resources_example`:\n", - " Trains a basic XGBoost model with Tune with the class-based API and a ResourceChangingScheduler, ensuring all resources are being used at all time.\n", - "\n", - "## Learn More\n", - "\n", - "- [XGBoost Hyperparameter Tuning - A Visual Guide](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)\n", - "- [Notes on XGBoost Parameter Tuning](https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html)\n", - "- [Doing XGBoost Hyperparameter Tuning the smart way](https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ray_dev_py38", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:05:16) \n[Clang 12.0.1 ]" - }, - "orphan": true, - "vscode": { - "interpreter": { - "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" - } - } + 
"cells": [ + { + "cell_type": "markdown", + "id": "edce67b9", + "metadata": {}, + "source": [ + "# Tuning XGBoost hyperparameters with Ray Tune\n", + "\n", + "(tune-xgboost-ref)=\n", + "\n", + "XGBoost is currently one of the most popular machine learning algorithms. It performs\n", + "very well on a large selection of tasks, and was the key to success in many Kaggle\n", + "competitions.\n", + "\n", + "```{image} /images/xgboost_logo.png\n", + ":align: center\n", + ":alt: XGBoost\n", + ":target: https://xgboost.readthedocs.io/en/latest/\n", + ":width: 200px\n", + "```\n", + "\n", + "This tutorial will give you a quick introduction to XGBoost, show you how\n", + "to train an XGBoost model, and then guide you on how to optimize XGBoost\n", + "parameters using Tune to get the best performance. We tackle the following topics:\n", + "\n", + "```{contents}\n", + ":depth: 2\n", + "```\n", + "\n", + ":::{note}\n", + "To run this tutorial, you will need to install the following:\n", + "\n", + "```bash\n", + "$ pip install xgboost\n", + "```\n", + ":::\n", + "\n", + "## What is XGBoost\n", + "\n", + "XGBoost is an acronym for e**X**treme **G**radient **Boost**ing. Internally,\n", + "XGBoost uses [decision trees](https://en.wikipedia.org/wiki/Decision_tree). Instead\n", + "of training just one large decision tree, XGBoost and other related algorithms train\n", + "many small decision trees. The intuition behind this is that even though single\n", + "decision trees can be inaccurate and suffer from high variance,\n", + "combining the output of a large number of these weak learners can actually lead to\n", + "strong learner, resulting in better predictions and less variance.\n", + "\n", + ":::{figure} /images/tune-xgboost-ensemble.svg\n", + ":alt: Single vs. ensemble learning\n", + "\n", + "A single decision tree (left) might be able to get to an accuracy of 70%\n", + "for a binary classification task. 
By combining the output of several small\n", + "decision trees, an ensemble learner (right) might end up with a higher accuracy\n", + "of 90%.\n", + ":::\n", + "\n", + "Boosting algorithms start with a single small decision tree and evaluate how well\n", + "it predicts the given examples. When building the next tree, those samples that have\n", + "been misclassified before have a higher chance of being used to generate the tree.\n", + "This is useful because it avoids overfitting to samples that can be easily classified\n", + "and instead tries to come up with models that are able to classify hard examples, too.\n", + "Please see [here for a more thorough introduction to bagging and boosting algorithms](https://towardsdatascience.com/ensemble-methods-bagging-boosting-and-stacking-c9214a10a205).\n", + "\n", + "There are many boosting algorithms. In their core, they are all very similar. XGBoost\n", + "uses second-level derivatives to find splits that maximize the *gain* (the inverse of\n", + "the *loss*) - hence the name. In practice, there really is no drawback in using\n", + "XGBoost over other boosting algorithms - in fact, it usually shows the best performance.\n", + "\n", + "## Training a simple XGBoost classifier\n", + "\n", + "Let's first see how a simple XGBoost classifier can be trained. We'll use the\n", + "`breast_cancer`-Dataset included in the `sklearn` dataset collection. This is\n", + "a binary classification dataset. 
Given 30 different input features, our task is to\n", + "learn to identify subjects with breast cancer and those without.\n", + "\n", + "Here is the full code to train a simple XGBoost model:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "77b3c71c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9650\n" + ] + } + ], + "source": [ + "import sklearn.datasets\n", + "import sklearn.metrics\n", + "from sklearn.model_selection import train_test_split\n", + "import xgboost as xgb\n", + "\n", + "\n", + "def train_breast_cancer(config):\n", + " # Load dataset\n", + " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", + " # Split into train and test set\n", + " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", + " # Build input matrices for XGBoost\n", + " train_set = xgb.DMatrix(train_x, label=train_y)\n", + " test_set = xgb.DMatrix(test_x, label=test_y)\n", + " # Train the classifier\n", + " results = {}\n", + " bst = xgb.train(\n", + " config,\n", + " train_set,\n", + " evals=[(test_set, \"eval\")],\n", + " evals_result=results,\n", + " verbose_eval=False,\n", + " )\n", + " return results\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " results = train_breast_cancer(\n", + " {\"objective\": \"binary:logistic\", \"eval_metric\": [\"logloss\", \"error\"]}\n", + " )\n", + " accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", + " print(f\"Accuracy: {accuracy:.4f}\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ec2a13f8", + "metadata": {}, + "source": [ + "As you can see, the code is quite simple. First, the dataset is loaded and split\n", + "into a `test` and `train` set. The XGBoost model is trained with `xgb.train()`.\n", + "XGBoost automatically evaluates metrics we specified on the test set. 
In our case\n", + "it calculates the *logloss* and the prediction *error*, which is the percentage of\n", + "misclassified examples. To calculate the accuracy, we just have to subtract the error\n", + "from `1.0`. Even in this simple example, most runs result\n", + "in a good accuracy of over `0.90`.\n", + "\n", + "Maybe you have noticed the `config` parameter we pass to the XGBoost algorithm. This\n", + "is a {class}`dict` in which you can specify parameters for the XGBoost algorithm. In this\n", + "simple example, the only parameters we passed are the `objective` and `eval_metric` parameters.\n", + "The value `binary:logistic` tells XGBoost that we aim to train a logistic regression model for\n", + "a binary classification task. You can find an overview over all valid objectives\n", + "[here in the XGBoost documentation](https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters).\n", + "\n", + "## XGBoost Hyperparameters\n", + "\n", + "Even with the default settings, XGBoost was able to get to a good accuracy on the\n", + "breast cancer dataset. However, as in many machine learning algorithms, there are\n", + "many knobs to tune which might lead to even better performance. Let's explore some of\n", + "them below.\n", + "\n", + "### Maximum tree depth\n", + "\n", + "Remember that XGBoost internally uses many decision tree models to come up with\n", + "predictions. When training a decision tree, we need to tell the algorithm how\n", + "large the tree may get. The parameter for this is called the tree *depth*.\n", + "\n", + ":::{figure} /images/tune-xgboost-depth.svg\n", + ":align: center\n", + ":alt: Decision tree depth\n", + "\n", + "In this image, the left tree has a depth of 2, and the right tree a depth of 3.\n", + "Note that with each level, $2^{(d-1)}$ splits are added, where *d* is the depth\n", + "of the tree.\n", + ":::\n", + "\n", + "Tree depth is a property that concerns the model complexity. 
If you only allow short\n", + "trees, the models are likely not very precise - they underfit the data. If you allow\n", + "very large trees, the single models are likely to overfit to the data. In practice,\n", + "a number between `2` and `6` is often a good starting point for this parameter.\n", + "\n", + "XGBoost's default value is `3`.\n", + "\n", + "### Minimum child weight\n", + "\n", + "When a decision tree creates new leaves, it splits up the remaining data at one node\n", + "into two groups. If there are only a few samples in one of these groups, it often\n", + "doesn't make sense to split it further. One of the reasons for this is that the\n", + "model is harder to train when we have fewer samples.\n", + "\n", + ":::{figure} /images/tune-xgboost-weight.svg\n", + ":align: center\n", + ":alt: Minimum child weight\n", + "\n", + "In this example, we start with 100 examples. At the first node, they are split\n", + "into 4 and 96 samples, respectively. In the next step, our model might find\n", + "that it doesn't make sense to split the 4 examples more. It thus only continues\n", + "to add leaves on the right side.\n", + ":::\n", + "\n", + "The parameter used by the model to decide if it makes sense to split a node is called\n", + "the *minimum child weight*. In the case of linear regression, this is just the absolute\n", + "number of samples required in each child. In other objectives, this value is determined\n", + "using the weights of the examples, hence the name.\n", + "\n", + "The larger the value, the more constrained the trees are and the less deep they will be.\n", + "This parameter thus also affects the model complexity. Values can range between 0\n", + "and infinity and are dependent on the sample size. For our ca. 
500 examples in the\n", + "breast cancer dataset, values between `0` and `10` should be sensible.\n", + "\n", + "XGBoost's default value is `1`.\n", + "\n", + "### Subsample size\n", + "\n", + "Each decision tree we add is trained on a subsample of the total training dataset.\n", + "The probabilities for the samples are weighted according to the XGBoost algorithm,\n", + "but we can decide on which fraction of the samples we want to train each decision\n", + "tree on.\n", + "\n", + "Setting this value to `0.7` would mean that we randomly sample `70%` of the\n", + "training dataset before each training iteration.\n", + "\n", + "XGBoost's default value is `1`.\n", + "\n", + "### Learning rate / Eta\n", + "\n", + "Remember that XGBoost sequentially trains many decision trees, and that later trees\n", + "are more likely trained on data that has been misclassified by prior trees. In effect\n", + "this means that earlier trees make decisions for easy samples (i.e. those samples that\n", + "can easily be classified) and later trees make decisions for harder samples. It is then\n", + "sensible to assume that the later trees are less accurate than earlier trees.\n", + "\n", + "To address this fact, XGBoost uses a parameter called *Eta*, which is sometimes called\n", + "the *learning rate*. 
Don't confuse this with learning rates from gradient descent!\n", + "The original [paper on stochastic gradient boosting](https://www.researchgate.net/publication/222573328_Stochastic_Gradient_Boosting)\n", + "introduces this parameter like so:\n", + "\n", + "$$\n", + "F_m(x) = F_{m-1}(x) + \\eta \\cdot \\gamma_{lm} \\textbf{1}(x \\in R_{lm})\n", + "$$\n", + "\n", + "This is just a complicated way to say that when we train a new decision tree,\n", + "represented by $\\gamma_{lm} \\textbf{1}(x \\in R_{lm})$, we want to dampen\n", + "its effect on the previous prediction $F_{m-1}(x)$ with a factor\n", + "$\\eta$.\n", + "\n", + "Typical values for this parameter are between `0.01` and `0.3`.\n", + "\n", + "XGBoost's default value is `0.3`.\n", + "\n", + "### Number of boost rounds\n", + "\n", + "Lastly, we can decide on how many boosting rounds we perform, which means how\n", + "many decision trees we ultimately train. When we do heavy subsampling or use a small\n", + "learning rate, it might make sense to increase the number of boosting rounds.\n", + "\n", + "XGBoost's default value is `10`.\n", + "\n", + "### Putting it together\n", + "\n", + "Let's see what this looks like in code! 
We just need to adjust our `config` dict:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "35073e88", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.9790\n" + ] + } + ], + "source": [ + "if __name__ == \"__main__\":\n", + "    config = {\n", + "        \"objective\": \"binary:logistic\",\n", + "        \"eval_metric\": [\"logloss\", \"error\"],\n", + "        \"max_depth\": 2,\n", + "        \"min_child_weight\": 0,\n", + "        \"subsample\": 0.8,\n", + "        \"eta\": 0.2,\n", + "    }\n", + "    results = train_breast_cancer(config)\n", + "    accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", + "    print(f\"Accuracy: {accuracy:.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "69cf0c13", + "metadata": {}, + "source": [ + "The rest stays the same. Please note that we do not adjust the `num_boost_rounds` here.\n", + "The result should also show a high accuracy of over 90%.\n", + "\n", + "## Tuning the configuration parameters\n", + "\n", + "XGBoost's default parameters already lead to a good accuracy, and even our guesses in the\n", + "last section should result in accuracies well above 90%. However, our guesses were\n", + "just that: guesses. Often we do not know what combination of parameters would actually\n", + "lead to the best results on a machine learning task.\n", + "\n", + "Unfortunately, there are infinitely many combinations of hyperparameters we could try\n", + "out. Should we combine `max_depth=3` with `subsample=0.8` or with `subsample=0.9`?\n", + "What about the other parameters?\n", + "\n", + "This is where hyperparameter tuning comes into play. By using tuning libraries such as\n", + "Ray Tune we can try out combinations of hyperparameters. Using sophisticated search\n", + "strategies, these parameters can be selected so that they are likely to lead to good\n", + "results (avoiding an expensive *exhaustive search*). 
Also, trials that do not perform\n", + "well can be preemptively stopped to reduce waste of computing resources. Lastly, Ray Tune\n", + "also takes care of training these runs in parallel, greatly increasing search speed.\n", + "\n", + "Let's start with a basic example on how to use Tune for this. We just need to make\n", + "a few changes to our code-block:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ff856a82", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-22 15:52:52,004\tINFO services.py:1483 -- View the Ray dashboard at \u001B[1m\u001B[32mhttp://127.0.0.1:8268\u001B[39m\u001B[22m\n", + "2022-07-22 15:52:55,858\tWARNING function_trainable.py:619 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/html": [ + "== Status ==
    Current time: 2022-07-22 15:53:04 (running for 00:00:07.77)
    Memory usage on this node: 10.5/16.0 GiB
    Using FIFO scheduling algorithm.
    Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.57 GiB heap, 0.0/2.0 GiB objects
    Result logdir: /Users/kai/ray_results/train_breast_cancer_2022-07-22_15-52-48
    Number of trials: 10/10 (10 TERMINATED)
    \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name status loc eta max_depth min_child_weight subsample acc iter total time (s)
    train_breast_cancer_f8669_00000TERMINATED127.0.0.1:488520.0069356 5 3 0.8235040.944056 1 0.0316169
    train_breast_cancer_f8669_00001TERMINATED127.0.0.1:488570.00145619 6 3 0.8329470.958042 1 0.0328588
    train_breast_cancer_f8669_00002TERMINATED127.0.0.1:488580.00108208 7 3 0.9873190.944056 1 0.0319381
    train_breast_cancer_f8669_00003TERMINATED127.0.0.1:488590.00530429 8 2 0.6156910.923077 1 0.028388
    train_breast_cancer_f8669_00004TERMINATED127.0.0.1:488600.000721843 8 1 0.6509730.958042 1 0.0299618
    train_breast_cancer_f8669_00005TERMINATED127.0.0.1:488610.0074509 1 1 0.7383410.874126 1 0.0193682
    train_breast_cancer_f8669_00006TERMINATED127.0.0.1:488620.0879882 8 2 0.6715760.944056 1 0.0267372
    train_breast_cancer_f8669_00007TERMINATED127.0.0.1:488630.0765404 7 2 0.7081570.965035 1 0.0276129
    train_breast_cancer_f8669_00008TERMINATED127.0.0.1:488640.000627649 6 1 0.81121 0.951049 1 0.0310998
    train_breast_cancer_f8669_00009TERMINATED127.0.0.1:488650.000383711 2 3 0.9905790.93007 1 0.0274954


    " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-22 15:52:57,385\tINFO plugin_schema_manager.py:52 -- Loading the default runtime env schemas: ['/Users/kai/coding/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/working_dir_schema.json', '/Users/kai/coding/ray/python/ray/_private/runtime_env/../../runtime_env/schemas/pip_schema.json'].\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for train_breast_cancer_f8669_00000:\n", + " date: 2022-07-22_15-53-00\n", + " done: true\n", + " experiment_id: 07d10c5f31e74133b53272b7ccf9c528\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.9440559440559441\n", + " node_ip: 127.0.0.1\n", + " pid: 48852\n", + " time_since_restore: 0.031616926193237305\n", + " time_this_iter_s: 0.031616926193237305\n", + " time_total_s: 0.031616926193237305\n", + " timestamp: 1658501580\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00000\n", + " warmup_time: 0.0027849674224853516\n", + " \n", + "Result for train_breast_cancer_f8669_00009:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: bc0d5dd2d079432b859faac8a18928f0\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.9300699300699301\n", + " node_ip: 127.0.0.1\n", + " pid: 48865\n", + " time_since_restore: 0.027495384216308594\n", + " time_this_iter_s: 0.027495384216308594\n", + " time_total_s: 0.027495384216308594\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00009\n", + " warmup_time: 0.005235910415649414\n", + " \n", + "Result for train_breast_cancer_f8669_00001:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: 4b10d350d4374a0d9e7d0c3b1d4e3203\n", + " 
hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.958041958041958\n", + " node_ip: 127.0.0.1\n", + " pid: 48857\n", + " time_since_restore: 0.032858848571777344\n", + " time_this_iter_s: 0.032858848571777344\n", + " time_total_s: 0.032858848571777344\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00001\n", + " warmup_time: 0.004731178283691406\n", + " \n", + "Result for train_breast_cancer_f8669_00008:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: 91c25cbbeb6f409d93e1d6537cb8e1ee\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.951048951048951\n", + " node_ip: 127.0.0.1\n", + " pid: 48864\n", + " time_since_restore: 0.031099796295166016\n", + " time_this_iter_s: 0.031099796295166016\n", + " time_total_s: 0.031099796295166016\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00008\n", + " warmup_time: 0.003270864486694336\n", + " \n", + "Result for train_breast_cancer_f8669_00005:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: d225b0fb59e14da7adba952456ccf1d5\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.8741258741258742\n", + " node_ip: 127.0.0.1\n", + " pid: 48861\n", + " time_since_restore: 0.01936817169189453\n", + " time_this_iter_s: 0.01936817169189453\n", + " time_total_s: 0.01936817169189453\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00005\n", + " warmup_time: 0.003901958465576172\n", + " \n", + "Result for train_breast_cancer_f8669_00004:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: 322484af6ea5422f8aaf8ff6a91af4f7\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " 
mean_accuracy: 0.958041958041958\n", + " node_ip: 127.0.0.1\n", + " pid: 48860\n", + " time_since_restore: 0.029961824417114258\n", + " time_this_iter_s: 0.029961824417114258\n", + " time_total_s: 0.029961824417114258\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00004\n", + " warmup_time: 0.003547191619873047\n", + " \n", + "Result for train_breast_cancer_f8669_00002:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: 3f588954160b42ce8ce200f68127ebcd\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.9440559440559441\n", + " node_ip: 127.0.0.1\n", + " pid: 48858\n", + " time_since_restore: 0.03193807601928711\n", + " time_this_iter_s: 0.03193807601928711\n", + " time_total_s: 0.03193807601928711\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00002\n", + " warmup_time: 0.003523111343383789\n", + " \n", + "Result for train_breast_cancer_f8669_00003:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: a39ea777ce2d4ebca51b3d7a4179dae5\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.9230769230769231\n", + " node_ip: 127.0.0.1\n", + " pid: 48859\n", + " time_since_restore: 0.028388023376464844\n", + " time_this_iter_s: 0.028388023376464844\n", + " time_total_s: 0.028388023376464844\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00003\n", + " warmup_time: 0.0035560131072998047\n", + " \n", + "Result for train_breast_cancer_f8669_00006:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: f97c6b9674854f8d89ec26ba58cc1618\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.9440559440559441\n", + " node_ip: 127.0.0.1\n", + " pid: 
48862\n", + " time_since_restore: 0.026737213134765625\n", + " time_this_iter_s: 0.026737213134765625\n", + " time_total_s: 0.026737213134765625\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00006\n", + " warmup_time: 0.003425121307373047\n", + " \n", + "Result for train_breast_cancer_f8669_00007:\n", + " date: 2022-07-22_15-53-04\n", + " done: true\n", + " experiment_id: ff172037065a4d55998ed72f51bdc5df\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " mean_accuracy: 0.965034965034965\n", + " node_ip: 127.0.0.1\n", + " pid: 48863\n", + " time_since_restore: 0.027612924575805664\n", + " time_this_iter_s: 0.027612924575805664\n", + " time_total_s: 0.027612924575805664\n", + " timestamp: 1658501584\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: f8669_00007\n", + " warmup_time: 0.0031311511993408203\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-22 15:53:04,846\tINFO tune.py:738 -- Total run time: 8.99 seconds (7.74 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "import sklearn.datasets\n", + "import sklearn.metrics\n", + "\n", + "from ray import air, tune\n", + "from ray.air import session\n", + "\n", + "\n", + "def train_breast_cancer(config):\n", + " # Load dataset\n", + " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", + " # Split into train and test set\n", + " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", + " # Build input matrices for XGBoost\n", + " train_set = xgb.DMatrix(train_x, label=train_y)\n", + " test_set = xgb.DMatrix(test_x, label=test_y)\n", + " # Train the classifier\n", + " results = {}\n", + " xgb.train(\n", + " config,\n", + " train_set,\n", + " evals=[(test_set, \"eval\")],\n", + " evals_result=results,\n", + " verbose_eval=False,\n", + " )\n", + " # Return prediction 
accuracy\n", + " accuracy = 1.0 - results[\"eval\"][\"error\"][-1]\n", + " session.report({\"mean_accuracy\": accuracy, \"done\": True})\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " config = {\n", + " \"objective\": \"binary:logistic\",\n", + " \"eval_metric\": [\"logloss\", \"error\"],\n", + " \"max_depth\": tune.randint(1, 9),\n", + " \"min_child_weight\": tune.choice([1, 2, 3]),\n", + " \"subsample\": tune.uniform(0.5, 1.0),\n", + " \"eta\": tune.loguniform(1e-4, 1e-1),\n", + " }\n", + " tuner = tune.Tuner(\n", + " train_breast_cancer,\n", + " tune_config=tune.TuneConfig(\n", + " num_samples=10,\n", + " ),\n", + " param_space=config,\n", + " )\n", + " results = tuner.fit()\n" + ] + }, + { + "cell_type": "markdown", + "id": "4999e858", + "metadata": {}, + "source": [ + "As you can see, the changes in the actual training function are minimal. Instead of\n", + "returning the accuracy value, we report it back to Tune using `session.report()`.\n", + "Our `config` dictionary only changed slightly. Instead of passing hard-coded\n", + "parameters, we tell Tune to choose values from a range of valid options. There are\n", + "a number of options we have here, all of which are explained in\n", + "{ref}`the Tune docs `.\n", + "\n", + "For a brief explanation, this is what they do:\n", + "\n", + "- `tune.randint(min, max)` chooses a random integer value between *min* and *max*.\n", + " Note that *max* is exclusive, so it will not be sampled.\n", + "- `tune.choice([a, b, c])` chooses one of the items of the list at random. Each item\n", + " has the same chance to be sampled.\n", + "- `tune.uniform(min, max)` samples a floating point number between *min* and *max*.\n", + " Note that *max* is exclusive here, too.\n", + "- `tune.loguniform(min, max, base=10)` samples a floating point number between *min* and *max*,\n", + " but applies a logarithmic transformation to these boundaries first. 
Thus, this makes\n", + " it easy to sample values from different orders of magnitude.\n", + "\n", + "The `num_samples=10` option we pass to the `TuneConfig()` means that we sample 10 different\n", + "hyperparameter configurations from this search space.\n", + "\n", + "The output of our training run coud look like this:\n", + "\n", + "```{code-block} bash\n", + ":emphasize-lines: 14\n", + "\n", + " Number of trials: 10/10 (10 TERMINATED)\n", + " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n", + " | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) |\n", + " |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------|\n", + " | train_breast_cancer_b63aa_00000 | TERMINATED | | 0.000117625 | 2 | 2 | 0.616347 | 0.916084 | 1 | 0.0306492 |\n", + " | train_breast_cancer_b63aa_00001 | TERMINATED | | 0.0382954 | 8 | 2 | 0.581549 | 0.937063 | 1 | 0.0357082 |\n", + " | train_breast_cancer_b63aa_00002 | TERMINATED | | 0.000217926 | 1 | 3 | 0.528428 | 0.874126 | 1 | 0.0264609 |\n", + " | train_breast_cancer_b63aa_00003 | TERMINATED | | 0.000120929 | 8 | 1 | 0.634508 | 0.958042 | 1 | 0.036406 |\n", + " | train_breast_cancer_b63aa_00004 | TERMINATED | | 0.00839715 | 5 | 1 | 0.730624 | 0.958042 | 1 | 0.0389378 |\n", + " | train_breast_cancer_b63aa_00005 | TERMINATED | | 0.000732948 | 8 | 2 | 0.915863 | 0.958042 | 1 | 0.0382841 |\n", + " | train_breast_cancer_b63aa_00006 | TERMINATED | | 0.000856226 | 4 | 1 | 0.645209 | 0.916084 | 1 | 0.0357089 |\n", + " | train_breast_cancer_b63aa_00007 | TERMINATED | | 0.00769908 | 7 | 1 | 0.729443 | 0.909091 | 1 | 0.0390737 |\n", + " | train_breast_cancer_b63aa_00008 | TERMINATED | | 0.00186339 | 5 | 3 | 0.595744 | 0.944056 | 1 | 0.0343912 |\n", + " | train_breast_cancer_b63aa_00009 
| TERMINATED | | 0.000950272 | 3 | 2 | 0.835504 | 0.965035 | 1 | 0.0348201 |\n", + " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+\n", + "```\n", + "\n", + "The best configuration we found used `eta=0.000950272`, `max_depth=3`,\n", + "`min_child_weight=2`, `subsample=0.835504` and reached an accuracy of\n", + "`0.965035`.\n", + "\n", + "## Early stopping\n", + "\n", + "Currently, Tune samples 10 different hyperparameter configurations and trains a full\n", + "XGBoost on all of them. In our small example, training is very fast. However,\n", + "if training takes longer, a significant amount of computer resources is spent on trials\n", + "that will eventually show a bad performance, e.g. a low accuracy. It would be good\n", + "if we could identify these trials early and stop them, so we don't waste any resources.\n", + "\n", + "This is where Tune's *Schedulers* shine. A Tune `TrialScheduler` is responsible\n", + "for starting and stopping trials. Tune implements a number of different schedulers, each\n", + "described {ref}`in the Tune documentation `.\n", + "For our example, we will use the `AsyncHyperBandScheduler` or `ASHAScheduler`.\n", + "\n", + "The basic idea of this scheduler: We sample a number of hyperparameter configurations.\n", + "Each of these configurations is trained for a specific number of iterations.\n", + "After these iterations, only the best performing hyperparameters are retained. These\n", + "are selected according to some loss metric, usually an evaluation loss. This cycle is\n", + "repeated until we end up with the best configuration.\n", + "\n", + "The `ASHAScheduler` needs to know three things:\n", + "\n", + "1. Which metric should be used to identify badly performing trials?\n", + "2. Should this metric be maximized or minimized?\n", + "3. 
How many iterations does each trial train for?\n", + "\n", + "There are more parameters, which are explained in the\n", + "{ref}`documentation `.\n", + "\n", + "Lastly, we have to report the loss metric to Tune. We do this with a `Callback` that\n", + "XGBoost accepts and calls after each evaluation round. Ray Tune comes\n", + "with {ref}`two XGBoost callbacks `\n", + "we can use for this. The `TuneReportCallback` just reports the evaluation\n", + "metrics back to Tune. The `TuneReportCheckpointCallback` also saves\n", + "checkpoints after each evaluation round. We will just use the latter in this\n", + "example so that we can retrieve the saved model later.\n", + "\n", + "These parameters from the `eval_metrics` configuration setting are then automatically\n", + "reported to Tune via the callback. Here, the raw error will be reported, not the accuracy.\n", + "To display the best reached accuracy, we will inverse it later.\n", + "\n", + "We will also load the best checkpointed model so that we can use it for predictions.\n", + "The best model is selected with respect to the `metric` and `mode` parameters we\n", + "pass to the `TunerConfig()`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d08b5b0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "== Status ==
    Current time: 2022-07-22 16:56:01 (running for 00:00:10.38)
    Memory usage on this node: 10.3/16.0 GiB
    Using AsyncHyperBand: num_stopped=10\n", + "Bracket: Iter 8.000: -0.5107275277792991 | Iter 4.000: -0.5876629346317344 | Iter 2.000: -0.6544494184997531 | Iter 1.000: -0.6859214191253369
    Resources requested: 0/16 CPUs, 0/0 GPUs, 0.0/4.57 GiB heap, 0.0/2.0 GiB objects
    Current best trial: c28a3_00003 with eval-logloss=0.38665050018083796 and parameters={'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 2, 'min_child_weight': 3, 'subsample': 0.782626252548841, 'eta': 0.06385952388342125}
    Result logdir: /Users/kai/ray_results/train_breast_cancer_2022-07-22_16-55-50
    Number of trials: 10/10 (10 TERMINATED)
    \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
    Trial name status loc eta max_depth min_child_weight subsample iter total time (s) eval-logloss eval-error
    train_breast_cancer_c28a3_00000TERMINATED127.0.0.1:544160.0186954 2 2 0.516916 10 0.22218 0.571496 0.0629371
    train_breast_cancer_c28a3_00001TERMINATED127.0.0.1:544400.0304404 8 2 0.745969 2 0.135674 0.650353 0.0629371
    train_breast_cancer_c28a3_00002TERMINATED127.0.0.1:544410.0217157 8 3 0.764138 2 0.173076 0.658545 0.041958
    train_breast_cancer_c28a3_00003TERMINATED127.0.0.1:544420.0638595 2 3 0.782626 10 0.281865 0.386651 0.041958
    train_breast_cancer_c28a3_00004TERMINATED127.0.0.1:544430.00442794 7 2 0.792359 1 0.0270212 0.689577 0.0699301
    train_breast_cancer_c28a3_00005TERMINATED127.0.0.1:544440.00222624 3 1 0.536331 1 0.0238512 0.691446 0.0839161
    train_breast_cancer_c28a3_00006TERMINATED127.0.0.1:544450.000825129 1 1 0.82472 1 0.015312 0.692624 0.118881
    train_breast_cancer_c28a3_00007TERMINATED127.0.0.1:544460.000770826 7 2 0.947268 1 0.0175898 0.692598 0.132867
    train_breast_cancer_c28a3_00008TERMINATED127.0.0.1:544470.000429759 7 1 0.88524 1 0.0193739 0.692785 0.0559441
    train_breast_cancer_c28a3_00009TERMINATED127.0.0.1:544480.0149863 2 1 0.722738 1 0.0165932 0.682266 0.111888


    " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result for train_breast_cancer_c28a3_00000:\n", + " date: 2022-07-22_16-55-55\n", + " done: false\n", + " eval-error: 0.08391608391608392\n", + " eval-logloss: 0.6790360066440556\n", + " experiment_id: 2a3189442db341519836a07fb2d65dd9\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54416\n", + " time_since_restore: 0.01624011993408203\n", + " time_this_iter_s: 0.01624011993408203\n", + " time_total_s: 0.01624011993408203\n", + " timestamp: 1658505355\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00000\n", + " warmup_time: 0.0035409927368164062\n", + " \n", + "Result for train_breast_cancer_c28a3_00000:\n", + " date: 2022-07-22_16-55-56\n", + " done: true\n", + " eval-error: 0.06293706293706294\n", + " eval-logloss: 0.5714958122560194\n", + " experiment_id: 2a3189442db341519836a07fb2d65dd9\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 10\n", + " node_ip: 127.0.0.1\n", + " pid: 54416\n", + " time_since_restore: 0.22218012809753418\n", + " time_this_iter_s: 0.007044076919555664\n", + " time_total_s: 0.22218012809753418\n", + " timestamp: 1658505356\n", + " timesteps_since_restore: 0\n", + " training_iteration: 10\n", + " trial_id: c28a3_00000\n", + " warmup_time: 0.0035409927368164062\n", + " \n", + "Result for train_breast_cancer_c28a3_00003:\n", + " date: 2022-07-22_16-56-01\n", + " done: false\n", + " eval-error: 0.08391608391608392\n", + " eval-logloss: 0.6472820101918041\n", + " experiment_id: 7ff6133237404b4ea4755b9f8cd114f2\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54442\n", + " time_since_restore: 0.023206233978271484\n", + " time_this_iter_s: 0.023206233978271484\n", + " time_total_s: 
0.023206233978271484\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00003\n", + " warmup_time: 0.006722211837768555\n", + " \n", + "Result for train_breast_cancer_c28a3_00005:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.08391608391608392\n", + " eval-logloss: 0.6914464114429234\n", + " experiment_id: 344762ab6d574b63a9374e19526d0510\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54444\n", + " time_since_restore: 0.02385115623474121\n", + " time_this_iter_s: 0.02385115623474121\n", + " time_total_s: 0.02385115623474121\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00005\n", + " warmup_time: 0.008936882019042969\n", + " \n", + "Result for train_breast_cancer_c28a3_00009:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.11188811188811189\n", + " eval-logloss: 0.6822656309688008\n", + " experiment_id: 133901655fa64bf79f2dcc4e8e5e41b1\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54448\n", + " time_since_restore: 0.016593217849731445\n", + " time_this_iter_s: 0.016593217849731445\n", + " time_total_s: 0.016593217849731445\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00009\n", + " warmup_time: 0.004940032958984375\n", + " \n", + "Result for train_breast_cancer_c28a3_00007:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.13286713286713286\n", + " eval-logloss: 0.6925980357023386\n", + " experiment_id: b4331027cbaf442ab905b2e51797dbbd\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54446\n", + " time_since_restore: 0.017589807510375977\n", + " time_this_iter_s: 
0.017589807510375977\n", + " time_total_s: 0.017589807510375977\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00007\n", + " warmup_time: 0.003782033920288086\n", + " \n", + "Result for train_breast_cancer_c28a3_00006:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.11888111888111888\n", + " eval-logloss: 0.6926244418104212\n", + " experiment_id: d3906de5943a4e05a4cc782382f67d24\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54445\n", + " time_since_restore: 0.015311956405639648\n", + " time_this_iter_s: 0.015311956405639648\n", + " time_total_s: 0.015311956405639648\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00006\n", + " warmup_time: 0.005506038665771484\n", + " \n", + "Result for train_breast_cancer_c28a3_00002:\n", + " date: 2022-07-22_16-56-01\n", + " done: false\n", + " eval-error: 0.04895104895104895\n", + " eval-logloss: 0.6752762102580571\n", + " experiment_id: a3645fc2d43145d88a1f5b7cc94df703\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54441\n", + " time_since_restore: 0.027367830276489258\n", + " time_this_iter_s: 0.027367830276489258\n", + " time_total_s: 0.027367830276489258\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00002\n", + " warmup_time: 0.0062830448150634766\n", + " \n", + "Result for train_breast_cancer_c28a3_00001:\n", + " date: 2022-07-22_16-56-01\n", + " done: false\n", + " eval-error: 0.07692307692307693\n", + " eval-logloss: 0.6698804135089154\n", + " experiment_id: 85766fe4d9fa482a91e396a8fd509a19\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54440\n", + " time_since_restore: 
0.017169952392578125\n", + " time_this_iter_s: 0.017169952392578125\n", + " time_total_s: 0.017169952392578125\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00001\n", + " warmup_time: 0.006204843521118164\n", + " \n", + "Result for train_breast_cancer_c28a3_00008:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.05594405594405594\n", + " eval-logloss: 0.692784742458717\n", + " experiment_id: 2c7d8bc38ad04536b1dec76819a2b3bf\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 54447\n", + " time_since_restore: 0.01937389373779297\n", + " time_this_iter_s: 0.01937389373779297\n", + " time_total_s: 0.01937389373779297\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00008\n", + " warmup_time: 0.004342079162597656\n", + " \n", + "Result for train_breast_cancer_c28a3_00001:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.06293706293706294\n", + " eval-logloss: 0.6503534216980834\n", + " experiment_id: 85766fe4d9fa482a91e396a8fd509a19\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 2\n", + " node_ip: 127.0.0.1\n", + " pid: 54440\n", + " time_since_restore: 0.13567376136779785\n", + " time_this_iter_s: 0.11850380897521973\n", + " time_total_s: 0.13567376136779785\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: c28a3_00001\n", + " warmup_time: 0.006204843521118164\n", + " \n", + "Result for train_breast_cancer_c28a3_00004:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.06993006993006994\n", + " eval-logloss: 0.689577207281873\n", + " experiment_id: ef4fdc645c444112985b4957ab8a84e9\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 1\n", + " node_ip: 127.0.0.1\n", + " pid: 
54443\n", + " time_since_restore: 0.027021169662475586\n", + " time_this_iter_s: 0.027021169662475586\n", + " time_total_s: 0.027021169662475586\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 1\n", + " trial_id: c28a3_00004\n", + " warmup_time: 0.0063669681549072266\n", + " \n", + "Result for train_breast_cancer_c28a3_00002:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.04195804195804196\n", + " eval-logloss: 0.658545415301423\n", + " experiment_id: a3645fc2d43145d88a1f5b7cc94df703\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 2\n", + " node_ip: 127.0.0.1\n", + " pid: 54441\n", + " time_since_restore: 0.17307591438293457\n", + " time_this_iter_s: 0.1457080841064453\n", + " time_total_s: 0.17307591438293457\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 2\n", + " trial_id: c28a3_00002\n", + " warmup_time: 0.0062830448150634766\n", + " \n", + "Result for train_breast_cancer_c28a3_00003:\n", + " date: 2022-07-22_16-56-01\n", + " done: true\n", + " eval-error: 0.04195804195804196\n", + " eval-logloss: 0.38665050018083796\n", + " experiment_id: 7ff6133237404b4ea4755b9f8cd114f2\n", + " hostname: Kais-MacBook-Pro.local\n", + " iterations_since_restore: 10\n", + " node_ip: 127.0.0.1\n", + " pid: 54442\n", + " time_since_restore: 0.28186488151550293\n", + " time_this_iter_s: 0.03063178062438965\n", + " time_total_s: 0.28186488151550293\n", + " timestamp: 1658505361\n", + " timesteps_since_restore: 0\n", + " training_iteration: 10\n", + " trial_id: c28a3_00003\n", + " warmup_time: 0.006722211837768555\n", + " \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-07-22 16:56:01,498\tINFO tune.py:738 -- Total run time: 10.53 seconds (10.37 seconds for the tuning loop).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model parameters: {'objective': 'binary:logistic', 
'eval_metric': ['logloss', 'error'], 'max_depth': 2, 'min_child_weight': 3, 'subsample': 0.782626252548841, 'eta': 0.06385952388342125}\n", + "Best model total accuracy: 0.9580\n" + ] + } + ], + "source": [ + "import sklearn.datasets\n", + "import sklearn.metrics\n", + "import os\n", + "from ray.tune.schedulers import ASHAScheduler\n", + "from sklearn.model_selection import train_test_split\n", + "import xgboost as xgb\n", + "\n", + "from ray import air, tune\n", + "from ray.air import session\n", + "from ray.tune.integration.xgboost import TuneReportCheckpointCallback\n", + "\n", + "\n", + "def train_breast_cancer(config: dict):\n", + " # This is a simple training function to be passed into Tune\n", + " # Load dataset\n", + " data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)\n", + " # Split into train and test set\n", + " train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)\n", + " # Build input matrices for XGBoost\n", + " train_set = xgb.DMatrix(train_x, label=train_y)\n", + " test_set = xgb.DMatrix(test_x, label=test_y)\n", + " # Train the classifier, using the Tune callback\n", + " xgb.train(\n", + " config,\n", + " train_set,\n", + " evals=[(test_set, \"eval\")],\n", + " verbose_eval=False,\n", + " callbacks=[TuneReportCheckpointCallback(filename=\"model.xgb\")],\n", + " )\n", + "\n", + "\n", + "def get_best_model_checkpoint(results):\n", + " best_bst = xgb.Booster()\n", + " best_result = results.get_best_result()\n", + "\n", + " with best_result.checkpoint.as_directory() as best_checkpoint_dir:\n", + " best_bst.load_model(os.path.join(best_checkpoint_dir, \"model.xgb\"))\n", + " accuracy = 1.0 - best_result.metrics[\"eval-error\"]\n", + " print(f\"Best model parameters: {best_result.config}\")\n", + " print(f\"Best model total accuracy: {accuracy:.4f}\")\n", + " return best_bst\n", + "\n", + "\n", + "def tune_xgboost(smoke_test=False):\n", + " search_space = {\n", + " # You can mix constants with search space 
objects.\n", + " \"objective\": \"binary:logistic\",\n", + " \"eval_metric\": [\"logloss\", \"error\"],\n", + " \"max_depth\": tune.randint(1, 9),\n", + " \"min_child_weight\": tune.choice([1, 2, 3]),\n", + " \"subsample\": tune.uniform(0.5, 1.0),\n", + " \"eta\": tune.loguniform(1e-4, 1e-1),\n", + " }\n", + " # This will enable aggressive early stopping of bad trials.\n", + " scheduler = ASHAScheduler(\n", + " max_t=10, grace_period=1, reduction_factor=2 # 10 training iterations\n", + " )\n", + "\n", + " tuner = tune.Tuner(\n", + " train_breast_cancer,\n", + " tune_config=tune.TuneConfig(\n", + " metric=\"eval-logloss\",\n", + " mode=\"min\",\n", + " scheduler=scheduler,\n", + " num_samples=1 if smoke_test else 10,\n", + " ),\n", + " param_space=search_space,\n", + " )\n", + " results = tuner.fit()\n", + "\n", + " return results\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " import argparse\n", + "\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\n", + " \"--smoke-test\", action=\"store_true\", help=\"Finish quickly for testing\"\n", + " )\n", + " args, _ = parser.parse_known_args()\n", + "\n", + " results = tune_xgboost(smoke_test=args.smoke_test)\n", + "\n", + " # Load the best model checkpoint.\n", + " best_bst = get_best_model_checkpoint(results)\n", + "\n", + " # You could now do further predictions with\n", + " # best_bst.predict(...)\n" + ] + }, + { + "cell_type": "markdown", + "id": "20732fe4", + "metadata": {}, + "source": [ + "The output of our run could look like this:\n", + "\n", + "```{code-block} bash\n", + ":emphasize-lines: 7\n", + "\n", + " Number of trials: 10/10 (10 TERMINATED)\n", + " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n", + " | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | iter | total time (s) | eval-logloss | eval-error |\n", + " 
|---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------|\n", + " | train_breast_cancer_ba275_00000 | TERMINATED | | 0.00205087 | 2 | 1 | 0.898391 | 10 | 0.380619 | 0.678039 | 0.090909 |\n", + " | train_breast_cancer_ba275_00001 | TERMINATED | | 0.000183834 | 4 | 3 | 0.924939 | 1 | 0.0228798 | 0.693009 | 0.111888 |\n", + " | train_breast_cancer_ba275_00002 | TERMINATED | | 0.0242721 | 7 | 2 | 0.501551 | 10 | 0.376154 | 0.54472 | 0.06993 |\n", + " | train_breast_cancer_ba275_00003 | TERMINATED | | 0.000449692 | 5 | 3 | 0.890212 | 1 | 0.0234981 | 0.692811 | 0.090909 |\n", + " | train_breast_cancer_ba275_00004 | TERMINATED | | 0.000376393 | 7 | 2 | 0.883609 | 1 | 0.0231569 | 0.692847 | 0.062937 |\n", + " | train_breast_cancer_ba275_00005 | TERMINATED | | 0.00231942 | 3 | 3 | 0.877464 | 2 | 0.104867 | 0.689541 | 0.083916 |\n", + " | train_breast_cancer_ba275_00006 | TERMINATED | | 0.000542326 | 1 | 2 | 0.578584 | 1 | 0.0213971 | 0.692765 | 0.083916 |\n", + " | train_breast_cancer_ba275_00007 | TERMINATED | | 0.0016801 | 1 | 2 | 0.975302 | 1 | 0.02226 | 0.691999 | 0.083916 |\n", + " | train_breast_cancer_ba275_00008 | TERMINATED | | 0.000595756 | 8 | 3 | 0.58429 | 1 | 0.0221152 | 0.692657 | 0.06993 |\n", + " | train_breast_cancer_ba275_00009 | TERMINATED | | 0.000357845 | 8 | 1 | 0.637776 | 1 | 0.022635 | 0.692859 | 0.090909 |\n", + " +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+--------+------------------+----------------+--------------+\n", + "\n", + "\n", + " Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5015513240240503, 'eta': 0.024272050872920895}\n", + " Best model total accuracy: 0.9301\n", + "```\n", + "\n", + "As you can see, most trials have been stopped only after a 
few iterations. Only the\n", + "two most promising trials were run for the full 10 iterations.\n", + "\n", + "You can also ensure that all available resources are being used as the scheduler\n", + "terminates trials, freeing them up. This can be done through the\n", + "`ResourceChangingScheduler`. An example of this can be found here:\n", + "{doc}`/tune/examples/includes/xgboost_dynamic_resources_example`.\n", + "\n", + "## Using fractional GPUs\n", + "\n", + "You can often accelerate your training by using GPUs in addition to CPUs. However,\n", + "you usually don't have as many GPUs as you have trials to run. For instance, if you\n", + "run 10 Tune trials in parallel, you usually don't have access to 10 separate GPUs.\n", + "\n", + "Tune supports *fractional GPUs*. This means that each task is assigned a fraction\n", + "of the GPU memory for training. For 10 tasks, this could look like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d1b20a3", + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"objective\": \"binary:logistic\",\n", + " \"eval_metric\": [\"logloss\", \"error\"],\n", + " \"tree_method\": \"gpu_hist\",\n", + " \"max_depth\": tune.randint(1, 9),\n", + " \"min_child_weight\": tune.choice([1, 2, 3]),\n", + " \"subsample\": tune.uniform(0.5, 1.0),\n", + " \"eta\": tune.loguniform(1e-4, 1e-1),\n", + "}\n", + "\n", + "tuner = tune.Tuner(\n", + " tune.with_resources(train_breast_cancer, resources={\"cpu\": 1, \"gpu\": 0.1}),\n", + " tune_config=tune.TuneConfig(\n", + " num_samples=10,\n", + " ),\n", + " param_space=config,\n", + ")\n", + "results = tuner.fit()\n" + ] + }, + { + "cell_type": "markdown", + "id": "ee131861", + "metadata": {}, + "source": [ + "Each task thus works with 10% of the available GPU memory. 
You also have to tell\n", + "XGBoost to use the `gpu_hist` tree method, so it knows it should use the GPU.\n", + "\n", + "## Conclusion\n", + "\n", + "You should now have a basic understanding on how to train XGBoost models and on how\n", + "to tune the hyperparameters to yield the best results. In our simple example,\n", + "Tuning the parameters didn't make a huge difference for the accuracy.\n", + "But in larger applications, intelligent hyperparameter tuning can make the\n", + "difference between a model that doesn't seem to learn at all, and a model\n", + "that outperforms all the other ones.\n", + "\n", + "## More XGBoost Examples\n", + "\n", + "- {doc}`/tune/examples/includes/xgboost_dynamic_resources_example`:\n", + " Trains a basic XGBoost model with Tune with the class-based API and a ResourceChangingScheduler, ensuring all resources are being used at all time.\n", + "\n", + "## Learn More\n", + "\n", + "- [XGBoost Hyperparameter Tuning - A Visual Guide](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)\n", + "- [Notes on XGBoost Parameter Tuning](https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html)\n", + "- [Doing XGBoost Hyperparameter Tuning the smart way](https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ray_dev_py38", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:05:16) \n[Clang 12.0.1 ]" + }, + "orphan": true, + "vscode": { + "interpreter": { + "hash": "265d195fda5292fe8f69c6e37c435a5634a1ed3b6799724e66a975f68fa21517" + } + } + }, + 
"nbformat": 4, + "nbformat_minor": 5 } diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 66b67ab6bcc9..74c5ea165615 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -15,76 +15,78 @@ Tune further integrates with a wide range of additional hyperparameter optimizat **Click on the following tabs to see code examples for various machine learning frameworks**: -.. tabbed:: Quickstart +.. tab-set:: - To run this example, install the following: ``pip install "ray[tune]"``. + .. tab-item:: Quickstart - In this quick-start example you `minimize` a simple function of the form ``f(x) = a**2 + b``, our `objective` function. - The closer ``a`` is to zero and the smaller ``b`` is, the smaller the total value of ``f(x)``. - We will define a so-called `search space` for ``a`` and ``b`` and let Ray Tune explore the space for good values. + To run this example, install the following: ``pip install "ray[tune]"``. - .. callout:: + In this quick-start example you `minimize` a simple function of the form ``f(x) = a**2 + b``, our `objective` function. + The closer ``a`` is to zero and the smaller ``b`` is, the smaller the total value of ``f(x)``. + We will define a so-called `search space` for ``a`` and ``b`` and let Ray Tune explore the space for good values. - .. literalinclude:: ../../../python/ray/tune/tests/example.py - :language: python - :start-after: __quick_start_begin__ - :end-before: __quick_start_end__ + .. callout:: - .. annotations:: - <1> Define an objective function. + .. literalinclude:: ../../../python/ray/tune/tests/example.py + :language: python + :start-after: __quick_start_begin__ + :end-before: __quick_start_end__ - <2> Define a search space. + .. annotations:: + <1> Define an objective function. - <3> Start a Tune run and print the best result. + <2> Define a search space. + <3> Start a Tune run and print the best result. -.. 
tabbed:: Keras+Hyperopt - To tune your Keras models with Hyperopt, you wrap your model in an objective function whose ``config`` you - can access for selecting hyperparameters. - In the example below we only tune the ``activation`` parameter of the first layer of the model, but you can - tune any parameter of the model you want. - After defining the search space, you can simply initialize the ``HyperOptSearch`` object and pass it to ``run``. - It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it. + .. tab-item:: Keras+Hyperopt - .. callout:: + To tune your Keras models with Hyperopt, you wrap your model in an objective function whose ``config`` you + can access for selecting hyperparameters. + In the example below we only tune the ``activation`` parameter of the first layer of the model, but you can + tune any parameter of the model you want. + After defining the search space, you can simply initialize the ``HyperOptSearch`` object and pass it to ``run``. + It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it. - .. literalinclude:: doc_code/keras_hyperopt.py - :language: python - :start-after: __keras_hyperopt_start__ - :end-before: __keras_hyperopt_end__ + .. callout:: - .. annotations:: - <1> Wrap a Keras model in an objective function. + .. literalinclude:: doc_code/keras_hyperopt.py + :language: python + :start-after: __keras_hyperopt_start__ + :end-before: __keras_hyperopt_end__ - <2> Define a search space and initialize the search algorithm. + .. annotations:: + <1> Wrap a Keras model in an objective function. - <3> Start a Tune run that maximizes accuracy. + <2> Define a search space and initialize the search algorithm. -.. tabbed:: PyTorch+Optuna + <3> Start a Tune run that maximizes accuracy. 
- To tune your PyTorch models with Optuna, you wrap your model in an objective function whose ``config`` you - can access for selecting hyperparameters. - In the example below we only tune the ``momentum`` and learning rate (``lr``) parameters of the model's optimizer, - but you can tune any other model parameter you want. - After defining the search space, you can simply initialize the ``OptunaSearch`` object and pass it to ``run``. - It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it. - We stop tuning this training run after ``5`` iterations, but you can easily define other stopping rules as well. + .. tab-item:: PyTorch+Optuna + To tune your PyTorch models with Optuna, you wrap your model in an objective function whose ``config`` you + can access for selecting hyperparameters. + In the example below we only tune the ``momentum`` and learning rate (``lr``) parameters of the model's optimizer, + but you can tune any other model parameter you want. + After defining the search space, you can simply initialize the ``OptunaSearch`` object and pass it to ``run``. + It's important to tell Ray Tune which metric you want to optimize and whether you want to maximize or minimize it. + We stop tuning this training run after ``5`` iterations, but you can easily define other stopping rules as well. - .. callout:: - .. literalinclude:: doc_code/pytorch_optuna.py - :language: python - :start-after: __pytorch_optuna_start__ - :end-before: __pytorch_optuna_end__ + .. callout:: - .. annotations:: - <1> Wrap a PyTorch model in an objective function. + .. literalinclude:: doc_code/pytorch_optuna.py + :language: python + :start-after: __pytorch_optuna_start__ + :end-before: __pytorch_optuna_end__ - <2> Define a search space and initialize the search algorithm. + .. annotations:: + <1> Wrap a PyTorch model in an objective function. - <3> Start a Tune run that maximizes mean accuracy and stops after 5 iterations. 
+ <2> Define a search space and initialize the search algorithm. + + <3> Start a Tune run that maximizes mean accuracy and stops after 5 iterations. With Tune you can also launch a multi-node :ref:`distributed hyperparameter sweep ` in less than 10 lines of code. @@ -93,87 +95,105 @@ And you can move your models from training to serving on the same infrastructure .. _`Ray Serve`: ../serve/index.html -.. panels:: - :container: text-center - :column: col-md-4 px-2 py-2 - :card: +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 + + .. grid-item-card:: + + **Getting Started** + ^^^ + + In our getting started tutorial you will learn how to tune a PyTorch model + effectively with Tune. + + +++ + .. button-ref:: tune-tutorial + :color: primary + :outline: + :expand: + + Get Started with Tune + + .. grid-item-card:: + + **Key Concepts** + ^^^ + + Understand the key concepts behind Ray Tune. + Learn about tune runs, search algorithms, schedulers and other features. + + +++ + .. button-ref:: tune-60-seconds + :color: primary + :outline: + :expand: + + Tune's Key Concepts + + .. grid-item-card:: + + **User Guides** + ^^^ - **Getting Started** - ^^^ + Our guides teach you about key features of Tune, + such as distributed training or early stopping. - In our getting started tutorial you will learn how to tune a PyTorch model - effectively with Tune. - +++ - .. link-button:: tune-tutorial - :type: ref - :text: Get Started with Tune - :classes: btn-outline-info btn-block - --- + +++ + .. button-ref:: tune-guides + :color: primary + :outline: + :expand: - **Key Concepts** - ^^^ + Learn How To Use Tune - Understand the key concepts behind Ray Tune. - Learn about tune runs, search algorithms, schedulers and other features. + .. grid-item-card:: - +++ - .. 
link-button:: tune-60-seconds - :type: ref - :text: Tune's Key Concepts - :classes: btn-outline-info btn-block - --- + **Examples** + ^^^ - **User Guides** - ^^^ + In our examples you can find practical tutorials for using frameworks such as + scikit-learn, Keras, TensorFlow, PyTorch, and mlflow, and state of the art search algorithm integrations. - Our guides teach you about key features of Tune, - such as distributed training or early stopping. + +++ + .. button-ref:: tune-examples-ref + :color: primary + :outline: + :expand: + Ray Tune Examples - +++ - .. link-button:: tune-guides - :type: ref - :text: Learn How To Use Tune - :classes: btn-outline-info btn-block - --- + .. grid-item-card:: - **Examples** - ^^^ + **Ray Tune FAQ** + ^^^ - In our examples you can find practical tutorials for using frameworks such as - scikit-learn, Keras, TensorFlow, PyTorch, and mlflow, and state of the art search algorithm integrations. + Find answers to commonly asked questions in our detailed FAQ. - +++ - .. link-button:: tune-examples-ref - :type: ref - :text: Ray Tune Examples - :classes: btn-outline-info btn-block - --- + +++ + .. button-ref:: tune-faq + :color: primary + :outline: + :expand: - **Ray Tune FAQ** - ^^^ + Ray Tune FAQ - Find answers to commonly asked questions in our detailed FAQ. + .. grid-item-card:: - +++ - .. link-button:: tune-faq - :type: ref - :text: Ray Tune FAQ - :classes: btn-outline-info btn-block - --- + **Ray Tune API** + ^^^ - **Ray Tune API** - ^^^ + Get more in-depth information about the Ray Tune API, including all about search spaces, + algorithms and training configurations. - Get more in-depth information about the Ray Tune API, including all about search spaces, - algorithms and training configurations. + +++ + .. button-ref:: tune-api-ref + :color: primary + :outline: + :expand: - +++ - .. link-button:: tune-api-ref - :type: ref - :text: Read the API Reference - :classes: btn-outline-info btn-block + Read the API Reference Why choose Tune? 
diff --git a/doc/source/tune/key-concepts.rst b/doc/source/tune/key-concepts.rst index ee559652943a..dde9f2ea5483 100644 --- a/doc/source/tune/key-concepts.rst +++ b/doc/source/tune/key-concepts.rst @@ -38,33 +38,35 @@ hyperparameters we want to tune to `minimize` the objective. Since the objective also has a variable ``x``, we need to test for different values of ``x``. Given concrete choices for ``a``, ``b`` and ``x`` we can evaluate the objective function and get a `score` to minimize. -.. tabbed:: Function API +.. tab-set:: - With the :ref:`the function-based API ` you create a function (here called ``trainable``) that - takes in a dictionary of hyperparameters. - This function computes a ``score`` in a "training loop" and `reports` this score back to Tune: + .. tab-item:: Function API - .. literalinclude:: doc_code/key_concepts.py - :language: python - :start-after: __function_api_start__ - :end-before: __function_api_end__ + With the :ref:`the function-based API ` you create a function (here called ``trainable``) that + takes in a dictionary of hyperparameters. + This function computes a ``score`` in a "training loop" and `reports` this score back to Tune: - Note that we use ``session.report(...)`` to report the intermediate ``score`` in the training loop, which can be useful - in many machine learning tasks. - If you just want to report the final ``score`` outside of this loop, you can simply return the score at the - end of the ``trainable`` function with ``return {"score": score}``. - You can also use ``yield {"score": score}`` instead of ``session.report()``. + .. literalinclude:: doc_code/key_concepts.py + :language: python + :start-after: __function_api_start__ + :end-before: __function_api_end__ -.. tabbed:: Class API + Note that we use ``session.report(...)`` to report the intermediate ``score`` in the training loop, which can be useful + in many machine learning tasks. 
+ If you just want to report the final ``score`` outside of this loop, you can simply return the score at the + end of the ``trainable`` function with ``return {"score": score}``. + You can also use ``yield {"score": score}`` instead of ``session.report()``. - Here's an example of specifying the objective function using the :ref:`class-based API `: + .. tab-item:: Class API - .. literalinclude:: doc_code/key_concepts.py - :language: python - :start-after: __class_api_start__ - :end-before: __class_api_end__ + Here's an example of specifying the objective function using the :ref:`class-based API `: - .. tip:: ``session.report`` can't be used within a ``Trainable`` class. + .. literalinclude:: doc_code/key_concepts.py + :language: python + :start-after: __class_api_start__ + :end-before: __class_api_end__ + + .. tip:: ``session.report`` can't be used within a ``Trainable`` class. Learn more about the details of :ref:`Trainables here ` and :ref:`have a look at our examples `. diff --git a/doc/source/tune/tutorials/overview.rst b/doc/source/tune/tutorials/overview.rst index ef3dc851fbf6..6936d0ca38dc 100644 --- a/doc/source/tune/tutorials/overview.rst +++ b/doc/source/tune/tutorials/overview.rst @@ -15,128 +15,127 @@ You can follow our :ref:`Tune Feature Guides `, but can als Tune Feature Guides ------------------- -.. panels:: - :container: container pb-4 - :column: col-md-4 px-2 py-2 - :img-top-cls: pt-5 w-50 d-block mx-auto - --- - :img-top: /images/tune.png - - .. link-button:: tune-run - :type: ref - :text: Running Basic Experiments - :classes: btn-link btn-block stretched-link +.. grid:: 1 2 3 4 + :gutter: 1 + :class-container: container pb-3 - --- - :img-top: /images/tune.png - - .. link-button:: tune-output - :type: ref - :text: Logging Tune Runs - :classes: btn-link btn-block stretched-link + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png - - .. 
link-button:: tune-resources - :type: ref - :text: Setting Trial Resources - :classes: btn-link btn-block stretched-link + .. button-ref:: tune-run - --- - :img-top: /images/tune.png + Running Basic Experiments - .. link-button:: tune-search-space-tutorial - :type: ref - :text: Using Search Spaces - :classes: btn-link btn-block stretched-link + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png + .. button-ref:: tune-output - .. link-button:: tune-stopping - :type: ref - :text: How to Define Stopping Criteria for a Ray Tune Experiment - :classes: btn-link btn-block stretched-link + Logging Tune Runs - --- - :img-top: /images/tune.png + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - .. link-button:: tune-trial-checkpoints - :type: ref - :text: How to Save and Load Trial Checkpoints - :classes: btn-link btn-block stretched-link + .. button-ref:: tune-resources - --- - :img-top: /images/tune.png + Setting Trial Resources - .. link-button:: tune-storage - :type: ref - :text: How to Configure Storage Options for a Distributed Tune Experiment - :classes: btn-link btn-block stretched-link + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png + .. button-ref:: tune-search-space-tutorial - .. link-button:: tune-fault-tolerance - :type: ref - :text: How to Enable Fault Tolerance in Ray Tune - :classes: btn-link btn-block stretched-link + Using Search Spaces - --- - :img-top: /images/tune.png + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - .. link-button:: tune-metrics - :type: ref - :text: Using Callbacks and Metrics - :classes: btn-link btn-block stretched-link + .. 
button-ref:: tune-stopping - --- - :img-top: /images/tune.png + How to Define Stopping Criteria for a Ray Tune Experiment - .. link-button:: ../tutorials/tune_get_data_in_and_out - :type: ref - :text: Getting Data in and out of Tune - :classes: btn-link btn-block stretched-link + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png + .. button-ref:: tune-trial-checkpoints - .. link-button:: ../examples/tune_analyze_results - :type: ref - :text: Analyzing Tune Experiment Results - :classes: btn-link btn-block stretched-link + How to Save and Load Trial Checkpoints - --- - :img-top: /images/tune.png + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - .. link-button:: ../examples/pbt_guide - :type: ref - :text: A Guide to Population-Based Training - :classes: btn-link btn-block stretched-link + .. button-ref:: tune-storage - --- - :img-top: /images/tune.png + How to Configure Storage Options for a Distributed Tune Experiment - .. link-button:: tune-distributed - :type: ref - :text: Deploying Tune in the Cloud - :classes: btn-link btn-block stretched-link + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - --- - :img-top: /images/tune.png + .. button-ref:: tune-fault-tolerance - .. link-button:: tune-lifecycle - :type: ref - :text: Tune Architecture - :classes: btn-link btn-block stretched-link + How to Enable Fault Tolerance in Ray Tune - --- - :img-top: /images/tune.png + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img - .. link-button:: tune-scalability - :type: ref - :text: Scalability Benchmarks - :classes: btn-link btn-block stretched-link + .. button-ref:: tune-metrics + Using Callbacks and Metrics + + .. 
grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ../tutorials/tune_get_data_in_and_out + + Getting Data in and out of Tune + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ../examples/tune_analyze_results + + Analyzing Tune Experiment Results + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: ../examples/pbt_guide + + A Guide to Population-Based Training + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-distributed + + Deploying Tune in the Cloud + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-lifecycle + + Tune Architecture + + .. grid-item-card:: + :img-top: /images/tune.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. button-ref:: tune-scalability + + Scalability Benchmarks diff --git a/doc/source/tune/tutorials/tune-fault-tolerance.rst b/doc/source/tune/tutorials/tune-fault-tolerance.rst index eff8d9f922a0..e2ba45831338 100644 --- a/doc/source/tune/tutorials/tune-fault-tolerance.rst +++ b/doc/source/tune/tutorials/tune-fault-tolerance.rst @@ -162,8 +162,8 @@ To restore, we just need to re-specify the ``param_space`` via :meth:`Tuner.rest .. note:: - If you're tuning over :ref:`Ray Datasets `, you'll also need to re-specify them in the ``param_space``. - Ray Datasets can contain object references, so the same problems described above apply. + If you're tuning over :ref:`Ray Data `, you'll also need to re-specify them in the ``param_space``. + Ray Data can contain object references, so the same problems described above apply. 
See below for an example: diff --git a/doc/source/tune/tutorials/tune-output.rst b/doc/source/tune/tutorials/tune-output.rst index 8091267a8ad4..19f0674e1ab5 100644 --- a/doc/source/tune/tutorials/tune-output.rst +++ b/doc/source/tune/tutorials/tune-output.rst @@ -191,73 +191,75 @@ You can save trial artifacts directly in the trainable, as shown below: .. tip:: Make sure that any logging calls or objects stay within scope of the Trainable. You may see pickling or other serialization errors or inconsistent logs otherwise. -.. tabbed:: Function API +.. tab-set:: - .. code-block:: python - - import logging_library # ex: mlflow, wandb - from ray.air import session - - def trainable(config): - logging_library.init( - name=trial_id, - id=trial_id, - resume=trial_id, - reinit=True, - allow_val_change=True) - logging_library.set_log_path(os.getcwd()) - - for step in range(100): - logging_library.log_model(...) - logging_library.log(results, step=step) - - # You can also just write to a file directly. - # The working directory is set to the trial directory, so - # you don't need to worry about multiple workers saving - # to the same location. - with open(f"./artifact_{step}.txt", "w") as f: - f.write("Artifact Data") + .. tab-item:: Function API - session.report(results) - - -.. tabbed:: Class API - - .. code-block:: python + .. code-block:: python - import logging_library # ex: mlflow, wandb - from ray import tune + import logging_library # ex: mlflow, wandb + from ray.air import session - class CustomLogging(tune.Trainable) - def setup(self, config): - trial_id = self.trial_id + def trainable(config): logging_library.init( name=trial_id, id=trial_id, resume=trial_id, reinit=True, - allow_val_change=True - ) + allow_val_change=True) logging_library.set_log_path(os.getcwd()) - def step(self): - logging_library.log_model(...) - - # You can also write to a file directly. 
- # The working directory is set to the trial directory, so - # you don't need to worry about multiple workers saving - # to the same location. - with open(f"./artifact_{self.iteration}.txt", "w") as f: - f.write("Artifact Data") - - def log_result(self, result): - res_dict = { - str(k): v - for k, v in result.items() - if (v and "config" not in k and not isinstance(v, str)) - } - step = result["training_iteration"] - logging_library.log(res_dict, step=step) + for step in range(100): + logging_library.log_model(...) + logging_library.log(results, step=step) + + # You can also just write to a file directly. + # The working directory is set to the trial directory, so + # you don't need to worry about multiple workers saving + # to the same location. + with open(f"./artifact_{step}.txt", "w") as f: + f.write("Artifact Data") + + session.report(results) + + + .. tab-item:: Class API + + .. code-block:: python + + import logging_library # ex: mlflow, wandb + from ray import tune + + class CustomLogging(tune.Trainable) + def setup(self, config): + trial_id = self.trial_id + logging_library.init( + name=trial_id, + id=trial_id, + resume=trial_id, + reinit=True, + allow_val_change=True + ) + logging_library.set_log_path(os.getcwd()) + + def step(self): + logging_library.log_model(...) + + # You can also write to a file directly. + # The working directory is set to the trial directory, so + # you don't need to worry about multiple workers saving + # to the same location. + with open(f"./artifact_{self.iteration}.txt", "w") as f: + f.write("Artifact Data") + + def log_result(self, result): + res_dict = { + str(k): v + for k, v in result.items() + if (v and "config" not in k and not isinstance(v, str)) + } + step = result["training_iteration"] + logging_library.log(res_dict, step=step) In the code snippet above, ``logging_library`` refers to whatever 3rd party logging library you are using. 
diff --git a/doc/source/tune/tutorials/tune-stopping.rst b/doc/source/tune/tutorials/tune-stopping.rst index 72720cfff58f..cbe5fabddb15 100644 --- a/doc/source/tune/tutorials/tune-stopping.rst +++ b/doc/source/tune/tutorials/tune-stopping.rst @@ -48,59 +48,61 @@ In addition to manual stopping, Tune provides several ways to stop experiments p You can implement the stopping criteria using either a dictionary, a function, or a custom :class:`Stopper `. -.. tabbed:: Dictionary +.. tab-set:: - If a dictionary is passed in, the keys may be any field in the return result of ``session.report`` in the - Function API or ``step()`` in the Class API. + .. tab-item:: Dictionary - .. note:: + If a dictionary is passed in, the keys may be any field in the return result of ``session.report`` in the + Function API or ``step()`` in the Class API. - This includes :ref:`auto-filled metrics ` such as ``training_iteration``. + .. note:: - In the example below, each trial will be stopped either when it completes ``10`` iterations or when it - reaches a mean accuracy of ``0.8`` or more. + This includes :ref:`auto-filled metrics ` such as ``training_iteration``. - These metrics are assumed to be **increasing**, so the trial will stop once the reported metric has exceeded the threshold specified in the dictionary. + In the example below, each trial will be stopped either when it completes ``10`` iterations or when it + reaches a mean accuracy of ``0.8`` or more. - .. literalinclude:: /tune/doc_code/stopping.py - :language: python - :start-after: __stopping_dict_start__ - :end-before: __stopping_dict_end__ + These metrics are assumed to be **increasing**, so the trial will stop once the reported metric has exceeded the threshold specified in the dictionary. -.. tabbed:: User-defined Function + .. 
literalinclude:: /tune/doc_code/stopping.py + :language: python + :start-after: __stopping_dict_start__ + :end-before: __stopping_dict_end__ - For more flexibility, you can pass in a function instead. - If a function is passed in, it must take ``(trial_id: str, result: dict)`` as arguments and return a boolean - (``True`` if trial should be stopped and ``False`` otherwise). + .. tab-item:: User-defined Function - In the example below, each trial will be stopped either when it completes ``10`` iterations or when it - reaches a mean accuracy of ``0.8`` or more. + For more flexibility, you can pass in a function instead. + If a function is passed in, it must take ``(trial_id: str, result: dict)`` as arguments and return a boolean + (``True`` if trial should be stopped and ``False`` otherwise). - .. literalinclude:: /tune/doc_code/stopping.py - :language: python - :start-after: __stopping_fn_start__ - :end-before: __stopping_fn_end__ + In the example below, each trial will be stopped either when it completes ``10`` iterations or when it + reaches a mean accuracy of ``0.8`` or more. -.. tabbed:: Custom Stopper Class + .. literalinclude:: /tune/doc_code/stopping.py + :language: python + :start-after: __stopping_fn_start__ + :end-before: __stopping_fn_end__ - Finally, you can implement the :class:`~ray.tune.stopper.Stopper` interface for - stopping individual trials or even entire experiments based on custom stopping - criteria. For example, the following example stops all trials after the criteria - is achieved by any individual trial and prevents new ones from starting: + .. tab-item:: Custom Stopper Class - .. literalinclude:: /tune/doc_code/stopping.py - :language: python - :start-after: __stopping_cls_start__ - :end-before: __stopping_cls_end__ + Finally, you can implement the :class:`~ray.tune.stopper.Stopper` interface for + stopping individual trials or even entire experiments based on custom stopping + criteria. 
For example, the following example stops all trials after the criteria + is achieved by any individual trial and prevents new ones from starting: - In the example, once any trial reaches a ``mean_accuracy`` of 0.8 or more, all trials will stop. + .. literalinclude:: /tune/doc_code/stopping.py + :language: python + :start-after: __stopping_cls_start__ + :end-before: __stopping_cls_end__ - .. note:: + In the example, once any trial reaches a ``mean_accuracy`` of 0.8 or more, all trials will stop. - When returning ``True`` from ``stop_all``, currently running trials will not stop immediately. - They will stop after finishing their ongoing training iteration (after ``session.report`` or ``step``). + .. note:: - Ray Tune comes with a set of out-of-the-box stopper classes. See the :ref:`Stopper ` documentation. + When returning ``True`` from ``stop_all``, currently running trials will not stop immediately. + They will stop after finishing their ongoing training iteration (after ``session.report`` or ``step``). + + Ray Tune comes with a set of out-of-the-box stopper classes. See the :ref:`Stopper ` documentation. Stop trials after a certain amount of time diff --git a/doc/source/tune/tutorials/tune-storage.rst b/doc/source/tune/tutorials/tune-storage.rst index ddeb0dd66586..dcb61f910cb3 100644 --- a/doc/source/tune/tutorials/tune-storage.rst +++ b/doc/source/tune/tutorials/tune-storage.rst @@ -137,7 +137,6 @@ then all experiment outputs can be saved in a shared cloud bucket. We can configure cloud storage by telling Ray Tune to **upload to a remote** ``storage_path``: .. code-block:: python - :emphasize-lines: 8, 9, 10, 11 from ray import tune from ray.air.config import RunConfig @@ -154,7 +153,7 @@ We can configure cloud storage by telling Ray Tune to **upload to a remote** ``s Ray AIR automatically configures a default syncer that uses pyarrow to perform syncing with the specified cloud ``storage_path``. 
You can also pass a custom :class:`Syncer ` object -to the :class:`tune.SyncConfig ` +to a :class:`tune.SyncConfig ` within the :class:`air.RunConfig ` if you want to implement custom logic for uploading/downloading from the cloud. See :ref:`tune-cloud-syncing` and :ref:`tune-cloud-syncing-command-line-example` for more details and examples of custom syncing. @@ -213,8 +212,6 @@ that implements saving and loading checkpoints. # We recommend cloud storage checkpointing as it survives the cluster when # instances are terminated and has better performance. storage_path="s3://my-checkpoints-bucket/path/", - # See above! we will sync our checkpoints to S3 directory - sync_config=sync_config, checkpoint_config=air.CheckpointConfig( # We'll keep the best five checkpoints at all times # (with the highest AUC scores, a metric reported by the trainable) diff --git a/doc/source/tune/tutorials/tune-trial-checkpoints.rst b/doc/source/tune/tutorials/tune-trial-checkpoints.rst index 976e1377e5b8..07cd22f2247f 100644 --- a/doc/source/tune/tutorials/tune-trial-checkpoints.rst +++ b/doc/source/tune/tutorials/tune-trial-checkpoints.rst @@ -27,19 +27,21 @@ To create an AIR checkpoint, one can either use :meth:`~ray.air.checkpoint.Check checkpoint is synced to driver node or the cloud. We are planning to work on it to address the issue. -.. tabbed:: Checkpoint a dictionary +.. tab-set:: - .. literalinclude:: /tune/doc_code/trial_checkpoint.py - :language: python - :start-after: __function_api_checkpointing_start__ - :end-before: __function_api_checkpointing_end__ + .. tab-item:: Checkpoint a dictionary -.. tabbed:: Checkpoint a directory + .. literalinclude:: /tune/doc_code/trial_checkpoint.py + :language: python + :start-after: __function_api_checkpointing_start__ + :end-before: __function_api_checkpointing_end__ - .. 
literalinclude:: /tune/doc_code/trial_checkpoint.py - :language: python - :start-after: __function_api_checkpointing_from_dir_start__ - :end-before: __function_api_checkpointing_from_dir_end__ + .. tab-item:: Checkpoint a directory + + .. literalinclude:: /tune/doc_code/trial_checkpoint.py + :language: python + :start-after: __function_api_checkpointing_from_dir_start__ + :end-before: __function_api_checkpointing_from_dir_end__ In the above code snippet: diff --git a/doc/source/tune/tutorials/tune_get_data_in_and_out.md b/doc/source/tune/tutorials/tune_get_data_in_and_out.md index 9ee48d06fccf..477725e0e7b2 100644 --- a/doc/source/tune/tutorials/tune_get_data_in_and_out.md +++ b/doc/source/tune/tutorials/tune_get_data_in_and_out.md @@ -71,7 +71,7 @@ For example, passing in a large pandas DataFrame or an unserializable model obje Instead, use strings or other identifiers as your values, and initialize/load the objects inside your Trainable directly depending on those. ```{note} -[Ray Datasets](datasets_getting_started) can be used as values in the search space directly. +[Datasets](data_getting_started) can be used as values in the search space directly. ``` In our example, we want to tune the two model hyperparameters. We also want to set the number of epochs, so that we can easily tweak it later. For the hyperparameters, we will use the `tune.uniform` distribution. We will also modify the `training_function` to obtain those values from the `config` dictionary. diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index b87bcb4f6fb4..63c9d42c47e7 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -29,6 +29,10 @@ RUN apt-get update -y \ USER $RAY_UID ENV HOME=/home/ray +# Todo (krfricke): Move to latest miniconda version once we stop building +# images for Python 3.7. 
+# https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${HOSTTYPE}.sh + RUN sudo apt-get update -y && sudo apt-get upgrade -y \ && sudo apt-get install -y \ git \ @@ -41,10 +45,11 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ tmux \ screen \ rsync \ + netbase \ openssh-client \ gnupg; fi) \ && wget \ - --quiet "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${HOSTTYPE}.sh" \ + --quiet "https://repo.anaconda.com/miniconda/Miniconda3-py37_23.1.0-1-Linux-${HOSTTYPE}.sh" \ -O /tmp/miniconda.sh \ && /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \ && $HOME/anaconda3/bin/conda init \ diff --git a/docker/examples/Dockerfile b/docker/examples/Dockerfile index d97d40855c48..df7bc8c6d820 100644 --- a/docker/examples/Dockerfile +++ b/docker/examples/Dockerfile @@ -29,7 +29,6 @@ RUN pip install --no-cache-dir -U pip \ tensorboardX \ dragonfly-opt \ zoopt \ - tabulate \ mlflow \ pytest-remotedata>=0.3.1 \ matplotlib \ diff --git a/docker/ray-deps/Dockerfile b/docker/ray-deps/Dockerfile index f9220a09b9f4..81ddd1cb505d 100644 --- a/docker/ray-deps/Dockerfile +++ b/docker/ray-deps/Dockerfile @@ -14,7 +14,9 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install --find-links $FIND_LINKS_PATH $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ "redis>=3.5.0,<4.0.0" \ "six==1.13.0" \ - "boto3==1.4.8" \ + "boto3==1.26.82" \ + "pyOpenSSL==22.1.0" \ + "cryptography==38.0.1" \ "google-api-python-client==1.7.8" \ "google-oauth" \ "kubernetes" \ diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 18454405bd52..72949a1a2374 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -4,6 +4,7 @@ ARG PYTHON_MINOR_VERSION=7 # We have to uninstall wrapt this way for Tensorflow compatibility COPY requirements.txt ./ +COPY requirements_no_deps.txt ./ COPY requirements_dl.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ @@ -19,11 +20,15 @@ RUN sudo apt-get update \ libgl1-mesa-dev \ unzip \ unrar \ - && 
$HOME/anaconda3/bin/pip --no-cache-dir install -U pip \ - # First, install requirements + && $HOME/anaconda3/bin/pip --no-cache-dir install -U pip pip-tools \ + # Install no-deps requirements. Their dependencies may be overwritten + # in subsequent steps + && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ + -r requirements_no_deps.txt \ + # Then, install requirements && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ -r requirements.txt \ - # Then, keep requirements bounds as constraints and install remaining test dependencies + # Install other requirements. Keep pinned requirements bounds as constraints && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ -c requirements.txt \ -r requirements_rllib.txt \ @@ -32,6 +37,7 @@ RUN sudo apt-get update \ -r requirements_upstream.txt \ # explicitly install (overwrite) pytorch with CUDA support && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ + -c requirements.txt \ -r requirements_ml_docker.txt \ # Remove dataclasses & typing because they are included in Python > 3.6 && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \ @@ -40,5 +46,7 @@ RUN sudo apt-get update \ requirements*.txt \ && sudo apt-get clean +RUN $HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt + # Make sure tfp is installed correctly and matches tf version. RUN python -c "import tensorflow_probability" diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index ad6253b0da89..40606488dbc4 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -9,3 +9,5 @@ COPY $WHEEL_PATH . 
COPY $FIND_LINKS_PATH $FIND_LINKS_PATH RUN $HOME/anaconda3/bin/pip --no-cache-dir install `basename $WHEEL_PATH`[all] \ --find-links $FIND_LINKS_PATH && sudo rm `basename $WHEEL_PATH` + +RUN $HOME/anaconda3/bin/pip freeze > /home/ray/pip-freeze.txt diff --git a/java/serve/src/main/java/io/ray/serve/api/ServeControllerClient.java b/java/serve/src/main/java/io/ray/serve/api/ServeControllerClient.java index 39c366f1ba66..7dda3ef977ec 100644 --- a/java/serve/src/main/java/io/ray/serve/api/ServeControllerClient.java +++ b/java/serve/src/main/java/io/ray/serve/api/ServeControllerClient.java @@ -193,14 +193,14 @@ private void waitForDeploymentHealthy(String name, Long timeoutS) { "Waiting for deployment {} to be HEALTHY, but deployment doesn't exist.", name)); } - if (status.getStatus() == DeploymentStatus.HEALTHY) { + if (status.getStatus() == DeploymentStatus.DEPLOYMENT_STATUS_HEALTHY) { isTimeout = false; break; - } else if (status.getStatus() == DeploymentStatus.UNHEALTHY) { + } else if (status.getStatus() == DeploymentStatus.DEPLOYMENT_STATUS_UNHEALTHY) { throw new RayServeException( LogUtil.format("Deployment {} is UNHEALTHY: {}", name, status.getMessage())); } else { - Preconditions.checkState(status.getStatus() == DeploymentStatus.UPDATING); + Preconditions.checkState(status.getStatus() == DeploymentStatus.DEPLOYMENT_STATUS_UPDATING); } LOGGER.debug("Waiting for {} to be healthy, current status: {}.", name, status.getStatus()); diff --git a/java/serve/src/main/java/io/ray/serve/config/DeploymentConfig.java b/java/serve/src/main/java/io/ray/serve/config/DeploymentConfig.java index 588356e7a121..a2f996e2677c 100644 --- a/java/serve/src/main/java/io/ray/serve/config/DeploymentConfig.java +++ b/java/serve/src/main/java/io/ray/serve/config/DeploymentConfig.java @@ -208,6 +208,26 @@ public byte[] toProtoBytes() { return builder.build().toByteArray(); } + public io.ray.serve.generated.DeploymentConfig toProto() { + io.ray.serve.generated.DeploymentConfig.Builder builder 
= + io.ray.serve.generated.DeploymentConfig.newBuilder() + .setNumReplicas(numReplicas) + .setMaxConcurrentQueries(maxConcurrentQueries) + .setGracefulShutdownWaitLoopS(gracefulShutdownWaitLoopS) + .setGracefulShutdownTimeoutS(gracefulShutdownTimeoutS) + .setHealthCheckPeriodS(healthCheckPeriodS) + .setHealthCheckTimeoutS(healthCheckTimeoutS) + .setIsCrossLanguage(isCrossLanguage) + .setDeploymentLanguage(deploymentLanguage); + if (null != userConfig) { + builder.setUserConfig(ByteString.copyFrom(MessagePackSerializer.encode(userConfig).getKey())); + } + if (null != autoscalingConfig) { + builder.setAutoscalingConfig(autoscalingConfig.toProto()); + } + return builder.build(); + } + public static DeploymentConfig fromProto(io.ray.serve.generated.DeploymentConfig proto) { DeploymentConfig deploymentConfig = new DeploymentConfig(); diff --git a/java/serve/src/main/java/io/ray/serve/deployment/DeploymentVersion.java b/java/serve/src/main/java/io/ray/serve/deployment/DeploymentVersion.java index 537edcb79279..651201063a9b 100644 --- a/java/serve/src/main/java/io/ray/serve/deployment/DeploymentVersion.java +++ b/java/serve/src/main/java/io/ray/serve/deployment/DeploymentVersion.java @@ -1,39 +1,52 @@ package io.ray.serve.deployment; -import com.google.protobuf.ByteString; +import com.google.gson.Gson; import com.google.protobuf.InvalidProtocolBufferException; -import io.ray.runtime.serializer.MessagePackSerializer; +import io.ray.serve.config.DeploymentConfig; import io.ray.serve.exception.RayServeException; import java.io.Serializable; +import java.util.Map; import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringUtils; public class DeploymentVersion implements Serializable { + private static Gson gson = new Gson(); + private static final long serialVersionUID = 3400261981775851058L; private String codeVersion; private Object userConfig; + private DeploymentConfig deploymentConfig; + + private Map rayActorOptions; + private boolean 
unversioned; public DeploymentVersion() { - this(null, null); + this(null, new DeploymentConfig(), null); } public DeploymentVersion(String codeVersion) { - this(codeVersion, null); + this(codeVersion, new DeploymentConfig(), null); } - public DeploymentVersion(String codeVersion, Object userConfig) { + public DeploymentVersion( + String codeVersion, DeploymentConfig deploymentConfig, Map rayActorOptions) { if (StringUtils.isBlank(codeVersion)) { this.unversioned = true; this.codeVersion = RandomStringUtils.randomAlphabetic(6); } else { this.codeVersion = codeVersion; } - this.userConfig = userConfig; + if (deploymentConfig == null) { + deploymentConfig = new DeploymentConfig(); + } + this.deploymentConfig = deploymentConfig; + this.rayActorOptions = rayActorOptions; + this.userConfig = deploymentConfig.getUserConfig(); } public String getCodeVersion() { @@ -44,6 +57,14 @@ public Object getUserConfig() { return userConfig; } + public DeploymentConfig getDeploymentConfig() { + return deploymentConfig; + } + + public Map getRayActorOptions() { + return rayActorOptions; + } + public boolean isUnversioned() { return unversioned; } @@ -64,12 +85,8 @@ public static DeploymentVersion fromProtoBytes(byte[] bytes) { } return new DeploymentVersion( proto.getCodeVersion(), - proto.getUserConfig() != null && proto.getUserConfig().size() != 0 - ? 
new Object[] { - MessagePackSerializer.decode( - proto.getUserConfig().toByteArray(), Object.class) // TODO-xlang - } - : null); + DeploymentConfig.fromProto(proto.getDeploymentConfig()), + gson.fromJson(proto.getRayActorOptions(), Map.class)); } public byte[] toProtoBytes() { @@ -79,9 +96,9 @@ public byte[] toProtoBytes() { if (StringUtils.isNotBlank(codeVersion)) { proto.setCodeVersion(codeVersion); } - if (userConfig != null) { - proto.setUserConfig( - ByteString.copyFrom(MessagePackSerializer.encode(userConfig).getLeft())); // TODO-xlang + proto.setDeploymentConfig(deploymentConfig.toProto()); + if (rayActorOptions != null && !rayActorOptions.isEmpty()) { + proto.setRayActorOptions(gson.toJson(rayActorOptions)); } return proto.build().toByteArray(); } diff --git a/java/serve/src/main/java/io/ray/serve/replica/RayServeReplica.java b/java/serve/src/main/java/io/ray/serve/replica/RayServeReplica.java index 511996fb7630..65d2e2edcd79 100644 --- a/java/serve/src/main/java/io/ray/serve/replica/RayServeReplica.java +++ b/java/serve/src/main/java/io/ray/serve/replica/RayServeReplica.java @@ -1,13 +1,11 @@ package io.ray.serve.replica; -import io.ray.serve.deployment.DeploymentVersion; - public interface RayServeReplica { Object handleRequest(Object requestMetadata, Object requestArgs); - default Object reconfigure(Object userConfig) { - return new DeploymentVersion(null, userConfig); + default Object reconfigure(byte[] deploymentConfigBytes) { + return null; } default boolean checkHealth() { diff --git a/java/serve/src/main/java/io/ray/serve/replica/RayServeReplicaImpl.java b/java/serve/src/main/java/io/ray/serve/replica/RayServeReplicaImpl.java index b7989eac0cba..e605834b4e96 100644 --- a/java/serve/src/main/java/io/ray/serve/replica/RayServeReplicaImpl.java +++ b/java/serve/src/main/java/io/ray/serve/replica/RayServeReplicaImpl.java @@ -279,9 +279,11 @@ public synchronized boolean prepareForShutdown() { } @Override - public DeploymentVersion reconfigure(Object 
userConfig) { + public DeploymentVersion reconfigure(byte[] deploymentConfigBytes) { + config = DeploymentConfig.fromProtoBytes(deploymentConfigBytes); + Object userConfig = config.getUserConfig(); DeploymentVersion deploymentVersion = - new DeploymentVersion(version.getCodeVersion(), userConfig); + new DeploymentVersion(version.getCodeVersion(), config, version.getRayActorOptions()); version = deploymentVersion; if (userConfig == null) { return deploymentVersion; diff --git a/java/serve/src/main/java/io/ray/serve/replica/RayServeWrappedReplica.java b/java/serve/src/main/java/io/ray/serve/replica/RayServeWrappedReplica.java index dd59858061ed..12d680aebc80 100644 --- a/java/serve/src/main/java/io/ray/serve/replica/RayServeWrappedReplica.java +++ b/java/serve/src/main/java/io/ray/serve/replica/RayServeWrappedReplica.java @@ -189,8 +189,8 @@ public boolean isAllocated() { * * @return */ - public Object isInitialized(Object userConfig) { - Object deploymentVersion = reconfigure(userConfig); + public Object isInitialized(byte[] deploymentConfigBytes) { + Object deploymentVersion = reconfigure(deploymentConfigBytes); checkHealth(); return deploymentVersion; } @@ -213,13 +213,8 @@ public boolean prepareForShutdown() { * DeploymentVersion is serialized to protobuf byte[]. */ @Override - public Object reconfigure(Object userConfig) { - boolean isCrossLanguage = userConfig instanceof byte[]; - DeploymentVersion deploymentVersion = - replica.reconfigure( - isCrossLanguage && userConfig != null - ? 
MessagePackSerializer.decode((byte[]) userConfig, Object.class) - : userConfig); + public Object reconfigure(byte[] deploymentConfigBytes) { + DeploymentVersion deploymentVersion = replica.reconfigure(deploymentConfigBytes); return deploymentVersion.toProtoBytes(); } diff --git a/java/serve/src/test/java/io/ray/serve/replica/RayServeReplicaTest.java b/java/serve/src/test/java/io/ray/serve/replica/RayServeReplicaTest.java index f9230de84a65..8121aa5d5b35 100644 --- a/java/serve/src/test/java/io/ray/serve/replica/RayServeReplicaTest.java +++ b/java/serve/src/test/java/io/ray/serve/replica/RayServeReplicaTest.java @@ -70,11 +70,17 @@ public void test() throws IOException { // reconfigure ObjectRef versionRef = - replicHandle.task(RayServeWrappedReplica::reconfigure, (Object) null).remote(); + replicHandle + .task(RayServeWrappedReplica::reconfigure, (new DeploymentConfig()).toProtoBytes()) + .remote(); Assert.assertEquals( DeploymentVersion.fromProtoBytes((byte[]) (versionRef.get())).getCodeVersion(), version); - replicHandle.task(RayServeWrappedReplica::reconfigure, new Object()).remote().get(); + deploymentConfig = deploymentConfig.setUserConfig(new Object()); + replicHandle + .task(RayServeWrappedReplica::reconfigure, deploymentConfig.toProtoBytes()) + .remote() + .get(); resultRef = replicHandle .task( @@ -84,8 +90,9 @@ public void test() throws IOException { .remote(); Assert.assertEquals((String) resultRef.get(), "1"); + deploymentConfig = deploymentConfig.setUserConfig(ImmutableMap.of("value", "100")); replicHandle - .task(RayServeWrappedReplica::reconfigure, ImmutableMap.of("value", "100")) + .task(RayServeWrappedReplica::reconfigure, deploymentConfig.toProtoBytes()) .remote() .get(); resultRef = diff --git a/python/build-wheel-manylinux2014.sh b/python/build-wheel-manylinux2014.sh index df488e0d47f7..9e843fd2c406 100755 --- a/python/build-wheel-manylinux2014.sh +++ b/python/build-wheel-manylinux2014.sh @@ -12,33 +12,30 @@ EOF chmod +x /usr/bin/nproc 
NODE_VERSION="14" -PYTHONS=("cp37-cp37m" - "cp38-cp38" - "cp39-cp39" - "cp310-cp310" - "cp311-cp311") - -NUMPY_VERSIONS=("1.14.5" - "1.14.5" - "1.19.3" - "1.22.0" - "1.22.0") + +# Python version key, interpreter version code, numpy tuples. +PYTHON_NUMPYS=( + "3.7 cp37-cp37m 1.14.5" + "3.8 cp38-cp38 1.14.5" + "3.9 cp39-cp39 1.19.3" + "3.10 cp310-cp310 1.22.0" + "3.11 cp311-cp311 1.22.0" +) yum -y install unzip zip sudo yum -y install java-1.8.0-openjdk java-1.8.0-openjdk-devel xz yum -y install openssl -if [ "${HOSTTYPE-}" = "x86_64" ]; then +if [[ "${HOSTTYPE-}" == "x86_64" ]]; then yum install "libasan-4.8.5-44.el7.${HOSTTYPE}" -y yum install "libubsan-7.3.1-5.10.el7.${HOSTTYPE}" -y yum install "devtoolset-8-libasan-devel.${HOSTTYPE}" -y fi java -version -java_bin=$(readlink -f "$(command -v java)") -echo "java_bin path $java_bin" -java_home=${java_bin%jre/bin/java} -export JAVA_HOME="$java_home" +JAVA_BIN="$(readlink -f "$(command -v java)")" +echo "java_bin path ${JAVA_BIN}" +export JAVA_HOME="${JAVA_BIN%jre/bin/java}" /ray/ci/env/install-bazel.sh # Put bazel into the PATH if building Bazel from source @@ -63,10 +60,11 @@ nvm use "$NODE_VERSION" # Build the dashboard so its static assets can be included in the wheel. # TODO(mfitton): switch this back when deleting old dashboard code. 
-pushd python/ray/dashboard/client +( + cd python/ray/dashboard/client npm ci npm run build -popd +) set -x # Add the repo folder to the safe.dictory global variable to avoid the failure @@ -75,9 +73,16 @@ set -x git config --global --add safe.directory /ray mkdir -p .whl -for ((i=0; i<${#PYTHONS[@]}; ++i)); do - PYTHON=${PYTHONS[i]} - NUMPY_VERSION=${NUMPY_VERSIONS[i]} +for PYTHON_NUMPY in "${PYTHON_NUMPYS[@]}" ; do + PYTHON_VERSION_KEY="$(echo "${PYTHON_NUMPY}" | cut -d' ' -f1)" + if [[ "${BUILD_ONE_PYTHON_ONLY:-}" != "" && "${PYTHON_VERSION_KEY}" != "${BUILD_ONE_PYTHON_ONLY}" ]]; then + continue + fi + + PYTHON="$(echo "${PYTHON_NUMPY}" | cut -d' ' -f2)" + NUMPY_VERSION="$(echo "${PYTHON_NUMPY}" | cut -d' ' -f3)" + + echo "---- Build wheel for ${PYTHON}, numpy=${NUMPY_VERSION}" # The -f flag is passed twice to also run git clean in the arrow subdirectory. # The -d flag removes directories. The -x flag ignores the .gitignore file, @@ -85,12 +90,13 @@ for ((i=0; i<${#PYTHONS[@]}; ++i)); do # dashboard directory and jars directory. git clean -f -f -x -d -e .whl -e python/ray/dashboard/client -e dashboard/client -e python/ray/jars - pushd python + ( + cd python # Fix the numpy version because this will be the oldest numpy version we can # support. /opt/python/"${PYTHON}"/bin/pip install -q numpy=="${NUMPY_VERSION}" cython==0.29.32 # Set the commit SHA in __init__.py. - if [ -n "$TRAVIS_COMMIT" ]; then + if [[ -n "$TRAVIS_COMMIT" ]]; then sed -i.bak "s/{{RAY_COMMIT_SHA}}/$TRAVIS_COMMIT/g" ray/__init__.py && rm ray/__init__.py.bak else echo "TRAVIS_COMMIT variable not set - required to populated ray.__commit__." 
@@ -98,22 +104,24 @@ for ((i=0; i<${#PYTHONS[@]}; ++i)); do fi # build ray wheel - PATH=/opt/python/${PYTHON}/bin:/root/bazel-3.2.0/output:$PATH \ - /opt/python/"${PYTHON}"/bin/python setup.py bdist_wheel + PATH="/opt/python/${PYTHON}/bin:/root/bazel-3.2.0/output:$PATH" \ + "/opt/python/${PYTHON}/bin/python" setup.py bdist_wheel + # build ray-cpp wheel - PATH=/opt/python/${PYTHON}/bin:/root/bazel-3.2.0/output:$PATH \ - RAY_INSTALL_CPP=1 /opt/python/"${PYTHON}"/bin/python setup.py bdist_wheel + PATH="/opt/python/${PYTHON}/bin:/root/bazel-3.2.0/output:$PATH" \ + RAY_INSTALL_CPP=1 "/opt/python/${PYTHON}/bin/python" setup.py bdist_wheel + # In the future, run auditwheel here. mv dist/*.whl ../.whl/ - popd + ) done # Rename the wheels so that they can be uploaded to PyPI. TODO(rkn): This is a # hack, we should use auditwheel instead. for path in .whl/*.whl; do - if [ -f "${path}" ]; then + if [[ -f "${path}" ]]; then out="${path//-linux/-manylinux2014}" - if [ "$out" != "$path" ]; then + if [[ "$out" != "$path" ]]; then mv "${path}" "${out}" fi fi diff --git a/python/ray/__init__.py b/python/ray/__init__.py index 126b48d14d59..08fafe10cd5b 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -192,11 +192,10 @@ def __getattr__(self, attr): state = _DeprecationWrapper("state", ray._private.state) -__all__ = [ +RAY_APIS = { "__version__", "_config", "get_runtime_context", - "actor", "autoscaler", "available_resources", "cancel", @@ -207,7 +206,6 @@ def __getattr__(self, attr): "get_actor", "get_gpu_ids", "init", - "internal", "is_initialized", "java_actor_class", "java_function", @@ -221,19 +219,68 @@ def __getattr__(self, attr): "shutdown", "show_in_dashboard", "timeline", - "util", "wait", - "widgets", "LOCAL_MODE", "SCRIPT_MODE", "WORKER_MODE", -] +} + +# Public APIs that should automatically trigger ray.init(). 
+AUTO_INIT_APIS = { + "cancel", + "get", + "get_actor", + "get_gpu_ids", + "kill", + "put", + "wait", +} + +# Public APIs that should not automatically trigger ray.init(). +NON_AUTO_INIT_APIS = { + "ClientBuilder", + "LOCAL_MODE", + "Language", + "SCRIPT_MODE", + "WORKER_MODE", + "__version__", + "_config", + "autoscaler", + "available_resources", + "client", + "cluster_resources", + "cpp_function", + "get_runtime_context", + "init", + "is_initialized", + "java_actor_class", + "java_function", + "method", + "nodes", + "remote", + "show_in_dashboard", + "shutdown", + "timeline", +} + +assert RAY_APIS == AUTO_INIT_APIS | NON_AUTO_INIT_APIS +from ray._private.auto_init_hook import wrap_auto_init_for_all_apis # noqa: E402 + +wrap_auto_init_for_all_apis(AUTO_INIT_APIS) +del wrap_auto_init_for_all_apis + + +__all__ = list(RAY_APIS) # Subpackages __all__ += [ + "actor", + "autoscaler", "data", + "internal", + "util", + "widgets", "workflow", - "autoscaler", ] # ID types diff --git a/python/ray/_private/auto_init_hook.py b/python/ray/_private/auto_init_hook.py new file mode 100644 index 000000000000..0c4184f3873e --- /dev/null +++ b/python/ray/_private/auto_init_hook.py @@ -0,0 +1,28 @@ +import ray +import os +from functools import wraps + + +def auto_init_ray(): + if ( + os.environ.get("RAY_ENABLE_AUTO_CONNECT", "") != "0" + and not ray.is_initialized() + ): + ray.init() + + +def wrap_auto_init(fn): + @wraps(fn) + def auto_init_wrapper(*args, **kwargs): + auto_init_ray() + return fn(*args, **kwargs) + + return auto_init_wrapper + + +def wrap_auto_init_for_all_apis(api_names): + """Wrap public APIs with automatic ray.init.""" + for api_name in api_names: + api = getattr(ray, api_name, None) + assert api is not None, api_name + setattr(ray, api_name, wrap_auto_init(api)) diff --git a/python/ray/_private/client_mode_hook.py b/python/ray/_private/client_mode_hook.py index 2d58726ab169..052aa01b0b75 100644 --- a/python/ray/_private/client_mode_hook.py +++ 
b/python/ray/_private/client_mode_hook.py @@ -1,7 +1,8 @@ import os import threading from contextlib import contextmanager -from functools import partial, wraps +from functools import wraps +from ray._private.auto_init_hook import auto_init_ray # Attr set on func defs to mark they have been converted to client mode. RAY_CLIENT_MODE_ATTR = "__ray_client_mode_key__" @@ -77,27 +78,24 @@ def enable_client_mode(): _explicitly_disable_client_mode() -def client_mode_hook(func: callable = None, *, auto_init: bool): +def client_mode_hook(func: callable): """Decorator for whether to use the 'regular' ray version of a function, or the Ray Client version of that function. Args: func: This function. This is set when this function is used as a decorator. - auto_init: Whether `ray.init()` should be transparently called when - the wrapped function is called. This should be `True` for functions - that are *NOT* part of the initialization path (e.g. `init` or - `is_initialized`) or for functions that do not require Ray to be - initialized (e.g., KV operations, `shutdown`). """ - if func is None: - return partial(client_mode_hook, auto_init=auto_init) + + from ray.util.client import ray @wraps(func) def wrapper(*args, **kwargs): - from ray.util.client import ray - - if client_mode_should_convert(auto_init=auto_init): + # NOTE(hchen): DO NOT use "import" inside this function. + # Because when it's called within a `__del__` method, this error + # will be raised (see #35114): + # ImportError: sys.meta_path is None, Python is likely shutting down. + if client_mode_should_convert(): # Legacy code # we only convert init function if RAY_CLIENT_MODE=1 if func.__name__ != "init" or is_client_mode_enabled_by_default: @@ -107,21 +105,8 @@ def wrapper(*args, **kwargs): return wrapper -def client_mode_should_convert(*, auto_init: bool): - """Determines if functions should be converted to client mode & if - Ray should be auto-initialized. 
- - NOTE: `auto_init` must happen before we branch into regular ray or client - code because the initialization may result in either mode. - """ - if auto_init: - import ray - - if ( - os.environ.get("RAY_ENABLE_AUTO_CONNECT", "") != "0" - and not ray.is_initialized() - ): - ray.init() +def client_mode_should_convert(): + """Determines if functions should be converted to client mode.""" # `is_client_mode_enabled_by_default` is used for testing with # `RAY_CLIENT_MODE=1`. This flag means all tests run with client mode. @@ -146,9 +131,10 @@ def client_mode_wrap(func): def wrapper(*args, **kwargs): from ray.util.client import ray + auto_init_ray() # Directly pass this through since `client_mode_wrap` is for # Placement Group APIs - if client_mode_should_convert(auto_init=True): + if client_mode_should_convert(): f = ray.remote(num_cpus=0)(func) ref = f.remote(*args, **kwargs) return ray.get(ref) diff --git a/python/ray/_private/function_manager.py b/python/ray/_private/function_manager.py index 9dd9173cf265..419731ad8315 100644 --- a/python/ray/_private/function_manager.py +++ b/python/ray/_private/function_manager.py @@ -9,7 +9,7 @@ import time import traceback from collections import defaultdict, namedtuple -from typing import Optional +from typing import Optional, Callable import ray import ray._private.profiling as profiling @@ -27,11 +27,16 @@ format_error_message, ) from ray._private.serialization import pickle_dumps -from ray._raylet import JobID, PythonFunctionDescriptor +from ray._raylet import JobID, PythonFunctionDescriptor, WORKER_SETUP_HOOK_KEY_NAME_GCS FunctionExecutionInfo = namedtuple( "FunctionExecutionInfo", ["function", "function_name", "max_calls"] ) +ImportedFunctionInfo = namedtuple( + "ImportedFunctionInfo", + ["job_id", "function_id", "function_name", "function", "module", "max_calls"], +) + """FunctionExecutionInfo: A named tuple storing remote function information.""" logger = logging.getLogger(__name__) @@ -175,6 +180,53 @@ def 
export_key(self, key): # TODO(mwtian) implement per-job notification here. self._worker.gcs_publisher.publish_function_key(key) + def export_setup_func( + self, setup_func: Callable, timeout: Optional[int] = None + ) -> bytes: + """Export the setup hook function and return the key.""" + pickled_function = pickle_dumps( + setup_func, f"Cannot serialize the worker_setup_hook {setup_func.__name__}" + ) + + function_to_run_id = hashlib.shake_128(pickled_function).digest( + ray_constants.ID_SIZE + ) + key = make_function_table_key( + # This value should match with gcs_function_manager.h. + # Otherwise, it won't be GC'ed. + WORKER_SETUP_HOOK_KEY_NAME_GCS.encode(), + # b"FunctionsToRun", + self._worker.current_job_id.binary(), + function_to_run_id, + ) + + check_oversized_function( + pickled_function, setup_func.__name__, "function", self._worker + ) + + try: + self._worker.gcs_client.internal_kv_put( + key, + pickle.dumps( + { + "job_id": self._worker.current_job_id.binary(), + "function_id": function_to_run_id, + "function": pickled_function, + } + ), + # overwrite + True, + ray_constants.KV_NAMESPACE_FUNCTION_TABLE, + timeout=timeout, + ) + except Exception as e: + logger.exception( + "Failed to export the setup hook " f"{setup_func.__name__}." + ) + raise e + + return key + def export(self, remote_function): """Pickle a remote function and export it to redis. 
Args: @@ -224,21 +276,31 @@ def export(self, remote_function): key, val, True, KV_NAMESPACE_FUNCTION_TABLE ) - def fetch_and_register_remote_function(self, key): - """Import a remote function.""" - vals = self._worker.gcs_client.internal_kv_get(key, KV_NAMESPACE_FUNCTION_TABLE) + def fetch_registered_method( + self, key: str, timeout: Optional[int] = None + ) -> Optional[ImportedFunctionInfo]: + vals = self._worker.gcs_client.internal_kv_get( + key, KV_NAMESPACE_FUNCTION_TABLE, timeout=timeout + ) if vals is None: - return False + return None else: vals = pickle.loads(vals) - fields = [ - "job_id", - "function_id", - "function_name", - "function", - "module", - "max_calls", - ] + fields = [ + "job_id", + "function_id", + "function_name", + "function", + "module", + "max_calls", + ] + return ImportedFunctionInfo._make(vals.get(field) for field in fields) + + def fetch_and_register_remote_function(self, key): + """Import a remote function.""" + remote_function_info = self.fetch_registered_method(key) + if not remote_function_info: + return False ( job_id_str, function_id_str, @@ -246,7 +308,7 @@ def fetch_and_register_remote_function(self, key): serialized_function, module, max_calls, - ) = (vals.get(field) for field in fields) + ) = remote_function_info function_id = ray.FunctionID(function_id_str) job_id = ray.JobID(job_id_str) diff --git a/python/ray/_private/gcs_pubsub.py b/python/ray/_private/gcs_pubsub.py index def67746dd42..2168b9dfed9d 100644 --- a/python/ray/_private/gcs_pubsub.py +++ b/python/ray/_private/gcs_pubsub.py @@ -4,10 +4,8 @@ import random import threading from typing import Optional, Tuple, List -import time import grpc -from grpc._channel import _InactiveRpcError from ray._private.utils import get_or_create_event_loop try: @@ -75,6 +73,8 @@ def __init__(self, worker_id: bytes = None): # SubscriberID / UniqueID, which is 28 (kUniqueIDSize) random bytes. 
self._subscriber_id = bytes(bytearray(random.getrandbits(8) for _ in range(28))) self._last_batch_size = 0 + self._max_processed_sequence_id = 0 + self._publisher_id = b"" # Batch size of the result from last poll. Used to indicate whether the # subscriber can keep up. @@ -91,7 +91,9 @@ def _subscribe_request(self, channel): def _poll_request(self): return gcs_service_pb2.GcsSubscriberPollRequest( - subscriber_id=self._subscriber_id + subscriber_id=self._subscriber_id, + max_processed_sequence_id=self._max_processed_sequence_id, + publisher_id=self._publisher_id, ) def _unsubscribe_request(self, channels): @@ -156,49 +158,6 @@ def _pop_actors(queue, batch_size=100): return msgs -class GcsPublisher(_PublisherBase): - """Publisher to GCS.""" - - def __init__(self, address: str): - channel = gcs_utils.create_gcs_channel(address) - self._stub = gcs_service_pb2_grpc.InternalPubSubGcsServiceStub(channel) - - def publish_error( - self, key_id: bytes, error_info: ErrorTableData, num_retries=None - ) -> None: - """Publishes error info to GCS.""" - msg = pubsub_pb2.PubMessage( - channel_type=pubsub_pb2.RAY_ERROR_INFO_CHANNEL, - key_id=key_id, - error_info_message=error_info, - ) - req = gcs_service_pb2.GcsPublishRequest(pub_messages=[msg]) - self._gcs_publish(req, num_retries, timeout=1) - - def publish_logs(self, log_batch: dict) -> None: - """Publishes logs to GCS.""" - req = self._create_log_request(log_batch) - self._gcs_publish(req) - - def publish_function_key(self, key: bytes) -> None: - """Publishes function key to GCS.""" - req = self._create_function_key_request(key) - self._gcs_publish(req) - - def _gcs_publish(self, req, num_retries=None, timeout=None) -> None: - count = num_retries or MAX_GCS_PUBLISH_RETRIES - while count > 0: - try: - self._stub.GcsPublish(req, timeout=timeout) - return - except _InactiveRpcError: - pass - count -= 1 - if count > 0: - time.sleep(1) - raise TimeoutError(f"Failed to publish after retries: {req}") - - class 
_SyncSubscriber(_SubscriberBase): def __init__( self, @@ -272,7 +231,21 @@ def _poll_locked(self, timeout=None) -> None: if fut.done(): self._last_batch_size = len(fut.result().pub_messages) + if fut.result().publisher_id != self._publisher_id: + if self._publisher_id != "": + logger.debug( + f"replied publisher_id {fut.result().publisher_id} " + f"different from {self._publisher_id}, this should " + "only happens during gcs failover." + ) + self._publisher_id = fut.result().publisher_id + self._max_processed_sequence_id = 0 + for msg in fut.result().pub_messages: + if msg.sequence_id <= self._max_processed_sequence_id: + logger.warn(f"Ignoring out of order message {msg}") + continue + self._max_processed_sequence_id = msg.sequence_id if msg.channel_type != self._channel: logger.warn(f"Ignoring message from unsubscribed channel {msg}") continue @@ -538,7 +511,20 @@ async def _poll(self, timeout=None) -> None: break try: self._last_batch_size = len(poll.result().pub_messages) + if poll.result().publisher_id != self._publisher_id: + if self._publisher_id != "": + logger.debug( + f"replied publisher_id {poll.result().publisher_id}" + f"different from {self._publisher_id}, this should " + "only happens during gcs failover." 
+ ) + self._publisher_id = poll.result().publisher_id + self._max_processed_sequence_id = 0 for msg in poll.result().pub_messages: + if msg.sequence_id <= self._max_processed_sequence_id: + logger.warn(f"Ignoring out of order message {msg}") + continue + self._max_processed_sequence_id = msg.sequence_id self._queue.append(msg) except grpc.RpcError as e: if self._should_terminate_polling(e): diff --git a/python/ray/_private/log_monitor.py b/python/ray/_private/log_monitor.py index 7f06343625ae..76944d42ec03 100644 --- a/python/ray/_private/log_monitor.py +++ b/python/ray/_private/log_monitor.py @@ -11,11 +11,9 @@ import traceback from typing import Callable, List, Set -import ray._private.gcs_pubsub as gcs_pubsub import ray._private.ray_constants as ray_constants import ray._private.services as services import ray._private.utils -from ray._private.gcs_pubsub import GcsPublisher from ray._private.ray_logging import setup_component_logger # Logger for this module. It should be configured at the entry point @@ -135,7 +133,7 @@ class LogMonitor: def __init__( self, logs_dir, - gcs_publisher: gcs_pubsub.GcsPublisher, + gcs_publisher, is_proc_alive_fn: Callable[[int], bool], max_files_open: int = ray_constants.LOG_MONITOR_MAX_OPEN_FILES, ): @@ -368,6 +366,13 @@ def flush(): file_info.job_id = next_line.split( ray_constants.LOG_PREFIX_JOB_ID, 1 )[1] + elif next_line.startswith( + ray_constants.LOG_PREFIX_TASK_ATTEMPT_START + ) or next_line.startswith( + ray_constants.LOG_PREFIX_TASK_ATTEMPT_END + ): + # Ignore these magic tokens for task logs. + pass elif next_line.startswith( "Windows fatal exception: access violation" ): @@ -525,14 +530,14 @@ def is_proc_alive(pid): ) log_monitor = LogMonitor( - args.logs_dir, gcs_pubsub.GcsPublisher(address=args.gcs_address), is_proc_alive + args.logs_dir, ray._raylet.GcsPublisher(address=args.gcs_address), is_proc_alive ) try: log_monitor.run() except Exception as e: # Something went wrong, so push an error to all drivers. 
- gcs_publisher = GcsPublisher(address=args.gcs_address) + gcs_publisher = ray._raylet.GcsPublisher(address=args.gcs_address) traceback_str = ray._private.utils.format_error_message(traceback.format_exc()) message = ( f"The log monitor on node {platform.node()} " diff --git a/python/ray/_private/node.py b/python/ray/_private/node.py index ebcb3c6c1a79..ab1e044763f5 100644 --- a/python/ray/_private/node.py +++ b/python/ray/_private/node.py @@ -150,6 +150,7 @@ def __init__( self._config = ray_params._system_config or {} self._dashboard_agent_listen_port = ray_params.dashboard_agent_listen_port + self._dashboard_grpc_port = ray_params.dashboard_grpc_port # Configure log rotation parameters. self.max_bytes = int( @@ -239,9 +240,9 @@ def __init__( self.gcs_address, self._raylet_ip_address, ) - self._plasma_store_socket_name = node_info.object_store_socket_name - self._raylet_socket_name = node_info.raylet_socket_name - self._ray_params.node_manager_port = node_info.node_manager_port + self._plasma_store_socket_name = node_info["object_store_socket_name"] + self._raylet_socket_name = node_info["raylet_socket_name"] + self._ray_params.node_manager_port = node_info["node_manager_port"] else: # If the user specified a socket name, use it. self._plasma_store_socket_name = self._prepare_socket_file( @@ -304,7 +305,7 @@ def __init__( self._raylet_ip_address, ) if self._ray_params.node_manager_port == 0: - self._ray_params.node_manager_port = node_info.node_manager_port + self._ray_params.node_manager_port = node_info["node_manager_port"] # Makes sure the Node object has valid addresses after setup. self.validate_ip_port(self.address) @@ -485,7 +486,7 @@ def raylet_ip_address(self): @property def address(self): """Get the address for bootstrapping, e.g. the address to pass to - `ray start` or `ray.int()` to start worker nodes, that has been + `ray start` or `ray.init()` to start worker nodes, that has been converted to ip:port format. 
""" return self._gcs_address @@ -546,6 +547,11 @@ def dashboard_agent_listen_port(self): """Get the dashboard agent's listen port""" return self._dashboard_agent_listen_port + @property + def dashboard_grpc_port(self): + """Get the dashboard head grpc port""" + return self._dashboard_grpc_port + @property def logging_config(self): """Get the logging config of the current node.""" @@ -927,13 +933,15 @@ def start_api_server(self, *, include_dashboard: bool, raise_on_failure: bool): raise_on_failure, self._ray_params.dashboard_host, self.gcs_address, + self._node_ip_address, self._temp_dir, self._logs_dir, self._session_dir, + port=self._ray_params.dashboard_port, + dashboard_grpc_port=self._ray_params.dashboard_grpc_port, fate_share=self.kernel_fate_share, max_bytes=self.max_bytes, backup_count=self.backup_count, - port=self._ray_params.dashboard_port, redirect_logging=self.should_redirect_logs(), stdout_file=stderr_file, stderr_file=stderr_file, diff --git a/python/ray/_private/parameter.py b/python/ray/_private/parameter.py index e89b9f9216d0..9dbd7f68b864 100644 --- a/python/ray/_private/parameter.py +++ b/python/ray/_private/parameter.py @@ -87,6 +87,9 @@ class RayParams: dashboard_agent_listen_port: The port for dashboard agents to listen on for HTTP requests. Defaults to 52365. + dashboard_grpc_port: The port for the dashboard head process to listen + for gRPC on. + Defaults to random available port. plasma_store_socket_name: If provided, it will specify the socket name used by the plasma store. 
raylet_socket_name: If provided, it will specify the socket path @@ -159,6 +162,7 @@ def __init__( dashboard_agent_listen_port: Optional[ int ] = ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT, + dashboard_grpc_port: Optional[int] = None, plasma_store_socket_name: Optional[str] = None, raylet_socket_name: Optional[str] = None, temp_dir: Optional[str] = None, @@ -211,6 +215,7 @@ def __init__( self.dashboard_host = dashboard_host self.dashboard_port = dashboard_port self.dashboard_agent_listen_port = dashboard_agent_listen_port + self.dashboard_grpc_port = dashboard_grpc_port self.plasma_store_socket_name = plasma_store_socket_name self.raylet_socket_name = raylet_socket_name self.temp_dir = temp_dir @@ -298,6 +303,7 @@ def wrap_port(port): "dashboard": wrap_port(self.dashboard_port), "dashboard_agent_grpc": wrap_port(self.metrics_agent_port), "dashboard_agent_http": wrap_port(self.dashboard_agent_listen_port), + "dashboard_grpc": wrap_port(self.dashboard_grpc_port), "metrics_export": wrap_port(self.metrics_export_port), } redis_shard_ports = self.redis_shard_ports diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 769d68a2ec87..8acf81575ab2 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -284,6 +284,10 @@ def env_set_by_user(key): LOG_PREFIX_TASK_NAME = ":task_name:" # Job ids are recorded in the logs with this magic token as a prefix. LOG_PREFIX_JOB_ID = ":job_id:" +# Task attempts magic token marked the beginning of the task logs +LOG_PREFIX_TASK_ATTEMPT_START = ":task_attempt_start:" +# Task attempts magic token marked the end of the task logs +LOG_PREFIX_TASK_ATTEMPT_END = ":task_attempt_end:" # The object metadata field uses the following format: It is a comma # separated list of fields.
The first field is mandatory and is the @@ -378,6 +382,7 @@ def env_set_by_user(key): LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"] NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" +RAY_WORKER_NICENESS = "RAY_worker_niceness" # Default max_retries option in @ray.remote for non-actor # tasks. @@ -425,3 +430,6 @@ def gcs_actor_scheduling_enabled(): } RAY_ENABLE_RECORD_TASK_LOGGING = env_bool("RAY_ENABLE_RECORD_TASK_LOGGING", False) + +WORKER_SETUP_HOOK_ENV_VAR = "__RAY_WORKER_SETUP_HOOK_ENV_VAR" +RAY_WORKER_SETUP_HOOK_LOAD_TIMEOUT_ENV_VAR = "RAY_WORKER_SETUP_HOOK_LOAD_TIMEOUT" diff --git a/python/ray/_private/ray_option_utils.py b/python/ray/_private/ray_option_utils.py index e51301cf2fd6..88703942f64e 100644 --- a/python/ray/_private/ray_option_utils.py +++ b/python/ray/_private/ray_option_utils.py @@ -3,7 +3,8 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, Optional, Tuple, Union -import ray._private.ray_constants as ray_constants +import ray +from ray._private import ray_constants from ray._private.utils import get_ray_doc_version from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import ( @@ -17,9 +18,9 @@ class Option: # Type constraint of an option. type_constraint: Optional[Union[type, Tuple[type]]] = None # Value constraint of an option. - value_constraint: Optional[Callable[[Any], bool]] = None - # Error message for value constraint. - error_message_for_value_constraint: Optional[str] = None + # The callable should return None if there is no error. + # Otherwise, return the error message. + value_constraint: Optional[Callable[[Any], Optional[str]]] = None # Default value. 
default_value: Any = None @@ -32,8 +33,9 @@ def validate(self, keyword: str, value: Any): f"but received type {type(value)}" ) if self.value_constraint is not None: - if not self.value_constraint(value): - raise ValueError(self.error_message_for_value_constraint) + possible_error_message = self.value_constraint(value) + if possible_error_message: + raise ValueError(possible_error_message) def _counting_option(name: str, infinite: bool = True, default_value: Any = None): @@ -47,29 +49,63 @@ def _counting_option(name: str, infinite: bool = True, default_value: Any = None if infinite: return Option( (int, type(None)), - lambda x: x is None or x >= -1, - f"The keyword '{name}' only accepts None, 0, -1 or a positive integer, " - "where -1 represents infinity.", + lambda x: None + if (x is None or x >= -1) + else f"The keyword '{name}' only accepts None, 0, -1" + " or a positive integer, where -1 represents infinity.", default_value=default_value, ) return Option( (int, type(None)), - lambda x: x is None or x >= 0, - f"The keyword '{name}' only accepts None, 0 or a positive integer.", + lambda x: None + if (x is None or x >= 0) + else f"The keyword '{name}' only accepts None, 0 or a positive integer.", default_value=default_value, ) +def _validate_resource_quantity(name, quantity): + if quantity < 0: + return f"The quantity of resource {name} cannot be negative" + if ( + isinstance(quantity, float) + and quantity != 0.0 + and int(quantity * ray._raylet.RESOURCE_UNIT_SCALING) == 0 + ): + return ( + f"The precision of the fractional quantity of resource {name}" + " cannot go beyond 0.0001" + ) + return None + + def _resource_option(name: str, default_value: Any = None): - """This is used for non-negative options, typically for defining resources.""" + """This is used for resource related options.""" return Option( (float, int, type(None)), - lambda x: x is None or x >= 0, - f"The keyword '{name}' only accepts None, 0 or a positive number", + lambda x: None if (x is None) 
else _validate_resource_quantity(name, x), default_value=default_value, ) +def _validate_resources(resources: Optional[Dict[str, float]]) -> Optional[str]: + if resources is None: + return None + + if "CPU" in resources or "GPU" in resources: + return ( + "Use the 'num_cpus' and 'num_gpus' keyword instead of 'CPU' and 'GPU' " + "in 'resources' keyword" + ) + + for name, quantity in resources.items(): + possible_error_message = _validate_resource_quantity(name, quantity) + if possible_error_message: + return possible_error_message + + return None + + _common_options = { "accelerator_type": Option((str, type(None))), "memory": _resource_option("memory"), @@ -85,12 +121,7 @@ def _resource_option(name: str, default_value: Any = None): ), "placement_group_bundle_index": Option(int, default_value=-1), "placement_group_capture_child_tasks": Option((bool, type(None))), - "resources": Option( - (dict, type(None)), - lambda x: x is None or ("CPU" not in x and "GPU" not in x), - "Use the 'num_cpus' and 'num_gpus' keyword instead of 'CPU' and 'GPU' " - "in 'resources' keyword", - ), + "resources": Option((dict, type(None)), lambda x: _validate_resources(x)), "runtime_env": Option((dict, type(None))), "scheduling_strategy": Option( ( @@ -122,26 +153,29 @@ def issubclass_safe(obj: Any, cls_: type) -> bool: "num_cpus": _resource_option("num_cpus", default_value=1), "num_returns": Option( (int, str, type(None)), - lambda x: x is None or x == "dynamic" or x >= 0, - "The keyword 'num_returns' only accepts None, a non-negative integer, or " + lambda x: None + if (x is None or x == "dynamic" or x >= 0) + else "The keyword 'num_returns' only accepts None, a non-negative integer, or " '"dynamic" (for generators)', default_value=1, ), "object_store_memory": Option( # override "_common_options" (int, type(None)), - lambda x: x is None, - "Setting 'object_store_memory' is not implemented for tasks", + lambda x: None + if (x is None) + else "Setting 'object_store_memory' is not implemented 
for tasks", ), "retry_exceptions": Option( (bool, list, tuple), - lambda x: ( + lambda x: None + if ( isinstance(x, bool) or ( isinstance(x, (list, tuple)) and all(issubclass_safe(x_, Exception) for x_ in x) ) - ), - "retry_exceptions must be either a boolean or a list of exceptions", + ) + else "retry_exceptions must be either a boolean or a list of exceptions", default_value=False, ), } @@ -150,8 +184,9 @@ def issubclass_safe(obj: Any, cls_: type) -> bool: "concurrency_groups": Option((list, dict, type(None))), "lifetime": Option( (str, type(None)), - lambda x: x in (None, "detached", "non_detached"), - "actor `lifetime` argument must be one of 'detached', " + lambda x: None + if x in (None, "detached", "non_detached") + else "actor `lifetime` argument must be one of 'detached', " "'non_detached' and 'None'.", ), "max_concurrency": _counting_option("max_concurrency", False), diff --git a/python/ray/_private/runtime_env/context.py b/python/ray/_private/runtime_env/context.py index 582063ad0896..4c980e4e4ae4 100644 --- a/python/ray/_private/runtime_env/context.py +++ b/python/ray/_private/runtime_env/context.py @@ -8,6 +8,7 @@ from ray.util.annotations import DeveloperAPI from ray.core.generated.common_pb2 import Language from ray._private.services import get_ray_jars_dir +from ray._private.utils import update_envs logger = logging.getLogger(__name__) @@ -44,7 +45,7 @@ def deserialize(json_string): return RuntimeEnvContext(**json.loads(json_string)) def exec_worker(self, passthrough_args: List[str], language: Language): - os.environ.update(self.env_vars) + update_envs(self.env_vars) if language == Language.PYTHON and sys.platform == "win32": executable = self.py_executable diff --git a/python/ray/_private/runtime_env/plugin.py b/python/ray/_private/runtime_env/plugin.py index 21b5fa8c49a2..b36d59858c0a 100644 --- a/python/ray/_private/runtime_env/plugin.py +++ b/python/ray/_private/runtime_env/plugin.py @@ -31,6 +31,8 @@ class RuntimeEnvPlugin(ABC): def 
validate(runtime_env_dict: dict) -> None: """Validate user entry for this plugin. + The method is invoked upon installation of runtime env. + Args: runtime_env_dict: the user-supplied runtime environment dict. diff --git a/python/ray/_private/runtime_env/setup_hook.py b/python/ray/_private/runtime_env/setup_hook.py new file mode 100644 index 000000000000..135252dd4611 --- /dev/null +++ b/python/ray/_private/runtime_env/setup_hook.py @@ -0,0 +1,131 @@ +import traceback +import logging +import base64 +import os + +from typing import Dict, Any, Callable, Union, Optional + +import ray +import ray._private.ray_constants as ray_constants +import ray.cloudpickle as pickle +from ray.runtime_env import RuntimeEnv + +logger = logging.getLogger(__name__) + + +def get_import_export_timeout(): + return int( + os.environ.get(ray_constants.RAY_WORKER_SETUP_HOOK_LOAD_TIMEOUT_ENV_VAR, "60") + ) + + +def _decode_function_key(key: bytes) -> str: + return base64.b64encode(key).decode() + + +def _encode_function_key(key: str) -> bytes: + return base64.b64decode(key) + + +def upload_worker_setup_hook_if_needed( + runtime_env: Union[Dict[str, Any], RuntimeEnv], + worker: "ray.Worker", +) -> Union[Dict[str, Any], RuntimeEnv]: + """Uploads the worker_setup_hook to GCS with a key. + + runtime_env["worker_setup_hook"] is converted to a decoded key + that can load the worker setup hook function from GCS. + I.e., you can use internalKV.Get(runtime_env["worker_setup_hook"]) + to access the worker setup hook from GCS. + + Args: + runtime_env: The runtime_env. The value will be modified + when returned. + worker: ray.worker instance. + decoder: GCS requires the function key to be bytes. However, + we cannot json serialize (which is required to serialize + runtime env) the bytes. So the key should be decoded to + a string. The given decoder is used to decode the function + key.
+ """ + setup_func = runtime_env.get("worker_setup_hook") + if setup_func is None: + return runtime_env + + if not isinstance(setup_func, Callable): + raise TypeError( + "worker_setup_hook must be a function, " f"got {type(setup_func)}." + ) + # TODO(sang): Support modules. + + try: + key = worker.function_actor_manager.export_setup_func( + setup_func, timeout=get_import_export_timeout() + ) + except Exception as e: + raise ray.exceptions.RuntimeEnvSetupError( + "Failed to export the setup function." + ) from e + env_vars = runtime_env.get("env_vars", {}) + assert ray_constants.WORKER_SETUP_HOOK_ENV_VAR not in env_vars, ( + f"The env var, {ray_constants.WORKER_SETUP_HOOK_ENV_VAR}, " + "is not permitted because it is reserved for the internal use." + ) + env_vars[ray_constants.WORKER_SETUP_HOOK_ENV_VAR] = _decode_function_key(key) + runtime_env["env_vars"] = env_vars + # Note: This field is no-op. We don't have a plugin for the setup hook + # because we can implement it simply using an env var. + # This field is just for the observability purpose, so we store + # the name of the method. + runtime_env["worker_setup_hook"] = setup_func.__name__ + return runtime_env + + +def load_and_execute_setup_hook( + worker_setup_hook_key: str, +) -> Optional[str]: + """Load the setup hook from a given key and execute. + + Args: + worker_setup_hook_key: The key to import the setup hook + from GCS. + Returns: + An error message if it fails. None if it succeeds. 
+ """ + assert worker_setup_hook_key is not None + worker = ray._private.worker.global_worker + assert worker.connected + + func_manager = worker.function_actor_manager + try: + worker_setup_func_info = func_manager.fetch_registered_method( + _encode_function_key(worker_setup_hook_key), + timeout=get_import_export_timeout(), + ) + except Exception: + error_message = ( + "Failed to import setup hook within " + f"{get_import_export_timeout()} seconds.\n" + f"{traceback.format_exc()}" + ) + return error_message + + try: + setup_func = pickle.loads(worker_setup_func_info.function) + except Exception: + error_message = ( + "Failed to deserialize the setup hook method.\n" f"{traceback.format_exc()}" + ) + return error_message + + try: + setup_func() + except Exception: + error_message = ( + f"Failed to execute the setup hook method. Function name:" + f"{worker_setup_func_info.function_name}\n" + f"{traceback.format_exc()}" + ) + return error_message + + return None diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 558762681f90..42cf2c3837ca 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1038,10 +1038,12 @@ def start_api_server( raise_on_failure: bool, host: str, gcs_address: str, + node_ip_address: str, temp_dir: str, logdir: str, session_dir: str, port: Optional[int] = None, + dashboard_grpc_port: Optional[int] = None, fate_share: Optional[bool] = None, max_bytes: int = 0, backup_count: int = 0, @@ -1060,6 +1062,7 @@ def start_api_server( a warning if we fail to start the API server. host: The host to bind the dashboard web server to. gcs_address: The gcs address the dashboard should connect to + node_ip_address: The IP address where this is running. temp_dir: The temporary directory used for log files and information for this Ray session. session_dir: The session directory under temp_dir. @@ -1067,6 +1070,8 @@ def start_api_server( logdir: The log directory used to generate dashboard log. 
port: The port to bind the dashboard web server to. Defaults to 8265. + dashboard_grpc_port: The port which the dashboard listens for + gRPC on. Defaults to a random, available port. max_bytes: Log rotation parameter. Corresponding to RotatingFileHandler's maxBytes. backup_count: Log rotation parameter. Corresponding to @@ -1079,7 +1084,9 @@ def start_api_server( no redirection should happen, then this should be None. Returns: - ProcessInfo for the process that was started. + A tuple of : + - Dashboard URL if dashboard enabled and started. + - ProcessInfo for the process that was started. """ try: # Make sure port is available. @@ -1132,6 +1139,7 @@ def start_api_server( f"--logging-rotate-bytes={max_bytes}", f"--logging-rotate-backup-count={backup_count}", f"--gcs-address={gcs_address}", + f"--node-ip-address={node_ip_address}", ] if not redirect_logging: @@ -1158,6 +1166,9 @@ def start_api_server( command.append("--modules-to-load=UsageStatsHead") command.append("--disable-frontend") + if dashboard_grpc_port is not None: + command.append(f"--grpc-port={dashboard_grpc_port}") + process_info = start_ray_process( command, ray_constants.PROCESS_TYPE_DASHBOARD, diff --git a/python/ray/_private/state.py b/python/ray/_private/state.py index 12ad9ea8d217..bf4bb4097d4f 100644 --- a/python/ray/_private/state.py +++ b/python/ray/_private/state.py @@ -147,32 +147,7 @@ def node_table(self): """ self._check_connected() - node_table = self.global_state_accessor.get_node_table() - - results = [] - for node_info_item in node_table: - item = gcs_utils.GcsNodeInfo.FromString(node_info_item) - node_info = { - "NodeID": ray._private.utils.binary_to_hex(item.node_id), - "Alive": item.state - == gcs_utils.GcsNodeInfo.GcsNodeState.Value("ALIVE"), - "NodeManagerAddress": item.node_manager_address, - "NodeManagerHostname": item.node_manager_hostname, - "NodeManagerPort": item.node_manager_port, - "ObjectManagerPort": item.object_manager_port, - "ObjectStoreSocketName": 
item.object_store_socket_name, - "RayletSocketName": item.raylet_socket_name, - "MetricsExportPort": item.metrics_export_port, - "NodeName": item.node_name, - } - node_info["alive"] = node_info["Alive"] - node_info["Resources"] = ( - {key: value for key, value in item.resources_total.items()} - if node_info["Alive"] - else {} - ) - results.append(node_info) - return results + return self.global_state_accessor.get_node_table() def job_table(self): """Fetch and parse the gcs job table. @@ -326,6 +301,8 @@ def get_state(state): return "PENDING" elif state == gcs_utils.PlacementGroupTableData.CREATED: return "CREATED" + elif state == gcs_utils.PlacementGroupTableData.RESCHEDULING: + return "RESCHEDULING" else: return "REMOVED" @@ -354,6 +331,10 @@ def get_strategy(strategy): bundle.bundle_id.bundle_index: MessageToDict(bundle)["unitResources"] for bundle in placement_group_info.bundles }, + "bundles_to_node_id": { + bundle.bundle_id.bundle_index: binary_to_hex(bundle.node_id) + for bundle in placement_group_info.bundles + }, "strategy": get_strategy(placement_group_info.strategy), "state": get_state(placement_group_info.state), "stats": { @@ -749,10 +730,9 @@ def get_system_config(self): def get_node_to_connect_for_driver(self, node_ip_address): """Get the node to connect for a Ray driver.""" self._check_connected() - node_info_str = self.global_state_accessor.get_node_to_connect_for_driver( + return self.global_state_accessor.get_node_to_connect_for_driver( node_ip_address ) - return gcs_utils.GcsNodeInfo.FromString(node_info_str) state = GlobalState() @@ -783,7 +763,7 @@ def next_job_id(): @DeveloperAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def nodes(): """Get a list of the nodes in the cluster (for debugging only). @@ -847,7 +827,7 @@ def actors(actor_id=None): @DeveloperAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def timeline(filename=None): """Return a list of profiling events that can viewed as a timeline. 
@@ -890,7 +870,7 @@ def object_transfer_timeline(filename=None): @DeveloperAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def cluster_resources(): """Get the current total cluster resources. @@ -905,7 +885,7 @@ def cluster_resources(): @DeveloperAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def available_resources(): """Get the current available cluster resources. diff --git a/python/ray/_private/state_api_test_utils.py b/python/ray/_private/state_api_test_utils.py index 090b025b8c84..9ce3d8a0e542 100644 --- a/python/ray/_private/state_api_test_utils.py +++ b/python/ray/_private/state_api_test_utils.py @@ -10,10 +10,10 @@ import time import traceback from typing import Callable, Dict, List, Optional, Tuple, Union -from ray.experimental.state.api import list_tasks +from ray.util.state import list_tasks import ray from ray.actor import ActorHandle -from ray.experimental.state.api import list_workers +from ray.util.state import list_workers @dataclass diff --git a/python/ray/_private/storage.py b/python/ray/_private/storage.py index ea5743a525f1..90d3e54d7e40 100644 --- a/python/ray/_private/storage.py +++ b/python/ray/_private/storage.py @@ -7,6 +7,7 @@ from ray._private.client_mode_hook import client_mode_hook from ray._private.utils import _add_creatable_buckets_param_if_s3_uri +from ray._private.auto_init_hook import wrap_auto_init if TYPE_CHECKING: import pyarrow.fs @@ -25,7 +26,8 @@ _filesystem = None -@client_mode_hook(auto_init=True) +@wrap_auto_init +@client_mode_hook def get_filesystem() -> ("pyarrow.fs.FileSystem", str): """Initialize and get the configured storage filesystem, if possible. @@ -51,7 +53,8 @@ def get_filesystem() -> ("pyarrow.fs.FileSystem", str): # TODO(suquark): There is no implementation of 'get_client' in client hook. -@client_mode_hook(auto_init=True) +@wrap_auto_init +@client_mode_hook def get_client(prefix: str) -> "KVClient": """Returns a KV-client (convenience wrapper around underlying filesystem). 
diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index f933844823ce..122787c6050d 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -51,6 +51,7 @@ gcs_service_pb2_grpc, ) from ray.util.queue import Empty, Queue, _QueueActor +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy logger = logging.getLogger(__name__) @@ -1457,11 +1458,12 @@ def _get_alive_nodes(self, nodes): alive_nodes += 1 return alive_nodes - head_node_ip = ray._private.worker.global_worker.node_ip_address - head_node_id = ray._private.worker.global_worker.current_node_id.hex() + head_node_id = ray.get_runtime_context().get_node_id() # Schedule the actor on the current node. node_killer = NodeKillerActor.options( - resources={f"node:{head_node_ip}": 0.001}, + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=head_node_id, soft=False + ), namespace=namespace, name="node_killer", lifetime=lifetime, diff --git a/python/ray/_private/thirdparty/tabulate/tabulate.py b/python/ray/_private/thirdparty/tabulate/tabulate.py index 7452ad1025fc..83b1090ffaf9 100644 --- a/python/ray/_private/thirdparty/tabulate/tabulate.py +++ b/python/ray/_private/thirdparty/tabulate/tabulate.py @@ -1,70 +1,35 @@ # -*- coding: utf-8 -*- -# Version 0.8.10, commit 4892c6e9a79638c7897ccea68b602040da9cc7a7 +# Version 0.9.0, commit bf58e37e6b35e3cc9a0bd740f752abfd32b6e6f8 """Pretty-print tabular data.""" -from __future__ import print_function -from __future__ import unicode_literals from collections import namedtuple -import sys +from collections.abc import Iterable, Sized +from html import escape as htmlescape +from itertools import chain, zip_longest as izip_longest +from functools import reduce, partial +import io import re import math import textwrap - - -if sys.version_info >= (3, 3): - from collections.abc import Iterable -else: - from collections import Iterable - -if sys.version_info[0] < 3: - from itertools import 
izip_longest - from functools import partial - - _none_type = type(None) - _bool_type = bool - _int_type = int - _long_type = long # noqa - _float_type = float - _text_type = unicode # noqa - _binary_type = str - - def _is_file(f): - return hasattr(f, "read") - -else: - from itertools import zip_longest as izip_longest - from functools import reduce, partial - - _none_type = type(None) - _bool_type = bool - _int_type = int - _long_type = int - _float_type = float - _text_type = str - _binary_type = bytes - basestring = str - - import io - - def _is_file(f): - return isinstance(f, io.IOBase) - +import dataclasses try: import wcwidth # optional wide-character (CJK) support except ImportError: wcwidth = None -try: - from html import escape as htmlescape -except ImportError: - from cgi import escape as htmlescape + +def _is_file(f): + return isinstance(f, io.IOBase) __all__ = ["tabulate", "tabulate_formats", "simple_separated_format"] -__version__ = "0.8.10" +try: + from .version import version as __version__ # noqa: F401 +except ImportError: + pass # running __init__.py as a script, AppVeyor pytests # minimum extra space in headers @@ -74,6 +39,7 @@ def _is_file(f): PRESERVE_WHITESPACE = False _DEFAULT_FLOATFMT = "g" +_DEFAULT_INTFMT = "" _DEFAULT_MISSINGVAL = "" # default align will be overwritten by "left", "center" or "decimal" # depending on the formatter @@ -83,6 +49,9 @@ def _is_file(f): # if True, enable wide-character (CJK) support WIDE_CHARS_MODE = wcwidth is not None +# Constant that can be used as part of passed rows to generate a separating line +# It is purposely an unprintable character, very unlikely to be used in a table +SEPARATING_LINE = "\001" Line = namedtuple("Line", ["begin", "hline", "sep", "end"]) @@ -90,7 +59,7 @@ def _is_file(f): DataRow = namedtuple("DataRow", ["begin", "sep", "end"]) -# A table structure is suppposed to be: +# A table structure is supposed to be: # # --- lineabove --------- # headerrow @@ -136,6 +105,15 @@ def _is_file(f): 
) +def _is_separating_line(row): + row_type = type(row) + is_sl = (row_type == list or row_type == str) and ( + (len(row) >= 1 and row[0] == SEPARATING_LINE) + or (len(row) >= 2 and row[1] == SEPARATING_LINE) + ) + return is_sl + + def _pipe_segment_with_colons(align, colwidth): """Return a segment of a horizontal line with optional colons which indicate column's alignment (as in `pipe` output format).""" @@ -206,7 +184,7 @@ def _html_row_with_attrs(celltag, unsafe, cell_values, colwidths, colaligns): ] rowhtml = "{}".format("".join(values_with_attrs).rstrip()) if celltag == "th": # it's a header row, create a new table header - rowhtml = "\n\n{}\n\n".format(rowhtml) + rowhtml = f"
    \n\n{rowhtml}\n\n" return rowhtml @@ -218,7 +196,7 @@ def _moin_row_with_attrs(celltag, cell_values, colwidths, colaligns, header=""): "decimal": '', } values_with_attrs = [ - "{0}{1} {2} ".format(celltag, alignment.get(a, ""), header + c + header) + "{}{} {} ".format(celltag, alignment.get(a, ""), header + c + header) for c, a in zip(cell_values, colaligns) ] return "".join(values_with_attrs) + "||" @@ -237,6 +215,59 @@ def _latex_line_begin_tabular(colwidths, colaligns, booktabs=False, longtable=Fa ) +def _asciidoc_row(is_header, *args): + """handle header and data rows for asciidoc format""" + + def make_header_line(is_header, colwidths, colaligns): + # generate the column specifiers + + alignment = {"left": "<", "right": ">", "center": "^", "decimal": ">"} + # use the column widths generated by tabulate for the asciidoc column width specifiers + asciidoc_alignments = zip( + colwidths, [alignment[colalign] for colalign in colaligns] + ) + asciidoc_column_specifiers = [ + "{:d}{}".format(width, align) for width, align in asciidoc_alignments + ] + header_list = ['cols="' + (",".join(asciidoc_column_specifiers)) + '"'] + + # generate the list of options (currently only "header") + options_list = [] + + if is_header: + options_list.append("header") + + if options_list: + header_list += ['options="' + ",".join(options_list) + '"'] + + # generate the list of entries in the table header field + + return "[{}]\n|====".format(",".join(header_list)) + + if len(args) == 2: + # two arguments are passed if called in the context of aboveline + # print the table header with column widths and optional header tag + return make_header_line(False, *args) + + elif len(args) == 3: + # three arguments are passed if called in the context of dataline or headerline + # print the table line and make the aboveline if it is a header + + cell_values, colwidths, colaligns = args + data_line = "|" + "|".join(cell_values) + + if is_header: + return make_header_line(True, colwidths, 
colaligns) + "\n" + data_line + else: + return data_line + + else: + raise ValueError( + " _asciidoc_row() requires two (colwidths, colaligns) " + + "or three (cell_values, colwidths, colaligns) arguments) " + ) + + LATEX_ESCAPE_RULES = { r"&": r"\&", r"%": r"\%", @@ -264,7 +295,7 @@ def escape_char(c): def _rst_escape_first_column(rows, headers): def escape_empty(val): - if isinstance(val, (_text_type, _binary_type)) and not val.strip(): + if isinstance(val, (str, bytes)) and not val.strip(): return ".." else: return val @@ -312,6 +343,56 @@ def escape_empty(val): padding=1, with_header_hide=None, ), + "simple_grid": TableFormat( + lineabove=Line("┌", "─", "┬", "┐"), + linebelowheader=Line("├", "─", "┼", "┤"), + linebetweenrows=Line("├", "─", "┼", "┤"), + linebelow=Line("└", "─", "┴", "┘"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "rounded_grid": TableFormat( + lineabove=Line("╭", "─", "┬", "╮"), + linebelowheader=Line("├", "─", "┼", "┤"), + linebetweenrows=Line("├", "─", "┼", "┤"), + linebelow=Line("╰", "─", "┴", "╯"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "heavy_grid": TableFormat( + lineabove=Line("┏", "━", "┳", "┓"), + linebelowheader=Line("┣", "━", "╋", "┫"), + linebetweenrows=Line("┣", "━", "╋", "┫"), + linebelow=Line("┗", "━", "┻", "┛"), + headerrow=DataRow("┃", "┃", "┃"), + datarow=DataRow("┃", "┃", "┃"), + padding=1, + with_header_hide=None, + ), + "mixed_grid": TableFormat( + lineabove=Line("┍", "━", "┯", "┑"), + linebelowheader=Line("┝", "━", "┿", "┥"), + linebetweenrows=Line("├", "─", "┼", "┤"), + linebelow=Line("┕", "━", "┷", "┙"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "double_grid": TableFormat( + lineabove=Line("╔", "═", "╦", "╗"), + linebelowheader=Line("╠", "═", "╬", "╣"), + linebetweenrows=Line("╠", "═", "╬", "╣"), + 
linebelow=Line("╚", "═", "╩", "╝"), + headerrow=DataRow("║", "║", "║"), + datarow=DataRow("║", "║", "║"), + padding=1, + with_header_hide=None, + ), "fancy_grid": TableFormat( lineabove=Line("╒", "═", "╤", "╕"), linebelowheader=Line("╞", "═", "╪", "╡"), @@ -322,6 +403,66 @@ def escape_empty(val): padding=1, with_header_hide=None, ), + "outline": TableFormat( + lineabove=Line("+", "-", "+", "+"), + linebelowheader=Line("+", "=", "+", "+"), + linebetweenrows=None, + linebelow=Line("+", "-", "+", "+"), + headerrow=DataRow("|", "|", "|"), + datarow=DataRow("|", "|", "|"), + padding=1, + with_header_hide=None, + ), + "simple_outline": TableFormat( + lineabove=Line("┌", "─", "┬", "┐"), + linebelowheader=Line("├", "─", "┼", "┤"), + linebetweenrows=None, + linebelow=Line("└", "─", "┴", "┘"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "rounded_outline": TableFormat( + lineabove=Line("╭", "─", "┬", "╮"), + linebelowheader=Line("├", "─", "┼", "┤"), + linebetweenrows=None, + linebelow=Line("╰", "─", "┴", "╯"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "heavy_outline": TableFormat( + lineabove=Line("┏", "━", "┳", "┓"), + linebelowheader=Line("┣", "━", "╋", "┫"), + linebetweenrows=None, + linebelow=Line("┗", "━", "┻", "┛"), + headerrow=DataRow("┃", "┃", "┃"), + datarow=DataRow("┃", "┃", "┃"), + padding=1, + with_header_hide=None, + ), + "mixed_outline": TableFormat( + lineabove=Line("┍", "━", "┯", "┑"), + linebelowheader=Line("┝", "━", "┿", "┥"), + linebetweenrows=None, + linebelow=Line("┕", "━", "┷", "┙"), + headerrow=DataRow("│", "│", "│"), + datarow=DataRow("│", "│", "│"), + padding=1, + with_header_hide=None, + ), + "double_outline": TableFormat( + lineabove=Line("╔", "═", "╦", "╗"), + linebelowheader=Line("╠", "═", "╬", "╣"), + linebetweenrows=None, + linebelow=Line("╚", "═", "╩", "╝"), + headerrow=DataRow("║", "║", "║"), + 
datarow=DataRow("║", "║", "║"), + padding=1, + with_header_hide=None, + ), "fancy_outline": TableFormat( lineabove=Line("╒", "═", "╤", "╕"), linebelowheader=Line("╞", "═", "╪", "╡"), @@ -527,6 +668,16 @@ def escape_empty(val): padding=1, with_header_hide=None, ), + "asciidoc": TableFormat( + lineabove=partial(_asciidoc_row, False), + linebelowheader=None, + linebetweenrows=None, + linebelow=Line("|====", "", "", ""), + headerrow=partial(_asciidoc_row, True), + datarow=partial(_asciidoc_row, False), + padding=1, + with_header_hide=["lineabove"], + ), } @@ -539,6 +690,11 @@ def escape_empty(val): "plain": "plain", "simple": "simple", "grid": "grid", + "simple_grid": "simple_grid", + "rounded_grid": "rounded_grid", + "heavy_grid": "heavy_grid", + "mixed_grid": "mixed_grid", + "double_grid": "double_grid", "fancy_grid": "fancy_grid", "pipe": "pipe", "orgtbl": "orgtbl", @@ -561,16 +717,55 @@ def escape_empty(val): _multiline_codes = re.compile(r"\r|\n|\r\n") _multiline_codes_bytes = re.compile(b"\r|\n|\r\n") -_invisible_codes = re.compile( - r"\x1b\[\d+[;\d]*m|\x1b\[\d*\;\d*\;\d*m|\x1b\]8;;(.*?)\x1b\\" -) # ANSI color codes -_invisible_codes_bytes = re.compile( - b"\x1b\\[\\d+\\[;\\d]*m|\x1b\\[\\d*;\\d*;\\d*m|\\x1b\\]8;;(.*?)\\x1b\\\\" -) # ANSI color codes -_invisible_codes_link = re.compile( - r"\x1B]8;[a-zA-Z0-9:]*;[^\x1B]+\x1B\\([^\x1b]+)\x1B]8;;\x1B\\" -) # Terminal hyperlinks +# Handle ANSI escape sequences for both control sequence introducer (CSI) and +# operating system command (OSC). Both of these begin with 0x1b (or octal 033), +# which will be shown below as ESC. 
+# +# CSI ANSI escape codes have the following format, defined in section 5.4 of ECMA-48: +# +# CSI: ESC followed by the '[' character (0x5b) +# Parameter Bytes: 0..n bytes in the range 0x30-0x3f +# Intermediate Bytes: 0..n bytes in the range 0x20-0x2f +# Final Byte: a single byte in the range 0x40-0x7e +# +# Also include the terminal hyperlink sequences as described here: +# https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda +# +# OSC 8 ; params ; uri ST display_text OSC 8 ;; ST +# +# Example: \x1b]8;;https://example.com\x5ctext to show\x1b]8;;\x5c +# +# Where: +# OSC: ESC followed by the ']' character (0x5d) +# params: 0..n optional key value pairs separated by ':' (e.g. foo=bar:baz=qux:abc=123) +# URI: the actual URI with protocol scheme (e.g. https://, file://, ftp://) +# ST: ESC followed by the '\' character (0x5c) +_esc = r"\x1b" +_csi = rf"{_esc}\[" +_osc = rf"{_esc}\]" +_st = rf"{_esc}\\" + +_ansi_escape_pat = rf""" + ( + # terminal colors, etc + {_csi} # CSI + [\x30-\x3f]* # parameter bytes + [\x20-\x2f]* # intermediate bytes + [\x40-\x7e] # final byte + | + # terminal hyperlinks + {_osc}8; # OSC opening + (\w+=\w+:?)* # key=value params list (submatch 2) + ; # delimiter + ([^{_esc}]+) # URI - anything but ESC (submatch 3) + {_st} # ST + ([^{_esc}]+) # link text - anything but ESC (submatch 4) + {_osc}8;;{_st} # "closing" OSC sequence + ) +""" +_ansi_codes = re.compile(_ansi_escape_pat, re.VERBOSE) +_ansi_codes_bytes = re.compile(_ansi_escape_pat.encode("utf8"), re.VERBOSE) _ansi_color_reset_code = "\033[0m" _float_with_thousands_separators = re.compile( @@ -654,7 +849,7 @@ def _isnumber(string): """ if not _isconvertible(float, string): return False - elif isinstance(string, (_text_type, _binary_type)) and ( + elif isinstance(string, (str, bytes)) and ( math.isinf(float(string)) or math.isnan(float(string)) ): return string.lower() in ["inf", "-inf", "nan"] @@ -670,7 +865,7 @@ def _isint(string, inttype=int): """ return ( type(string) is 
inttype - or (isinstance(string, _binary_type) or isinstance(string, _text_type)) + or isinstance(string, (bytes, str)) and _isconvertible(inttype, string) ) @@ -684,8 +879,8 @@ def _isbool(string): >>> _isbool(1) False """ - return type(string) is _bool_type or ( - isinstance(string, (_binary_type, _text_type)) and string in ("True", "False") + return type(string) is bool or ( + isinstance(string, (bytes, str)) and string in ("True", "False") ) @@ -705,27 +900,23 @@ def _type(string, has_invisible=True, numparse=True): """ - if has_invisible and ( - isinstance(string, _text_type) or isinstance(string, _binary_type) - ): - string = _strip_invisible(string) + if has_invisible and isinstance(string, (str, bytes)): + string = _strip_ansi(string) if string is None: - return _none_type + return type(None) elif hasattr(string, "isoformat"): # datetime.datetime, date, and time - return _text_type + return str elif _isbool(string): - return _bool_type + return bool elif _isint(string) and numparse: return int - elif _isint(string, _long_type) and numparse: - return int elif _isnumber(string) and numparse: return float - elif isinstance(string, _binary_type): - return _binary_type + elif isinstance(string, bytes): + return bytes else: - return _text_type + return str def _afterpoint(string): @@ -794,18 +985,24 @@ def _padnone(ignore_width, s): return s -def _strip_invisible(s): - r"""Remove invisible ANSI color codes. +def _strip_ansi(s): + r"""Remove ANSI escape sequences, both CSI (color codes, etc) and OSC hyperlinks. + + CSI sequences are simply removed from the output, while OSC hyperlinks are replaced + with the link text. Note: it may be desirable to show the URI instead but this is not + supported. 
+ + >>> repr(_strip_ansi('\x1B]8;;https://example.com\x1B\\This is a link\x1B]8;;\x1B\\')) + "'This is a link'" - >>> str(_strip_invisible('\x1B]8;;https://example.com\x1B\\This is a link\x1B]8;;\x1B\\')) - 'This is a link' + >>> repr(_strip_ansi('\x1b[31mred\x1b[0m text')) + "'red text'" """ - if isinstance(s, _text_type): - links_removed = re.sub(_invisible_codes_link, "\\1", s) - return re.sub(_invisible_codes, "", links_removed) + if isinstance(s, str): + return _ansi_codes.sub(r"\4", s) else: # a bytestring - return re.sub(_invisible_codes_bytes, "", s) + return _ansi_codes_bytes.sub(r"\4", s) def _visible_width(s): @@ -820,14 +1017,14 @@ def _visible_width(s): len_fn = wcwidth.wcswidth else: len_fn = len - if isinstance(s, _text_type) or isinstance(s, _binary_type): - return len_fn(_strip_invisible(s)) + if isinstance(s, (str, bytes)): + return len_fn(_strip_ansi(s)) else: - return len_fn(_text_type(s)) + return len_fn(str(s)) def _is_multiline(s): - if isinstance(s, _text_type): + if isinstance(s, str): return bool(re.search(_multiline_codes, s)) else: # a bytestring return bool(re.search(_multiline_codes_bytes, s)) @@ -864,7 +1061,7 @@ def _align_column_choose_padfn(strings, alignment, has_invisible): padfn = _padboth elif alignment == "decimal": if has_invisible: - decimals = [_afterpoint(_strip_invisible(s)) for s in strings] + decimals = [_afterpoint(_strip_ansi(s)) for s in strings] else: decimals = [_afterpoint(s) for s in strings] maxdecimals = max(decimals) @@ -960,20 +1157,20 @@ def _align_column( def _more_generic(type1, type2): types = { - _none_type: 0, - _bool_type: 1, + type(None): 0, # noqa + bool: 1, int: 2, float: 3, - _binary_type: 4, - _text_type: 5, + bytes: 4, + str: 5, } invtypes = { - 5: _text_type, - 4: _binary_type, + 5: str, + 4: bytes, 3: float, 2: int, - 1: _bool_type, - 0: _none_type, + 1: bool, + 0: type(None), } moregeneric = max(types.get(type1, 5), types.get(type2, 5)) return invtypes[moregeneric] @@ -982,30 +1179,30 @@ def 
_more_generic(type1, type2): def _column_type(strings, has_invisible=True, numparse=True): """The least generic type all column values are convertible to. - >>> _column_type([True, False]) is _bool_type + >>> _column_type([True, False]) is bool True - >>> _column_type(["1", "2"]) is _int_type + >>> _column_type(["1", "2"]) is int True - >>> _column_type(["1", "2.3"]) is _float_type + >>> _column_type(["1", "2.3"]) is float True - >>> _column_type(["1", "2.3", "four"]) is _text_type + >>> _column_type(["1", "2.3", "four"]) is str True - >>> _column_type(["four", '\u043f\u044f\u0442\u044c']) is _text_type + >>> _column_type(["four", '\u043f\u044f\u0442\u044c']) is str True - >>> _column_type([None, "brux"]) is _text_type + >>> _column_type([None, "brux"]) is str True - >>> _column_type([1, 2, None]) is _int_type + >>> _column_type([1, 2, None]) is int True >>> import datetime as dt - >>> _column_type([dt.datetime(1991,2,19), dt.time(17,35)]) is _text_type + >>> _column_type([dt.datetime(1991,2,19), dt.time(17,35)]) is str True """ types = [_type(s, has_invisible, numparse) for s in strings] - return reduce(_more_generic, types, _bool_type) + return reduce(_more_generic, types, bool) -def _format(val, valtype, floatfmt, missingval="", has_invisible=True): +def _format(val, valtype, floatfmt, intfmt, missingval="", has_invisible=True): """Format a value according to its type. 
Unicode is supported: @@ -1020,25 +1217,25 @@ def _format(val, valtype, floatfmt, missingval="", has_invisible=True): if val is None: return missingval - if valtype in [int, _text_type]: - return "{0}".format(val) - elif valtype is _binary_type: + if valtype is str: + return f"{val}" + elif valtype is int: + return format(val, intfmt) + elif valtype is bytes: try: - return _text_type(val, "ascii") - except TypeError: - return _text_type(val) + return str(val, "ascii") + except (TypeError, UnicodeDecodeError): + return str(val) elif valtype is float: - is_a_colored_number = has_invisible and isinstance( - val, (_text_type, _binary_type) - ) + is_a_colored_number = has_invisible and isinstance(val, (str, bytes)) if is_a_colored_number: - raw_val = _strip_invisible(val) + raw_val = _strip_ansi(val) formatted_val = format(float(raw_val), floatfmt) return val.replace(raw_val, formatted_val) else: return format(float(val), floatfmt) else: - return "{0}".format(val) + return f"{val}" def _align_header( @@ -1059,20 +1256,48 @@ def _align_header( elif alignment == "center": return _padboth(width, header) elif not alignment: - return "{0}".format(header) + return f"{header}" else: return _padleft(width, header) +def _remove_separating_lines(rows): + if type(rows) == list: + separating_lines = [] + sans_rows = [] + for index, row in enumerate(rows): + if _is_separating_line(row): + separating_lines.append(index) + else: + sans_rows.append(row) + return sans_rows, separating_lines + else: + return rows, None + + +def _reinsert_separating_lines(rows, separating_lines): + if separating_lines: + for index in separating_lines: + rows.insert(index, SEPARATING_LINE) + + def _prepend_row_index(rows, index): """Add a left-most index column.""" if index is None or index is False: return rows - if len(index) != len(rows): - print("index=", index) - print("rows=", rows) - raise ValueError("index must be as long as the number of data rows") - rows = [[v] + list(row) for v, row in 
zip(index, rows)] + if isinstance(index, Sized) and len(index) != len(rows): + raise ValueError( + "index must be as long as the number of data rows: " + + "len(index)={} len(rows)={}".format(len(index), len(rows)) + ) + sans_rows, separating_lines = _remove_separating_lines(rows) + new_rows = [] + index_iter = iter(index) + for row in sans_rows: + index_v = next(index_iter) + new_rows.append([index_v] + list(row)) + rows = new_rows + _reinsert_separating_lines(rows, separating_lines) return rows @@ -1097,6 +1322,8 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): * list of OrderedDicts (usually used with headers="keys") + * list of dataclasses (Python 3.7+ only, usually used with headers="keys") + * 2D NumPy arrays * NumPy record arrays (usually used with headers="keys") @@ -1150,9 +1377,9 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): raise ValueError("tabular data doesn't appear to be a dict or a DataFrame") if headers == "keys": - headers = list(map(_text_type, keys)) # headers should be strings + headers = list(map(str, keys)) # headers should be strings - else: # it's a usual an iterable of iterables, or a NumPy array + else: # it's a usual iterable of iterables, or a NumPy array, or an iterable of dataclasses rows = list(tabular_data) if headers == "keys" and not rows: @@ -1172,7 +1399,7 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): and hasattr(rows[0], "_fields") ): # namedtuple - headers = list(map(_text_type, rows[0]._fields)) + headers = list(map(str, rows[0]._fields)) elif len(rows) > 0 and hasattr(rows[0], "keys") and hasattr(rows[0], "values"): # dict-like object uniq_keys = set() # implements hashed lookup @@ -1193,11 +1420,11 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): elif isinstance(headers, dict): # a dict of headers for a list of dicts headers = [headers.get(k, k) for k in keys] - headers = list(map(_text_type, headers)) + headers 
= list(map(str, headers)) elif headers == "firstrow": if len(rows) > 0: headers = [firstdict.get(k, k) for k in keys] - headers = list(map(_text_type, headers)) + headers = list(map(str, headers)) else: headers = [] elif headers: @@ -1216,9 +1443,20 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): # print tabulate(cursor, headers='keys') headers = [column[0] for column in tabular_data.description] + elif ( + dataclasses is not None + and len(rows) > 0 + and dataclasses.is_dataclass(rows[0]) + ): + # Python 3.7+'s dataclass + field_names = [field.name for field in dataclasses.fields(rows[0])] + if headers == "keys": + headers = field_names + rows = [[getattr(row, f) for f in field_names] for row in rows] + elif headers == "keys" and len(rows) > 0: # keys are column indices - headers = list(map(_text_type, range(len(rows[0])))) + headers = list(map(str, range(len(rows[0])))) # take headers from the first row if necessary if headers == "firstrow" and len(rows) > 0: @@ -1227,18 +1465,23 @@ def _normalize_tabular_data(tabular_data, headers, showindex="default"): index = index[1:] else: headers = rows[0] - headers = list(map(_text_type, headers)) # headers should be strings + headers = list(map(str, headers)) # headers should be strings rows = rows[1:] + elif headers == "firstrow": + headers = [] - headers = list(map(_text_type, headers)) - rows = list(map(list, rows)) + headers = list(map(str, headers)) + # rows = list(map(list, rows)) + rows = list(map(lambda r: r if _is_separating_line(r) else list(r), rows)) # add or remove an index column - showindex_is_a_str = type(showindex) in [_text_type, _binary_type] + showindex_is_a_str = type(showindex) in [str, bytes] if showindex == "default" and index is not None: rows = _prepend_row_index(rows, index) - elif isinstance(showindex, Iterable) and not showindex_is_a_str: + elif isinstance(showindex, Sized) and not showindex_is_a_str: rows = _prepend_row_index(rows, list(showindex)) + elif 
isinstance(showindex, Iterable) and not showindex_is_a_str: + rows = _prepend_row_index(rows, showindex) elif showindex == "always" or (_bool(showindex) and not showindex_is_a_str): if index is None: index = list(range(len(rows))) @@ -1270,7 +1513,13 @@ def _wrap_text_to_colwidths(list_of_lists, colwidths, numparses=True): if width is not None: wrapper = _CustomTextWrap(width=width) - wrapped = wrapper.wrap(cell) + # Cast based on our internal type handling + # Any future custom formatting of types (such as datetimes) + # may need to be more explicit than just `str` of the object + casted_cell = ( + str(cell) if _isnumber(cell) else _type(cell, numparse)(cell) + ) + wrapped = wrapper.wrap(casted_cell) new_row.append("\n".join(wrapped)) else: new_row.append(cell) @@ -1279,11 +1528,37 @@ def _wrap_text_to_colwidths(list_of_lists, colwidths, numparses=True): return result +def _to_str(s, encoding="utf8", errors="ignore"): + """ + A type safe wrapper for converting a bytestring to str. This is essentially just + a wrapper around .decode() intended for use with things like map(), but with some + specific behavior: + + 1. if the given parameter is not a bytestring, it is returned unmodified + 2. decode() is called for the given parameter and assumes utf8 encoding, but the + default error behavior is changed from 'strict' to 'ignore' + + >>> repr(_to_str(b'foo')) + "'foo'" + + >>> repr(_to_str('foo')) + "'foo'" + + >>> repr(_to_str(42)) + "'42'" + + """ + if isinstance(s, bytes): + return s.decode(encoding=encoding, errors=errors) + return str(s) + + def tabulate( tabular_data, headers=(), tablefmt="simple", floatfmt=_DEFAULT_FLOATFMT, + intfmt=_DEFAULT_INTFMT, numalign=_DEFAULT_ALIGN, stralign=_DEFAULT_ALIGN, missingval=_DEFAULT_MISSINGVAL, @@ -1291,6 +1566,8 @@ def tabulate( disable_numparse=False, colalign=None, maxcolwidths=None, + rowalign=None, + maxheadercolwidths=None, ): """Format a fixed width table for pretty printing. 
@@ -1304,8 +1581,8 @@ def tabulate( The first required argument (`tabular_data`) can be a list-of-lists (or another iterable of iterables), a list of named tuples, a dictionary of iterables, an iterable of dictionaries, - a two-dimensional NumPy array, NumPy record array, or a Pandas' - dataframe. + an iterable of dataclasses (Python 3.7+), a two-dimensional NumPy array, + NumPy record array, or a Pandas' dataframe. Table headers @@ -1357,6 +1634,10 @@ def tabulate( Table formats ------------- + `intfmt` is a format specification used for columns which + contain numeric data without a decimal point. This can also be + a list or tuple of format strings, one per column. + `floatfmt` is a format specification used for columns which contain numeric data with a decimal point. This can also be a list or tuple of format strings, one per column. @@ -1427,7 +1708,73 @@ def tabulate( | eggs | 451 | +------+----------+ - "fancy_grid" draws a grid using box-drawing characters: + "simple_grid" draws a grid using single-line box-drawing + characters: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "simple_grid")) + ┌───────────┬───────────┐ + │ strings │ numbers │ + ├───────────┼───────────┤ + │ spam │ 41.9999 │ + ├───────────┼───────────┤ + │ eggs │ 451 │ + └───────────┴───────────┘ + + "rounded_grid" draws a grid using single-line box-drawing + characters with rounded corners: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "rounded_grid")) + ╭───────────┬───────────╮ + │ strings │ numbers │ + ├───────────┼───────────┤ + │ spam │ 41.9999 │ + ├───────────┼───────────┤ + │ eggs │ 451 │ + ╰───────────┴───────────╯ + + "heavy_grid" draws a grid using bold (thick) single-line box-drawing + characters: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... 
["strings", "numbers"], "heavy_grid")) + ┏━━━━━━━━━━━┳━━━━━━━━━━━┓ + ┃ strings ┃ numbers ┃ + ┣━━━━━━━━━━━╋━━━━━━━━━━━┫ + ┃ spam ┃ 41.9999 ┃ + ┣━━━━━━━━━━━╋━━━━━━━━━━━┫ + ┃ eggs ┃ 451 ┃ + ┗━━━━━━━━━━━┻━━━━━━━━━━━┛ + + "mixed_grid" draws a grid using a mix of light (thin) and heavy (thick) lines + box-drawing characters: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "mixed_grid")) + ┍━━━━━━━━━━━┯━━━━━━━━━━━┑ + │ strings │ numbers │ + ┝━━━━━━━━━━━┿━━━━━━━━━━━┥ + │ spam │ 41.9999 │ + ├───────────┼───────────┤ + │ eggs │ 451 │ + ┕━━━━━━━━━━━┷━━━━━━━━━━━┙ + + "double_grid" draws a grid using double-line box-drawing + characters: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "double_grid")) + ╔═══════════╦═══════════╗ + ║ strings ║ numbers ║ + ╠═══════════╬═══════════╣ + ║ spam ║ 41.9999 ║ + ╠═══════════╬═══════════╣ + ║ eggs ║ 451 ║ + ╚═══════════╩═══════════╝ + + "fancy_grid" draws a grid using a mix of single and + double-line box-drawing characters: >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], ... ["strings", "numbers"], "fancy_grid")) @@ -1439,6 +1786,89 @@ def tabulate( │ eggs │ 451 │ ╘═══════════╧═══════════╛ + "outline" is the same as the "grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "outline")) + +-----------+-----------+ + | strings | numbers | + +===========+===========+ + | spam | 41.9999 | + | eggs | 451 | + +-----------+-----------+ + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], tablefmt="outline")) + +------+----------+ + | spam | 41.9999 | + | eggs | 451 | + +------+----------+ + + "simple_outline" is the same as the "simple_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... 
["strings", "numbers"], "simple_outline")) + ┌───────────┬───────────┐ + │ strings │ numbers │ + ├───────────┼───────────┤ + │ spam │ 41.9999 │ + │ eggs │ 451 │ + └───────────┴───────────┘ + + "rounded_outline" is the same as the "rounded_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "rounded_outline")) + ╭───────────┬───────────╮ + │ strings │ numbers │ + ├───────────┼───────────┤ + │ spam │ 41.9999 │ + │ eggs │ 451 │ + ╰───────────┴───────────╯ + + "heavy_outline" is the same as the "heavy_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "heavy_outline")) + ┏━━━━━━━━━━━┳━━━━━━━━━━━┓ + ┃ strings ┃ numbers ┃ + ┣━━━━━━━━━━━╋━━━━━━━━━━━┫ + ┃ spam ┃ 41.9999 ┃ + ┃ eggs ┃ 451 ┃ + ┗━━━━━━━━━━━┻━━━━━━━━━━━┛ + + "mixed_outline" is the same as the "mixed_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "mixed_outline")) + ┍━━━━━━━━━━━┯━━━━━━━━━━━┑ + │ strings │ numbers │ + ┝━━━━━━━━━━━┿━━━━━━━━━━━┥ + │ spam │ 41.9999 │ + │ eggs │ 451 │ + ┕━━━━━━━━━━━┷━━━━━━━━━━━┙ + + "double_outline" is the same as the "double_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... ["strings", "numbers"], "double_outline")) + ╔═══════════╦═══════════╗ + ║ strings ║ numbers ║ + ╠═══════════╬═══════════╣ + ║ spam ║ 41.9999 ║ + ║ eggs ║ 451 ║ + ╚═══════════╩═══════════╝ + + "fancy_outline" is the same as the "fancy_grid" format but doesn't draw lines between rows: + + >>> print(tabulate([["spam", 41.9999], ["eggs", "451.0"]], + ... 
["strings", "numbers"], "fancy_outline")) + ╒═══════════╤═══════════╕ + │ strings │ numbers │ + ╞═══════════╪═══════════╡ + │ spam │ 41.9999 │ + │ eggs │ 451 │ + ╘═══════════╧═══════════╛ + "pipe" is like tables in PHP Markdown Extra extension or Pandoc pipe_tables: @@ -1612,14 +2042,17 @@ def tabulate( | | | better if it is wrapped a bit | +------------+------------+-------------------------------+ + Header column width can be specified in a similar way using `maxheadercolwidth` """ if tabular_data is None: tabular_data = [] + list_of_lists, headers = _normalize_tabular_data( tabular_data, headers, showindex=showindex ) + list_of_lists, separating_lines = _remove_separating_lines(list_of_lists) if maxcolwidths is not None: num_cols = len(list_of_lists[0]) @@ -1633,6 +2066,20 @@ def tabulate( list_of_lists, maxcolwidths, numparses=numparses ) + if maxheadercolwidths is not None: + num_cols = len(list_of_lists[0]) + if isinstance(maxheadercolwidths, int): # Expand scalar for all columns + maxheadercolwidths = _expand_iterable( + maxheadercolwidths, num_cols, maxheadercolwidths + ) + else: # Ignore col width for any 'trailing' columns + maxheadercolwidths = _expand_iterable(maxheadercolwidths, num_cols, None) + + numparses = _expand_numparse(disable_numparse, num_cols) + headers = _wrap_text_to_colwidths( + [headers], maxheadercolwidths, numparses=numparses + )[0] + # empty values in the first column of RST tables should be escaped (issue #82) # "" should be escaped as "\\ " or ".." if tablefmt == "rst": @@ -1654,14 +2101,21 @@ def tabulate( # optimization: look for ANSI control codes once, # enable smart width functions only if a control code is found + # + # convert the headers and rows into a single, tab-delimited string ensuring + # that any bytestrings are decoded safely (i.e. 
errors ignored) plain_text = "\t".join( - ["\t".join(map(_text_type, headers))] - + ["\t".join(map(_text_type, row)) for row in list_of_lists] + chain( + # headers + map(_to_str, headers), + # rows: chain the rows together into a single iterable after mapping + # the bytestring conversino to each cell value + chain.from_iterable(map(_to_str, row) for row in list_of_lists), + ) ) - has_invisible = re.search(_invisible_codes, plain_text) - if not has_invisible: - has_invisible = re.search(_invisible_codes_link, plain_text) + has_invisible = _ansi_codes.search(plain_text) is not None + enable_widechars = wcwidth is not None and WIDE_CHARS_MODE if ( not isinstance(tablefmt, TableFormat) @@ -1678,7 +2132,7 @@ def tabulate( cols = list(izip_longest(*list_of_lists)) numparses = _expand_numparse(disable_numparse, len(cols)) coltypes = [_column_type(col, numparse=np) for col, np in zip(cols, numparses)] - if isinstance(floatfmt, basestring): # old version + if isinstance(floatfmt, str): # old version float_formats = len(cols) * [ floatfmt ] # just duplicate the string to use in each column @@ -1686,15 +2140,25 @@ def tabulate( float_formats = list(floatfmt) if len(float_formats) < len(cols): float_formats.extend((len(cols) - len(float_formats)) * [_DEFAULT_FLOATFMT]) - if isinstance(missingval, basestring): + if isinstance(intfmt, str): # old version + int_formats = len(cols) * [ + intfmt + ] # just duplicate the string to use in each column + else: # if intfmt is list, tuple etc we have one per column + int_formats = list(intfmt) + if len(int_formats) < len(cols): + int_formats.extend((len(cols) - len(int_formats)) * [_DEFAULT_INTFMT]) + if isinstance(missingval, str): missing_vals = len(cols) * [missingval] else: missing_vals = list(missingval) if len(missing_vals) < len(cols): missing_vals.extend((len(cols) - len(missing_vals)) * [_DEFAULT_MISSINGVAL]) cols = [ - [_format(v, ct, fl_fmt, miss_v, has_invisible) for v in c] - for c, ct, fl_fmt, miss_v in zip(cols, coltypes, 
float_formats, missing_vals) + [_format(v, ct, fl_fmt, int_fmt, miss_v, has_invisible) for v in c] + for c, ct, fl_fmt, int_fmt, miss_v in zip( + cols, coltypes, float_formats, int_formats, missing_vals + ) ] # align columns @@ -1731,7 +2195,13 @@ def tabulate( if not isinstance(tablefmt, TableFormat): tablefmt = _table_formats.get(tablefmt, _table_formats["simple"]) - return _format_table(tablefmt, headers, rows, minwidths, aligns, is_multiline) + ra_default = rowalign if isinstance(rowalign, str) else None + rowaligns = _expand_iterable(rowalign, len(rows), ra_default) + _reinsert_separating_lines(rows, separating_lines) + + return _format_table( + tablefmt, headers, rows, minwidths, aligns, is_multiline, rowaligns=rowaligns + ) def _expand_numparse(disable_numparse, column_count): @@ -1759,7 +2229,7 @@ def _expand_iterable(original, num_desired, default): If `original` is not a list to begin with (i.e. scalar value) a list of length `num_desired` completely populated with `default will be returned """ - if isinstance(original, Iterable): + if isinstance(original, Iterable) and not isinstance(original, str): return original + [default] * (num_desired - len(original)) else: return [default] * num_desired @@ -1790,20 +2260,39 @@ def _build_row(padded_cells, colwidths, colaligns, rowfmt): return _build_simple_row(padded_cells, rowfmt) -def _append_basic_row(lines, padded_cells, colwidths, colaligns, rowfmt): +def _append_basic_row(lines, padded_cells, colwidths, colaligns, rowfmt, rowalign=None): + # NOTE: rowalign is ignored and exists for api compatibility with _append_multiline_row lines.append(_build_row(padded_cells, colwidths, colaligns, rowfmt)) return lines +def _align_cell_veritically(text_lines, num_lines, column_width, row_alignment): + delta_lines = num_lines - len(text_lines) + blank = [" " * column_width] + if row_alignment == "bottom": + return blank * delta_lines + text_lines + elif row_alignment == "center": + top_delta = delta_lines // 2 + 
bottom_delta = delta_lines - top_delta + return top_delta * blank + text_lines + bottom_delta * blank + else: + return text_lines + blank * delta_lines + + def _append_multiline_row( - lines, padded_multiline_cells, padded_widths, colaligns, rowfmt, pad + lines, padded_multiline_cells, padded_widths, colaligns, rowfmt, pad, rowalign=None ): colwidths = [w - 2 * pad for w in padded_widths] cells_lines = [c.splitlines() for c in padded_multiline_cells] nlines = max(map(len, cells_lines)) # number of lines in the row # vertically pad cells where some lines are missing + # cells_lines = [ + # (cl + [" " * w] * (nlines - len(cl))) for cl, w in zip(cells_lines, colwidths) + # ] + cells_lines = [ - (cl + [" " * w] * (nlines - len(cl))) for cl, w in zip(cells_lines, colwidths) + _align_cell_veritically(cl, nlines, w, rowalign) + for cl, w in zip(cells_lines, colwidths) ] lines_cells = [[cl[i] for cl in cells_lines] for i in range(nlines)] for ln in lines_cells: @@ -1842,7 +2331,7 @@ def str(self): return self -def _format_table(fmt, headers, rows, colwidths, colaligns, is_multiline): +def _format_table(fmt, headers, rows, colwidths, colaligns, is_multiline, rowaligns): """Produce a plain-text representation of the table.""" lines = [] hidden = fmt.with_header_hide if (headers and fmt.with_header_hide) else [] @@ -1870,14 +2359,35 @@ def _format_table(fmt, headers, rows, colwidths, colaligns, is_multiline): if padded_rows and fmt.linebetweenrows and "linebetweenrows" not in hidden: # initial rows with a line below - for row in padded_rows[:-1]: - append_row(lines, row, padded_widths, colaligns, fmt.datarow) + for row, ralign in zip(padded_rows[:-1], rowaligns): + append_row( + lines, row, padded_widths, colaligns, fmt.datarow, rowalign=ralign + ) _append_line(lines, padded_widths, colaligns, fmt.linebetweenrows) # the last row without a line below - append_row(lines, padded_rows[-1], padded_widths, colaligns, fmt.datarow) + append_row( + lines, + padded_rows[-1], + 
padded_widths, + colaligns, + fmt.datarow, + rowalign=rowaligns[-1], + ) else: + separating_line = ( + fmt.linebetweenrows + or fmt.linebelowheader + or fmt.linebelow + or fmt.lineabove + or Line("", "", "", "") + ) for row in padded_rows: - append_row(lines, row, padded_widths, colaligns, fmt.datarow) + # test to see if either the 1st column or the 2nd column (account for showindex) has + # the SEPARATING_LINE flag + if _is_separating_line(row): + _append_line(lines, padded_widths, colaligns, separating_line) + else: + append_row(lines, row, padded_widths, colaligns, fmt.datarow) if fmt.linebelow and "linebelow" not in hidden: _append_line(lines, padded_widths, colaligns, fmt.linebelow) @@ -1909,7 +2419,7 @@ def __init__(self, *args, **kwargs): def _len(item): """Custom len that gets console column width for wide and non-wide characters as well as ignores color codes""" - stripped = _strip_invisible(item) + stripped = _strip_ansi(item) if wcwidth: return wcwidth.wcswidth(stripped) else: @@ -1921,7 +2431,7 @@ def _update_lines(self, lines, new_line): as add any colors from previous lines order to preserve the same formatting as a single unwrapped string. 
""" - code_matches = [x for x in re.finditer(_invisible_codes, new_line)] + code_matches = [x for x in _ansi_codes.finditer(new_line)] color_codes = [ code.string[code.span()[0] : code.span()[1]] for code in code_matches ] @@ -2109,6 +2619,7 @@ def _main(): -o FILE, --output FILE print table to FILE (default: stdout) -s REGEXP, --sep REGEXP use a custom column separator (default: whitespace) -F FPFMT, --float FPFMT floating point number format (default: g) + -I INTFMT, --int INTFMT integer point number format (default: "") -f FMT, --format FMT set output table format; supported formats: plain, simple, grid, fancy_grid, pipe, orgtbl, rst, mediawiki, html, latex, latex_raw, @@ -2124,7 +2635,7 @@ def _main(): opts, args = getopt.getopt( sys.argv[1:], "h1o:s:F:A:f:", - ["help", "header", "output", "sep=", "float=", "align=", "format="], + ["help", "header", "output", "sep=", "float=", "int=", "align=", "format="], ) except getopt.GetoptError as e: print(e) @@ -2132,6 +2643,7 @@ def _main(): sys.exit(2) headers = [] floatfmt = _DEFAULT_FLOATFMT + intfmt = _DEFAULT_INTFMT colalign = None tablefmt = "simple" sep = r"\s+" @@ -2143,6 +2655,8 @@ def _main(): outfile = value elif opt in ["-F", "--float"]: floatfmt = value + elif opt in ["-I", "--int"]: + intfmt = value elif opt in ["-C", "--colalign"]: colalign = value.split() elif opt in ["-f", "--format"]: @@ -2168,6 +2682,7 @@ def _main(): tablefmt=tablefmt, sep=sep, floatfmt=floatfmt, + intfmt=intfmt, file=out, colalign=colalign, ) @@ -2179,16 +2694,24 @@ def _main(): tablefmt=tablefmt, sep=sep, floatfmt=floatfmt, + intfmt=intfmt, file=out, colalign=colalign, ) -def _pprint_file(fobject, headers, tablefmt, sep, floatfmt, file, colalign): +def _pprint_file(fobject, headers, tablefmt, sep, floatfmt, intfmt, file, colalign): rows = fobject.readlines() table = [re.split(sep, r.rstrip()) for r in rows if r.strip()] print( - tabulate(table, headers, tablefmt, floatfmt=floatfmt, colalign=colalign), + tabulate( + table, + 
headers, + tablefmt, + floatfmt=floatfmt, + intfmt=intfmt, + colalign=colalign, + ), file=file, ) diff --git a/python/ray/_private/usage/usage_lib.py b/python/ray/_private/usage/usage_lib.py index 7eda1e3d3c2f..d40effe571b6 100644 --- a/python/ray/_private/usage/usage_lib.py +++ b/python/ray/_private/usage/usage_lib.py @@ -45,6 +45,7 @@ import logging import threading import os +import platform import sys import time import uuid @@ -144,6 +145,8 @@ class UsageStatsToReport: #: The total number of running jobs excluding internal ones # when the report is generated. total_num_running_jobs: Optional[int] + #: The libc version in the OS. + libc_version: Optional[str] @dataclass(init=True) @@ -356,6 +359,13 @@ def _generate_cluster_metadata(): "session_start_timestamp_ms": int(time.time() * 1000), } ) + if sys.platform == "linux": + # Record llibc version + (lib, ver) = platform.libc_ver() + if not lib: + metadata.update({"libc_version": "NA"}) + else: + metadata.update({"libc_version": f"{lib}:{ver}"}) return metadata @@ -759,6 +769,7 @@ def generate_report_data( extra_usage_tags=get_extra_usage_tags_to_report(gcs_client), total_num_nodes=get_total_num_nodes_to_report(gcs_client), total_num_running_jobs=get_total_num_running_jobs_to_report(gcs_client), + libc_version=cluster_metadata.get("libc_version"), ) return data diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index d35be4353bd7..8d1793114ac9 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -44,7 +44,6 @@ import ray import ray._private.ray_constants as ray_constants from ray._private.tls_utils import load_certs_from_env -from ray.core.generated.gcs_pb2 import ErrorTableData from ray.core.generated.runtime_env_common_pb2 import ( RuntimeEnvInfo as ProtoRuntimeEnvInfo, ) @@ -182,27 +181,6 @@ def push_error_to_driver( worker.core_worker.push_error(job_id, error_type, message, time.time()) -def construct_error_message(job_id, error_type, message, timestamp): - 
"""Construct an ErrorTableData object. - - Args: - job_id: The ID of the job that the error should go to. If this is - nil, then the error will go to all drivers. - error_type: The type of the error. - message: The error message. - timestamp: The time of the error. - - Returns: - The ErrorTableData object. - """ - data = ErrorTableData() - data.job_id = job_id.binary() - data.type = error_type - data.error_message = message - data.timestamp = timestamp - return data - - def publish_error_to_driver( error_type: str, message: str, @@ -228,11 +206,12 @@ def publish_error_to_driver( if job_id is None: job_id = ray.JobID.nil() assert isinstance(job_id, ray.JobID) - error_data = construct_error_message(job_id, error_type, message, time.time()) try: - gcs_publisher.publish_error(job_id.hex().encode(), error_data, num_retries) + gcs_publisher.publish_error( + job_id.hex().encode(), error_type, message, job_id, num_retries + ) except Exception: - logger.exception(f"Failed to publish error {error_data}") + logger.exception(f"Failed to publish error: {message} [type {error_type}]") def decode(byte_str: str, allow_none: bool = False, encode_type: str = "utf-8"): @@ -1893,3 +1872,25 @@ def try_import_each_module(module_names_to_import: List[str]) -> None: importlib.import_module(module_to_preload) except ImportError: logger.exception(f'Failed to preload the module "{module_to_preload}"') + + +def update_envs(env_vars: Dict[str, str]): + """ + When updating the environment variable, if there is ${X}, + it will be replaced with the current environment variable. 
+ """ + if not env_vars: + return + + replaceable_keys = [ + "PATH", + "LD_LIBRARY_PATH", + "DYLD_LIBRARY_PATH", + "LD_PRELOAD", + ] + + for key, value in env_vars.items(): + if key in replaceable_keys: + os.environ[key] = value.replace("${" + key + "}", os.environ.get(key, "")) + else: + os.environ[key] = value diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 4389510c7d21..69a8327173c9 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -68,7 +68,6 @@ GcsErrorSubscriber, GcsFunctionKeySubscriber, GcsLogSubscriber, - GcsPublisher, ) from ray._private.inspect_util import is_cython from ray._private.ray_logging import ( @@ -80,6 +79,7 @@ from ray._private.runtime_env.constants import RAY_JOB_CONFIG_JSON_ENV_VAR from ray._private.runtime_env.py_modules import upload_py_modules_if_needed from ray._private.runtime_env.working_dir import upload_working_dir_if_needed +from ray._private.runtime_env.setup_hook import upload_worker_setup_hook_if_needed from ray._private.storage import _load_class from ray._private.utils import check_oversized_function, get_ray_doc_version from ray.exceptions import ObjectStoreFullError, RayError, RaySystemError, RayTaskError @@ -462,6 +462,11 @@ def __init__(self): # Create the lock here because the serializer will use it before # initializing Ray. self.lock = threading.RLock() + # By default, don't show logs from other drivers. This is set to true by Serve + # in order to stream logs from the controller and replica actors across + # different drivers that connect to the same Serve instance. + # See https://github.com/ray-project/ray/pull/35070. + self._filter_logs_by_job = True @property def connected(self): @@ -738,12 +743,12 @@ def get_objects(self, object_refs: list, timeout: Optional[float] = None): "which is not an ray.ObjectRef." 
) - timeout_ms = int(timeout * 1000) if timeout else -1 + timeout_ms = int(timeout * 1000) if timeout is not None else -1 data_metadata_pairs = self.core_worker.get_objects( object_refs, self.current_task_id, timeout_ms ) debugger_breakpoint = b"" - for (data, metadata) in data_metadata_pairs: + for data, metadata in data_metadata_pairs: if metadata: metadata_fields = metadata.split(b",") if len(metadata_fields) >= 2 and metadata_fields[1].startswith( @@ -871,8 +876,11 @@ def print_logs(self): last_polling_batch_size = 0 continue - # Don't show logs from other drivers. - if data["job"] and data["job"] != job_id_hex: + if ( + self._filter_logs_by_job + and data["job"] + and data["job"] != job_id_hex + ): last_polling_batch_size = 0 continue @@ -898,7 +906,7 @@ def print_logs(self): @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def get_gpu_ids(): """Get the IDs of the GPUs that are available to the worker. @@ -1111,7 +1119,7 @@ def _repr_html_(self): @PublicAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def init( address: Optional[str] = None, *, @@ -1655,7 +1663,7 @@ def init( @PublicAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def shutdown(_exiting_interpreter: bool = False): """Disconnect the worker, and terminate processes started by ray.init(). 
@@ -1859,6 +1867,28 @@ def color_for(data: Dict[str, str], line: str) -> str: return colorama.Style.BRIGHT + colorama.Fore.YELLOW else: return colorama.Style.BRIGHT + colorama.Fore.CYAN + elif os.getenv("RAY_COLOR_PREFIX") == "1": + colors = [ + # colorama.Fore.BLUE, # Too dark + colorama.Fore.MAGENTA, + colorama.Fore.CYAN, + colorama.Fore.GREEN, + # colorama.Fore.WHITE, # Too light + # colorama.Fore.RED, + colorama.Fore.LIGHTBLACK_EX, + colorama.Fore.LIGHTBLUE_EX, + # colorama.Fore.LIGHTCYAN_EX, # Too light + # colorama.Fore.LIGHTGREEN_EX, # Too light + colorama.Fore.LIGHTMAGENTA_EX, + # colorama.Fore.LIGHTWHITE_EX, # Too light + # colorama.Fore.LIGHTYELLOW_EX, # Too light + ] + pid = data.get("pid", 0) + try: + i = int(pid) + except ValueError: + i = 0 + return colors[i % len(colors)] else: return colorama.Fore.CYAN @@ -1980,7 +2010,7 @@ def listen_error_messages(worker, threads_stopped): @PublicAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def is_initialized() -> bool: """Check if ray.init has been called yet. @@ -2052,7 +2082,7 @@ def connect( ray._private.state.state._initialize_global_state( ray._raylet.GcsClientOptions.from_gcs_address(node.gcs_address) ) - worker.gcs_publisher = GcsPublisher(address=worker.gcs_client.address) + worker.gcs_publisher = ray._raylet.GcsPublisher(address=worker.gcs_client.address) # Initialize some fields. if mode in (WORKER_MODE, RESTORE_WORKER_MODE, SPILL_WORKER_MODE): # We should not specify the job_id if it's `WORKER_MODE`. @@ -2135,7 +2165,7 @@ def connect( # If it's a driver and it's not coming from ray client, we'll prepare the # environment here. If it's ray client, the environment will be prepared # at the server side. 
- if mode == SCRIPT_MODE and not job_config.client_job and job_config.runtime_env: + if mode == SCRIPT_MODE and not job_config._client_job and job_config.runtime_env: scratch_dir: str = worker.node.get_runtime_env_dir_path() runtime_env = job_config.runtime_env or {} runtime_env = upload_py_modules_if_needed( @@ -2144,6 +2174,10 @@ def connect( runtime_env = upload_working_dir_if_needed( runtime_env, scratch_dir, logger=logger ) + runtime_env = upload_worker_setup_hook_if_needed( + runtime_env, + worker, + ) # Remove excludes, it isn't relevant after the upload step. runtime_env.pop("excludes", None) job_config.set_runtime_env(runtime_env) @@ -2165,13 +2199,13 @@ def connect( code_paths.append(script_directory) # In client mode, if we use runtime envs with "working_dir", then # it'll be handled automatically. Otherwise, add the current dir. - if not job_config.client_job and not job_config.runtime_env_has_working_dir(): + if not job_config._client_job and not job_config._runtime_env_has_working_dir(): current_directory = os.path.abspath(os.path.curdir) code_paths.append(current_directory) if len(code_paths) != 0: - job_config.py_driver_sys_path.extend(code_paths) + job_config._py_driver_sys_path.extend(code_paths) - serialized_job_config = job_config.serialize() + serialized_job_config = job_config._serialize() if not node.should_redirect_logs(): # Logging to stderr, so give core worker empty logs directory. logs_dir = "" @@ -2407,7 +2441,7 @@ def get(object_refs: "ObjectRef[R]", *, timeout: Optional[float] = None) -> R: @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def get( object_refs: Union[ray.ObjectRef, Sequence[ray.ObjectRef]], *, @@ -2442,12 +2476,9 @@ def get( to get. timeout (Optional[float]): The maximum amount of time in seconds to wait before returning. Set this to None will block until the - corresponding object becomes available. 
- WARNING: In future ray releases ``timeout=0`` will return the object - immediately if it's available, else raise GetTimeoutError in accordance with - the above docstring. The current behavior of blocking until objects become - available of ``timeout=0`` is considered to be a bug, see - https://github.com/ray-project/ray/issues/28465. + corresponding object becomes available. Setting ``timeout=0`` will + return the object immediately if it's available, else raise + GetTimeoutError in accordance with the above docstring. Returns: A Python object or a list of Python objects. @@ -2458,26 +2489,6 @@ def get( Exception: An exception is raised if the task that created the object or that created one of the objects raised an exception. """ - if timeout == 0: - if os.environ.get("RAY_WARN_RAY_GET_TIMEOUT_ZERO", "1") == "1": - import warnings - - warnings.warn( - ( - "Please use timeout=None if you expect ray.get() to block. " - "Setting timeout=0 in future ray releases will raise " - "GetTimeoutError if the objects references are not available. " - "You could suppress this warning by setting " - "RAY_WARN_RAY_GET_TIMEOUT_ZERO=0." - ), - UserWarning, - ) - - # Record this usage in telemetry - import ray._private.usage.usage_lib as usage_lib - - usage_lib.record_extra_usage_tag(usage_lib.TagKey.RAY_GET_TIMEOUT_ZERO, "True") - worker = global_worker worker.check_connected() @@ -2534,7 +2545,7 @@ def get( @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def put( value: Any, *, _owner: Optional["ray.actor.ActorHandle"] = None ) -> "ray.ObjectRef": @@ -2596,7 +2607,7 @@ def put( @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def wait( object_refs: List["ray.ObjectRef"], *, @@ -2688,7 +2699,6 @@ def wait( worker.check_connected() # TODO(swang): Check main thread. with profiling.profile("ray.wait"): - # TODO(rkn): This is a temporary workaround for # https://github.com/ray-project/ray/issues/997. 
However, it should be # fixed in Arrow instead of here. @@ -2718,7 +2728,7 @@ def wait( @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def get_actor(name: str, namespace: Optional[str] = None) -> "ray.actor.ActorHandle": """Get a handle to a named actor. @@ -2753,7 +2763,7 @@ def get_actor(name: str, namespace: Optional[str] = None) -> "ray.actor.ActorHan @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def kill(actor: "ray.actor.ActorHandle", *, no_restart: bool = True): """Kill an actor forcefully. @@ -2783,7 +2793,7 @@ def kill(actor: "ray.actor.ActorHandle", *, no_restart: bool = True): @PublicAPI -@client_mode_hook(auto_init=True) +@client_mode_hook def cancel(object_ref: "ray.ObjectRef", *, force: bool = False, recursive: bool = True): """Cancels a task according to the following conditions. @@ -3033,62 +3043,68 @@ def remote( This function can be used as a decorator with no arguments to define a remote function or actor as follows: - >>> import ray - >>> - >>> @ray.remote - ... def f(a, b, c): - ... return a + b + c - >>> - >>> object_ref = f.remote(1, 2, 3) - >>> result = ray.get(object_ref) - >>> assert result == (1 + 2 + 3) - >>> - >>> @ray.remote - ... class Foo: - ... def __init__(self, arg): - ... self.x = arg - ... - ... def method(self, a): - ... return self.x + a - >>> - >>> actor_handle = Foo.remote(123) - >>> object_ref = actor_handle.method.remote(321) - >>> result = ray.get(object_ref) - >>> assert result == (123 + 321) + .. 
testcode:: + + import ray + + @ray.remote + def f(a, b, c): + return a + b + c + + object_ref = f.remote(1, 2, 3) + result = ray.get(object_ref) + assert result == (1 + 2 + 3) + + @ray.remote + class Foo: + def __init__(self, arg): + self.x = arg + + def method(self, a): + return self.x + a + + actor_handle = Foo.remote(123) + object_ref = actor_handle.method.remote(321) + result = ray.get(object_ref) + assert result == (123 + 321) Equivalently, use a function call to create a remote function or actor. - >>> def g(a, b, c): - ... return a + b + c - >>> - >>> remote_g = ray.remote(g) - >>> object_ref = remote_g.remote(1, 2, 3) - >>> assert ray.get(object_ref) == (1 + 2 + 3) + .. testcode:: - >>> class Bar: - ... def __init__(self, arg): - ... self.x = arg - ... - ... def method(self, a): - ... return self.x + a - >>> - >>> RemoteBar = ray.remote(Bar) - >>> actor_handle = RemoteBar.remote(123) - >>> object_ref = actor_handle.method.remote(321) - >>> result = ray.get(object_ref) - >>> assert result == (123 + 321) + def g(a, b, c): + return a + b + c + + remote_g = ray.remote(g) + object_ref = remote_g.remote(1, 2, 3) + assert ray.get(object_ref) == (1 + 2 + 3) + + class Bar: + def __init__(self, arg): + self.x = arg + + def method(self, a): + return self.x + a + + RemoteBar = ray.remote(Bar) + actor_handle = RemoteBar.remote(123) + object_ref = actor_handle.method.remote(321) + result = ray.get(object_ref) + assert result == (123 + 321) It can also be used with specific keyword arguments as follows: - >>> @ray.remote(num_gpus=1, max_calls=1, num_returns=2) - ... def f(): - ... return 1, 2 - >>> - >>> @ray.remote(num_cpus=2, resources={"CustomResource": 1}) - ... class Foo: - ... def method(self): - ... return 1 + .. 
testcode:: + + @ray.remote(num_gpus=1, max_calls=1, num_returns=2) + def f(): + return 1, 2 + + @ray.remote(num_cpus=2, resources={"CustomResource": 1}) + class Foo: + def method(self): + return 1 Remote task and actor objects returned by @ray.remote can also be dynamically modified with the same arguments as above using diff --git a/python/ray/_private/workers/default_worker.py b/python/ray/_private/workers/default_worker.py index 937f45a8b85d..19fd801532c8 100644 --- a/python/ray/_private/workers/default_worker.py +++ b/python/ray/_private/workers/default_worker.py @@ -1,3 +1,4 @@ +import os import argparse import base64 import json @@ -10,6 +11,7 @@ import ray.actor from ray._private.parameter import RayParams from ray._private.ray_logging import configure_log_file, get_worker_log_file_name +from ray._private.runtime_env.setup_hook import load_and_execute_setup_hook parser = argparse.ArgumentParser( @@ -236,20 +238,29 @@ worker_launched_time_ms=worker_launched_time_ms, ) + worker = ray._private.worker.global_worker + # Setup log file. out_file, err_file = node.get_log_file_handles( get_worker_log_file_name(args.worker_type) ) configure_log_file(out_file, err_file) - ray._private.worker.global_worker.set_out_file(out_file) - ray._private.worker.global_worker.set_err_file(err_file) + worker.set_out_file(out_file) + worker.set_err_file(err_file) if mode == ray.WORKER_MODE and args.worker_preload_modules: module_names_to_import = args.worker_preload_modules.split(",") ray._private.utils.try_import_each_module(module_names_to_import) + # If the worker setup function is configured, run it. 
+ worker_setup_hook_key = os.getenv(ray_constants.WORKER_SETUP_HOOK_ENV_VAR) + if worker_setup_hook_key: + error = load_and_execute_setup_hook(worker_setup_hook_key) + if error is not None: + worker.core_worker.exit_worker("system", error) + if mode == ray.WORKER_MODE: - ray._private.worker.global_worker.main_loop() + worker.main_loop() elif mode in [ray.RESTORE_WORKER_MODE, ray.SPILL_WORKER_MODE]: # It is handled by another thread in the C++ core worker. # We just need to keep the worker alive. diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 89a80aff8bef..9be8234a0084 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -59,16 +59,20 @@ from ray.includes.common cimport ( CObjectReference, CLanguage, CObjectReference, + CWorkerExitType, CRayObject, CRayStatus, + CErrorTableData, CGcsClientOptions, CGcsNodeInfo, CJobTableData, + CLogBatch, CTaskArg, CTaskArgByReference, CTaskArgByValue, CTaskType, CPlacementStrategy, + CPythonFunction, CSchedulingStrategy, CPlacementGroupSchedulingStrategy, CNodeAffinitySchedulingStrategy, @@ -92,6 +96,10 @@ from ray.includes.common cimport ( PLACEMENT_STRATEGY_SPREAD, PLACEMENT_STRATEGY_STRICT_PACK, PLACEMENT_STRATEGY_STRICT_SPREAD, + WORKER_EXIT_TYPE_USER_ERROR, + WORKER_EXIT_TYPE_SYSTEM_ERROR, + kResourceUnitScaling, + kWorkerSetupHookKeyName, ) from ray.includes.unique_ids cimport ( CActorID, @@ -586,6 +594,7 @@ cdef store_task_errors( exc, task_exception, actor, + actor_id, function_name, CTaskType task_type, proctitle, @@ -606,15 +615,22 @@ cdef store_task_errors( # Generate the actor repr from the actor class. actor_repr = repr(actor) if actor else None + if actor_id is None or actor_id.is_nil(): + actor_id = None + else: + actor_id = actor_id.hex() + if isinstance(exc, RayTaskError): # Avoid recursive nesting of RayTaskError. 
failure_object = RayTaskError(function_name, backtrace, exc.cause, proctitle=proctitle, - actor_repr=actor_repr) + actor_repr=actor_repr, + actor_id=actor_id) else: failure_object = RayTaskError(function_name, backtrace, exc, proctitle=proctitle, - actor_repr=actor_repr) + actor_repr=actor_repr, + actor_id=actor_id) # Pass the failure object back to the CoreWorker. # We also cap the size of the error message to the last @@ -698,6 +714,7 @@ cdef execute_dynamic_generator_and_store_task_outputs( worker, error, False, # task_exception None, # actor + None, # actor id function_name, task_type, title, dynamic_returns, application_error) if num_errors_stored == 0: @@ -740,6 +757,7 @@ cdef void execute_task( worker = ray._private.worker.global_worker manager = worker.function_actor_manager actor = None + actor_id = None cdef: CoreWorker core_worker = worker.core_worker JobID job_id = core_worker.get_current_job_id() @@ -780,7 +798,8 @@ cdef void execute_task( print(task_name_magic_token, end="") print(task_name_magic_token, file=sys.stderr, end="") else: - actor = worker.actors[core_worker.get_actor_id()] + actor_id = core_worker.get_actor_id() + actor = worker.actors[actor_id] class_name = actor.__class__.__name__ next_title = f"ray::{class_name}" @@ -827,6 +846,7 @@ cdef void execute_task( return function(actor, *arguments, **kwarguments) with core_worker.profile_event(b"task::" + name, extra_data=extra_data): + task_exception = False try: with core_worker.profile_event(b"task:deserialize_arguments"): if c_args.empty(): @@ -866,20 +886,26 @@ cdef void execute_task( args, kwargs = ray._private.signature.recover_args(args) if (task_type == TASK_TYPE_ACTOR_CREATION_TASK): - actor = worker.actors[core_worker.get_actor_id()] + actor_id = core_worker.get_actor_id() + actor = worker.actors[actor_id] class_name = actor.__class__.__name__ actor_title = f"{class_name}({args!r}, {kwargs!r})" core_worker.set_actor_title(actor_title.encode("utf-8")) - worker.record_task_log_start() 
+ # Record the task id via magic token in the log file. + # This will be used to locate the beginning of logs from a task. + attempt_number = core_worker.get_current_task_attempt_number() + task_attempt_magic_token = "{}{}-{}\n".format( + ray_constants.LOG_PREFIX_TASK_ATTEMPT_START, task_id.hex(), + attempt_number) + # Print on both .out and .err + print(task_attempt_magic_token, end="") + print(task_attempt_magic_token, file=sys.stderr, end="") + # Execute the task. with core_worker.profile_event(b"task:execute"): task_exception = True try: - is_exiting = core_worker.is_exiting() - if is_exiting: - title = f"{title}::Exiting" - next_title = f"{next_title}::Exiting" with ray._private.worker._changeproctitle(title, next_title): if debugger_breakpoint != b"": ray.util.pdb.set_trace( @@ -926,9 +952,14 @@ cdef void execute_task( exc_info=True) raise e finally: - # Record the task logs end offsets regardless of - # task execution results. - worker.record_task_log_end() + # Record the end of task via magic token in the log file. + # This will be used to locate the end of logs from a task. 
+ task_attempt_magic_token = "{}{}-{}\n".format( + ray_constants.LOG_PREFIX_TASK_ATTEMPT_END, task_id.hex(), + attempt_number) + # Print on both .out and .err + print(task_attempt_magic_token, end="") + print(task_attempt_magic_token, file=sys.stderr, end="") if returns[0].size() == 1 and not inspect.isgenerator(outputs): # If there is only one return specified, we should return @@ -1005,7 +1036,7 @@ cdef void execute_task( returns) except Exception as e: num_errors_stored = store_task_errors( - worker, e, task_exception, actor, function_name, + worker, e, task_exception, actor, actor_id, function_name, task_type, title, returns, application_error) if returns[0].size() > 0 and num_errors_stored == 0: logger.exception( @@ -1139,7 +1170,9 @@ cdef execute_task_with_cancellation_handler( # Task cancellation can happen anytime so we don't really need # to differentiate between mid-task or not. False, # task_exception - actor, execution_info.function_name, + actor, + actor_id, + execution_info.function_name, task_type, title, returns, # application_error: we are passing NULL since we don't want the # cancel tasks to fail. @@ -1189,7 +1222,6 @@ cdef CRayStatus task_execution_handler( const c_vector[CConcurrencyGroup] &defined_concurrency_groups, const c_string name_of_concurrency_group_to_execute, c_bool is_reattempt) nogil: - with gil, disable_client_hook(): # Initialize job_config if it hasn't already. # Setup system paths configured in job_config. 
@@ -1727,6 +1759,66 @@ cdef class GcsClient: } return result +cdef class GcsPublisher: + """Cython wrapper class of C++ `ray::gcs::PythonGcsPublisher`.""" + cdef: + shared_ptr[CPythonGcsPublisher] inner + + def __cinit__(self, address): + self.inner.reset(new CPythonGcsPublisher(address)) + check_status(self.inner.get().Connect()) + + def publish_error(self, key_id: bytes, error_type: str, message: str, + job_id=None, num_retries=None): + cdef: + CErrorTableData error_info + int64_t c_num_retries = num_retries if num_retries else -1 + c_string c_key_id = key_id + + if job_id is None: + job_id = ray.JobID.nil() + assert isinstance(job_id, ray.JobID) + error_info.set_job_id(job_id.binary()) + error_info.set_type(error_type) + error_info.set_error_message(message) + error_info.set_timestamp(time.time()) + + with nogil: + check_status( + self.inner.get().PublishError(c_key_id, error_info, c_num_retries)) + + def publish_logs(self, log_json: dict): + cdef: + CLogBatch log_batch + c_string c_job_id + + job_id = log_json.get("job") + log_batch.set_ip(log_json.get("ip") if log_json.get("ip") else b"") + log_batch.set_pid( + str(log_json.get("pid")).encode() if log_json.get("pid") else b"") + log_batch.set_job_id(job_id.encode() if job_id else b"") + log_batch.set_is_error(bool(log_json.get("is_err"))) + for line in log_json.get("lines", []): + log_batch.add_lines(line) + actor_name = log_json.get("actor_name") + log_batch.set_actor_name(actor_name.encode() if actor_name else b"") + task_name = log_json.get("task_name") + log_batch.set_task_name(task_name.encode() if task_name else b"") + + c_job_id = job_id.encode() if job_id else b"" + with nogil: + check_status(self.inner.get().PublishLogs(c_job_id, log_batch)) + + def publish_function_key(self, key: bytes): + cdef: + CPythonFunction python_function + + python_function.set_key(key) + + with nogil: + check_status(self.inner.get().PublishFunctionKey(python_function)) + + cdef class CoreWorker: def __cinit__(self, 
worker_type, store_socket, raylet_socket, @@ -1794,15 +1886,16 @@ cdef class CoreWorker: self.cgname_to_eventloop_dict = None self.fd_to_cgname_dict = None self.eventloop_for_default_cg = None + self.current_runtime_env = None def shutdown(self): - with nogil: - # If it's a worker, the core worker process should have been - # shutdown. So we can't call - # `CCoreWorkerProcess.GetCoreWorker().GetWorkerType()` here. - # Instead, we use the cached `is_driver` flag to test if it's a - # driver. - if self.is_driver: + # If it's a worker, the core worker process should have been + # shutdown. So we can't call + # `CCoreWorkerProcess.GetCoreWorker().GetWorkerType()` here. + # Instead, we use the cached `is_driver` flag to test if it's a + # driver. + if self.is_driver: + with nogil: CCoreWorkerProcess.Shutdown() def notify_raylet(self): @@ -1813,6 +1906,28 @@ cdef class CoreWorker: with nogil: CCoreWorkerProcess.RunTaskExecutionLoop() + def exit_worker(self, exit_type: str, c_string detail): + """ + Exit the current worker process. This API should only be used by + a worker. If this API is called, the worker will finish currently + executing task, initiate the shutdown, and stop itself gracefully. + The given exit_type and detail will be reported to GCS, and any + worker failure error will contain them. 
+ """ + cdef: + CWorkerExitType c_exit_type + cdef const shared_ptr[LocalMemoryBuffer] null_ptr + + if exit_type == "user": + c_exit_type = WORKER_EXIT_TYPE_USER_ERROR + if exit_type == "system": + c_exit_type = WORKER_EXIT_TYPE_SYSTEM_ERROR + else: + raise ValueError(f"Invalid exit type: {exit_type}") + assert not self.is_driver + with nogil: + CCoreWorkerProcess.GetCoreWorker().Exit(c_exit_type, detail, null_ptr) + def get_current_task_retry_exceptions(self): return CCoreWorkerProcess.GetCoreWorker( ).GetCurrentTaskRetryExceptions() @@ -1821,6 +1936,9 @@ cdef class CoreWorker: return TaskID( CCoreWorkerProcess.GetCoreWorker().GetCurrentTaskId().Binary()) + def get_current_task_attempt_number(self): + return CCoreWorkerProcess.GetCoreWorker().GetCurrentTaskAttemptNumber() + def get_task_depth(self): return CCoreWorkerProcess.GetCoreWorker().GetTaskDepth() @@ -2406,7 +2524,7 @@ cdef class CoreWorker: unordered_map[c_string, double] c_resources CRayFunction ray_function c_vector[unique_ptr[CTaskArg]] args_vector - optional[c_vector[CObjectReference]] return_refs + c_vector[CObjectReference] return_refs c_vector[CObjectID] incremented_put_arg_ids with self.profile_event(b"submit_task"): @@ -2419,12 +2537,13 @@ cdef class CoreWorker: &incremented_put_arg_ids) with nogil: - return_refs = CCoreWorkerProcess.GetCoreWorker().SubmitActorTask( + status = CCoreWorkerProcess.GetCoreWorker().SubmitActorTask( c_actor_id, ray_function, args_vector, CTaskOptions( - name, num_returns, c_resources, concurrency_group_name)) + name, num_returns, c_resources, concurrency_group_name), + return_refs) # These arguments were serialized and put into the local object # store during task submission. 
The backend increments their local # ref count initially to ensure that they remain in scope until we @@ -2434,28 +2553,25 @@ cdef class CoreWorker: CCoreWorkerProcess.GetCoreWorker().RemoveLocalReference( put_arg_id) - if return_refs.has_value(): + if status.ok(): # The initial local reference is already acquired internally # when adding the pending task. - return VectorToObjectRefs(return_refs.value(), + return VectorToObjectRefs(return_refs, skip_adding_local_ref=True) else: - actor = self.get_actor_handle(actor_id) - actor_handle = (CCoreWorkerProcess.GetCoreWorker() - .GetActorHandle(c_actor_id)) - raise PendingCallsLimitExceeded("The task {} could not be " - "submitted to {} because more " - "than {} tasks are queued on " - "the actor. This limit " - "can be adjusted with the " - "`max_pending_calls` actor " - "option.".format( - function_descriptor - .function_name, - repr(actor), - (dereference(actor_handle) - .MaxPendingCalls()) - )) + if status.IsOutOfResource(): + actor = self.get_actor_handle(actor_id) + actor_handle = (CCoreWorkerProcess.GetCoreWorker() + .GetActorHandle(c_actor_id)) + raise PendingCallsLimitExceeded( + f"The task {function_descriptor.function_name} could not be " + f"submitted to {repr(actor)} because more than" + f" {(dereference(actor_handle).MaxPendingCalls())}" + " tasks are queued on the actor. 
This limit can be adjusted" + " with the `max_pending_calls` actor option.") + else: + raise Exception(f"Failed to submit task to actor {actor_id} " + f"due to {status.message()}") def kill_actor(self, ActorID actor_id, c_bool no_restart): cdef: @@ -2932,9 +3048,6 @@ cdef class CoreWorker: return self.current_runtime_env - def is_exiting(self): - return CCoreWorkerProcess.GetCoreWorker().IsExiting() - cdef yield_current_fiber(self, CFiberEvent &fiber_event): with nogil: CCoreWorkerProcess.GetCoreWorker().YieldCurrentFiber(fiber_event) diff --git a/python/ray/actor.py b/python/ray/actor.py index 6bf93281fc83..7191031e059b 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -9,6 +9,7 @@ import ray._raylet from ray import ActorClassID, Language, cross_language from ray._private import ray_option_utils +from ray._private.auto_init_hook import auto_init_ray from ray._private.client_mode_hook import ( client_mode_convert_actor, client_mode_hook, @@ -42,7 +43,7 @@ @PublicAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def method(*args, **kwargs): """Annotate an actor method. @@ -763,7 +764,8 @@ def _remote(self, args=None, kwargs=None, **actor_options): if actor_options.get("max_concurrency") is None: actor_options["max_concurrency"] = 1000 if is_asyncio else 1 - if client_mode_should_convert(auto_init=True): + auto_init_ray() + if client_mode_should_convert(): return client_mode_convert_actor(self, args, kwargs, **actor_options) # fill actor required options @@ -1368,21 +1370,19 @@ def _make_actor(cls, actor_options): def exit_actor(): """Intentionally exit the current actor. - This function is used to disconnect an actor and exit the worker. - Any ``atexit`` handlers installed in the actor will be run. + This API can be used only inside an actor. Use ray.kill + API if you'd like to kill an actor using actor handle. + + When the API is called, the actor raises an exception and exits. + Any queued methods will fail. 
Any ``atexit`` + handlers installed in the actor will be run. Raises: - Exception: An exception is raised if this is a driver or this + TypeError: An exception is raised if this is a driver or this worker is not an actor. """ worker = ray._private.worker.global_worker if worker.mode == ray.WORKER_MODE and not worker.actor_id.is_nil(): - # Intentionally disconnect the core worker from the raylet so the - # raylet won't push an error message to the driver. - ray._private.worker.disconnect() - # Disconnect global state from GCS. - ray._private.state.state.disconnect() - # In asyncio actor mode, we can't raise SystemExit because it will just # quit the asycnio event loop thread, not the main thread. Instead, we # raise a custom error to the main thread to tell it to exit. @@ -1397,4 +1397,8 @@ def exit_actor(): raise exit assert False, "This process should have terminated." else: - raise TypeError("exit_actor called on a non-actor worker.") + raise TypeError( + "exit_actor API is called on a non-actor worker, " + f"{worker.mode}. Call this API inside an actor method " + "if you'd like to exit the actor gracefully."
+ ) diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD index cc2886d267de..fc0a1a9def1d 100644 --- a/python/ray/air/BUILD +++ b/python/ray/air/BUILD @@ -34,6 +34,14 @@ py_test( deps = [":ml_lib"] ) +py_test( + name = "test_air_usage", + size = "small", + srcs = ["tests/test_air_usage.py"], + tags = ["team:ml", "exclusive"], + deps = [":ml_lib"] +) + py_test( name = "test_checkpoints", size = "small", @@ -76,7 +84,7 @@ py_test( py_test( name = "test_experiment_restore", - size = "medium", + size = "large", srcs = [ "tests/test_experiment_restore.py", "tests/_test_experiment_restore_run.py" @@ -173,6 +181,13 @@ py_test( deps = [":ml_lib"] ) +py_test( + name = "test_util_torch_dist", + size = "small", + srcs = ["tests/test_util_torch_dist.py"], + tags = ["team:ml", "gpu", "exclusive"], + deps = [":ml_lib"] +) # -------------------------------------------------------------------- # Tests from the python/ray/air/tests/execution directory. diff --git a/python/ray/air/_internal/checkpoint_manager.py b/python/ray/air/_internal/checkpoint_manager.py index a1c89b02596c..ee1bdd973f1f 100644 --- a/python/ray/air/_internal/checkpoint_manager.py +++ b/python/ray/air/_internal/checkpoint_manager.py @@ -55,11 +55,7 @@ class _TrackedCheckpoint: into `"evaluation/episode_reward_mean"`. node_ip: IP of the node where the checkpoint was generated. Defaults to the current node. - local_dir_to_remote_uri_fn: Function that takes in this checkpoint's local - directory path and returns the corresponding remote URI in the cloud. - This should only be specified if the data was synced to cloud. - Only applied during conversion to AIR checkpoint and only - if ``dir_or_data`` is or resolves to a directory path. + rank: Rank of the node where the checkpoint was generated. Defaults to 0. 
""" def __init__( @@ -69,16 +65,14 @@ def __init__( checkpoint_id: Optional[int] = None, metrics: Optional[Dict] = None, node_ip: Optional[str] = None, - local_to_remote_path_fn: Optional[Callable[[str], str]] = None, + rank: Optional[int] = 0, ): from ray.tune.result import NODE_IP self.dir_or_data = dir_or_data self.id = checkpoint_id self.storage_mode = storage_mode - # This is a function because dir_or_data may be an object ref - # and we need to wait until its resolved first. - self.local_to_remote_path_fn = local_to_remote_path_fn + self.rank = rank self.metrics = flatten_dict(metrics) if metrics else {} self.node_ip = node_ip or self.metrics.get(NODE_IP, None) @@ -142,7 +136,31 @@ def delete( except Exception as e: logger.warning(f"Checkpoint deletion failed: {e}") - def to_air_checkpoint(self) -> Optional[Checkpoint]: + def to_air_checkpoint( + self, local_to_remote_path_fn: Optional[Callable[[str], str]] = None + ) -> Optional[Checkpoint]: + """Converter from a `_TrackedCheckpoint` to a `ray.air.Checkpoint`. + + This method Resolves the checkpoint data if it is an object reference. + + This method handles multiple types of checkpoint data: + - If the data is a string (local checkpoint path), this returns a + directory-backed checkpoint. + - If a `local_to_remote_path_fn` is provided, this converts + local path to a remote URI, then returns a URI-backed checkpoint. + - If the data is bytes or a dictionary, it returns an in-memory + bytes/dict-backed checkpoint. + + Args: + local_to_remote_path_fn: Function that takes in this checkpoint's local + directory path and returns the corresponding remote URI in the cloud. + This should only be specified if the data was synced to cloud. + Only applied during conversion to AIR checkpoint and only + if ``dir_or_data`` is or resolves to a directory path. + + Returns: + Checkpoint: The AIR checkpoint backed by the resolved data. 
+ """ from ray.tune.trainable.util import TrainableUtil checkpoint_data = self.dir_or_data @@ -158,9 +176,9 @@ def to_air_checkpoint(self) -> Optional[Checkpoint]: if isinstance(checkpoint_data, str): # Prefer cloud checkpoints - if self.local_to_remote_path_fn: + if local_to_remote_path_fn: checkpoint = Checkpoint.from_uri( - self.local_to_remote_path_fn(checkpoint_data) + local_to_remote_path_fn(checkpoint_data) ) else: try: @@ -281,7 +299,7 @@ def __init__( # always available). self._checkpoints_to_clean_up = set() - self._delete_fn = delete_fn + self.set_delete_fn(delete_fn) def set_delete_fn( self, delete_fn: Optional[Callable[["_TrackedCheckpoint"], None]] @@ -294,7 +312,10 @@ def set_delete_fn( """ self._delete_fn = delete_fn - def register_checkpoint(self, checkpoint: _TrackedCheckpoint): + def register_checkpoints( + self, + checkpoints: Union[_TrackedCheckpoint, List[_TrackedCheckpoint]], + ): """Register new checkpoint and add to bookkeeping. This method will register a new checkpoint and add it to the internal @@ -303,23 +324,27 @@ def register_checkpoint(self, checkpoint: _TrackedCheckpoint): checkpoints should be deleted. Args: - checkpoint: Tracked checkpoint object to add to bookkeeping. + checkpoints: Tracked checkpoint object to add to bookkeeping. 
""" - checkpoint.id = checkpoint.id or self._latest_checkpoint_id + if not isinstance(checkpoints, list): + checkpoints = [checkpoints] + + for checkpoint in checkpoints: + checkpoint.id = checkpoint.id or self._latest_checkpoint_id - if checkpoint.storage_mode == CheckpointStorage.MEMORY: - self._replace_latest_memory_checkpoint(checkpoint) + if checkpoint.storage_mode == CheckpointStorage.MEMORY: + self._replace_latest_memory_checkpoint(checkpoint) - if self._persist_memory_checkpoints: - persisted_checkpoint = copy.copy(checkpoint) - persisted_checkpoint.storage_mode = CheckpointStorage.PERSISTENT + if self._persist_memory_checkpoints: + persisted_checkpoint = copy.copy(checkpoint) + persisted_checkpoint.storage_mode = CheckpointStorage.PERSISTENT + else: + persisted_checkpoint = None else: - persisted_checkpoint = None - else: - persisted_checkpoint = checkpoint + persisted_checkpoint = checkpoint - if persisted_checkpoint and self._checkpoint_strategy.num_to_keep != 0: - self._process_persistent_checkpoint(persisted_checkpoint) + if persisted_checkpoint and self._checkpoint_strategy.num_to_keep != 0: + self._process_persistent_checkpoint(persisted_checkpoint) self._latest_checkpoint_id += 1 @@ -390,8 +415,20 @@ def _get_checkpoint_score( checkpoint.id, ) - def _process_persistent_checkpoint(self, checkpoint: _TrackedCheckpoint): + def _process_persistent_checkpoint( + self, + checkpoint: _TrackedCheckpoint, + next_checkpoint_path: Optional[str] = None, + ): + # Note(jungong) : Track rank0 checkpoint as the best / worst checkpoint. + # That is because we only care about the data for checkpoints + # from non-rank0 workers. They do not represent a different Trial + # checkpoint as the rank0 one. 
+ if checkpoint.rank > 0: + return + assert checkpoint.storage_mode == CheckpointStorage.PERSISTENT + next_checkpoint_path = next_checkpoint_path or self._get_next_checkpoint_path() checkpoint_score = self._get_checkpoint_score(checkpoint) wrapped_checkpoint = _HeapCheckpointWrapper( @@ -399,20 +436,19 @@ def _process_persistent_checkpoint(self, checkpoint: _TrackedCheckpoint): ) if self._checkpoint_strategy.num_to_keep is None: - # Keep all checkpoints - checkpoint.commit(path=self._get_next_checkpoint_path()) + checkpoint.commit(path=next_checkpoint_path) self._replace_latest_persisted_checkpoint(checkpoint) self._top_persisted_checkpoints.append(wrapped_checkpoint) elif ( len(self._top_persisted_checkpoints) < self._checkpoint_strategy.num_to_keep ): + checkpoint.commit(path=next_checkpoint_path) # Heap is not full yet, so keep this checkpoint - checkpoint.commit(path=self._get_next_checkpoint_path()) heapq.heappush(self._top_persisted_checkpoints, wrapped_checkpoint) self._replace_latest_persisted_checkpoint(checkpoint) elif wrapped_checkpoint.priority >= self._top_persisted_checkpoints[0].priority: + checkpoint.commit(path=next_checkpoint_path) # Priority is higher than current worst checkpoint, so replace worst - checkpoint.commit(path=self._get_next_checkpoint_path()) worst_checkpoint = heapq.heappushpop( self._top_persisted_checkpoints, wrapped_checkpoint ).tracked_checkpoint diff --git a/python/ray/air/_internal/mlflow.py b/python/ray/air/_internal/mlflow.py index d0fd1168dd60..a7b553100933 100644 --- a/python/ray/air/_internal/mlflow.py +++ b/python/ray/air/_internal/mlflow.py @@ -4,6 +4,8 @@ from copy import deepcopy from typing import TYPE_CHECKING, Dict, Optional +from ray._private.dict import flatten_dict + if TYPE_CHECKING: from mlflow.entities import Run from mlflow.tracking import MlflowClient @@ -262,6 +264,7 @@ def log_params(self, params_to_log: Dict, run_id: Optional[str] = None): params_to_log: Dictionary of parameters to log. 
run_id (Optional[str]): The ID of the run to log to. """ + params_to_log = flatten_dict(params_to_log) if run_id and self._run_exists(run_id): client = self._get_client() @@ -284,6 +287,7 @@ def log_metrics(self, step, metrics_to_log: Dict, run_id: Optional[str] = None): metrics_to_log: Dictionary of metrics to log. run_id (Optional[str]): The ID of the run to log to. """ + metrics_to_log = flatten_dict(metrics_to_log) metrics_to_log = self._parse_dict(metrics_to_log) if run_id and self._run_exists(run_id): diff --git a/python/ray/air/_internal/remote_storage.py b/python/ray/air/_internal/remote_storage.py index 5b14c919cc44..13aaf91ab919 100644 --- a/python/ray/air/_internal/remote_storage.py +++ b/python/ray/air/_internal/remote_storage.py @@ -5,41 +5,76 @@ import urllib.parse from pathlib import Path from pkg_resources import packaging +import psutil import shutil -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from ray.air._internal.filelock import TempFileLock try: import fsspec + from fsspec.implementations.local import LocalFileSystem except ImportError: fsspec = None + LocalFileSystem = object try: import pyarrow import pyarrow.fs - # TODO(krfricke): Remove this once gcsfs > 2022.3.0 is released - # (and make sure to pin) - class _CustomGCSHandler(pyarrow.fs.FSSpecHandler): - """Custom FSSpecHandler that avoids a bug in gcsfs <= 2022.3.0.""" - - def create_dir(self, path, recursive): - try: - # GCSFS doesn't expose `create_parents` argument, - # so it is omitted here - self.fs.mkdir(path) - except FileExistsError: - pass - except (ImportError, ModuleNotFoundError): pyarrow = None - _CustomGCSHandler = None from ray import logger +class _ExcludingLocalFilesystem(LocalFileSystem): + """LocalFileSystem wrapper to exclude files according to patterns. + + Args: + exclude: List of patterns that are applied to files returned by + ``self.find()``. If a file path matches this pattern, it will + be excluded. 
+ + """ + + def __init__(self, exclude: List[str], **kwargs): + super().__init__(**kwargs) + self._exclude = exclude + + @property + def fsid(self): + return "_excluding_local" + + def _should_exclude(self, name: str) -> bool: + """Return True if `name` matches any of the `self._exclude` patterns.""" + alt = None + if os.path.isdir(name): + # If this is a directory, also test it with trailing slash + alt = os.path.join(name, "") + for excl in self._exclude: + if fnmatch.fnmatch(name, excl): + return True + if alt and fnmatch.fnmatch(alt, excl): + return True + return False + + def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs): + """Call parent find() and exclude from result.""" + names = super().find( + path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs + ) + if detail: + return { + name: out + for name, out in names.items() + if not self._should_exclude(name) + } + else: + return [name for name in names if not self._should_exclude(name)] + + def _pyarrow_fs_copy_files( source, destination, source_filesystem=None, destination_filesystem=None, **kwargs ): @@ -102,38 +137,181 @@ def is_non_local_path_uri(uri: str) -> bool: _cached_fs = {} -def _is_local_path(uri: str) -> bool: - """Check if the path points to the local filesystem.""" - if len(uri) >= 1 and uri[0] == "/": - return True +def _get_network_mounts() -> List[str]: + """Get mounted network filesystems on the current node. + + Network file system (NFS), server message block (SMB) and + common internet file system (CIFS) are all file access storage protocols, + used to access files on remote servers and storage servers (such as NAS storage) + as if they were local files. 
+ """ + partitions = psutil.disk_partitions(all=True) + network_fstypes = ("nfs", "smbfs", "cifs") + return [p.mountpoint for p in partitions if p.fstype in network_fstypes] + +def _is_network_mount(path: str) -> bool: + """Checks if a path is within a mounted network filesystem.""" + resolved_path = Path(path).expanduser().resolve() + network_mounts = {Path(mount) for mount in _get_network_mounts()} + + # Check if any of the network mounts are one of the path's parents. + return bool(set(resolved_path.parents).intersection(network_mounts)) + + +def is_local_path(path: str) -> bool: + """Check if a given path is a local path or a remote URI.""" if sys.platform == "win32": - return _is_local_windows_path(uri) - return False + return _is_local_windows_path(path) + + scheme = urllib.parse.urlparse(path).scheme + return scheme in ("", "file") -def _is_local_windows_path(uri: str) -> bool: +def _is_local_windows_path(path: str) -> bool: """Determines if path is a Windows file-system location.""" - if len(uri) >= 1 and uri[0] == "\\": + if len(path) >= 1 and path[0] == "\\": return True if ( - len(uri) >= 3 - and uri[1] == ":" - and (uri[2] == "/" or uri[2] == "\\") - and uri[0].isalpha() + len(path) >= 3 + and path[1] == ":" + and (path[2] == "/" or path[2] == "\\") + and path[0].isalpha() ): return True return False +def _translate_s3_options(options: Dict[str, List[str]]) -> Dict[str, Any]: + """Translate pyarrow s3 query options into s3fs ``storage_kwargs``. + + ``storage_kwargs`` are passed to ``s3fs.S3Filesystem``. They accept + ``client_kwargs``, which are passed to ``botocore.session.Session.Client``. + + In this function, we translate query string parameters from an s3 URI + (e.g. ``s3://bucket/folder?endpoint_override=somewhere``) into the respective + query parameters for the botocore clent. 
+ + S3Filesystem API ref: https://s3fs.readthedocs.io/en/latest/api.html + + Botocore Client API ref: https://boto3.amazonaws.com/v1/documentation/api/latest/ + reference/core/session.html#boto3.session.Session.client + + """ + # Map from s3 query keys --> botocore client arguments + option_map = { + "endpoint_override": "endpoint_url", + "region": "region_name", + "access_key": "aws_access_key_id", + "secret_key": "aws_secret_access_key", + } + + client_kwargs = {} + for opt, target in option_map.items(): + if opt in options: + client_kwargs[target] = options[opt][0] + + # s3fs directory cache does not work correctly, so we pass + # `use_listings_cache` to disable it. See https://github.com/fsspec/s3fs/issues/657 + # We should keep this for s3fs versions <= 2023.4.0. + return {"client_kwargs": client_kwargs, "use_listings_cache": False} + + +def _translate_gcs_options(options: Dict[str, List[str]]) -> Dict[str, Any]: + """Translate pyarrow gcs query options into gcsfs ``storage_kwargs``. + + ``storage_kwargs`` are passed to ``gcsfs.GCSFileSystem``. + + In this function, we translate query string parameters from a gcs URI + (e.g. ``gs://bucket/folder?endpoint_override=somewhere``) into the respective + arguments for the gcs filesystem. + + GCSFileSystem API ref: https://gcsfs.readthedocs.io/en/latest/api.html + + """ + # Map from gcs query keys --> gcsfs kwarg names + option_map = { + "endpoint_override": "endpoint_url", + } + + storage_kwargs = {} + for opt, target in option_map.items(): + if opt in options: + storage_kwargs[target] = options[opt][0] + + return storage_kwargs + + +def _has_compatible_gcsfs_version() -> bool: + """GCSFS does not work for versions > 2022.7.1 and < 2022.10.0. + + See https://github.com/fsspec/gcsfs/issues/498. + + In that case, and if we can't fallback to native PyArrow's GCS handler, + we raise an error.
+ """ + try: + import gcsfs + + # For minimal install that only needs python3-setuptools + if packaging.version.parse(gcsfs.__version__) > packaging.version.parse( + "2022.7.1" + ) and packaging.version.parse(gcsfs.__version__) < packaging.version.parse( + "2022.10.0" + ): + # PyArrow's GcsFileSystem was introduced in 9.0.0. + if packaging.version.parse(pyarrow.__version__) < packaging.version.parse( + "9.0.0" + ): + raise RuntimeError( + "`gcsfs` versions between '2022.7.1' and '2022.10.0' are not " + f"compatible with pyarrow. You have gcsfs version " + f"{gcsfs.__version__}. Please downgrade or upgrade your gcsfs " + f"version or upgrade PyArrow. See more details in " + f"https://github.com/fsspec/gcsfs/issues/498." + ) + # Returning False here means we fall back to pyarrow. + return False + except ImportError: + return False + return True + + +def _get_fsspec_fs_and_path(uri: str) -> Optional["pyarrow.fs.FileSystem"]: + parsed = urllib.parse.urlparse(uri) + + storage_kwargs = {} + if parsed.scheme in ["s3", "s3a"] and parsed.query: + storage_kwargs = _translate_s3_options(urllib.parse.parse_qs(parsed.query)) + elif parsed.scheme in ["gs", "gcs"] and parsed.query: + if not _has_compatible_gcsfs_version(): + # If gcsfs is incompatible, fallback to pyarrow.fs. + return None + storage_kwargs = _translate_gcs_options(urllib.parse.parse_qs(parsed.query)) + + try: + fsspec_fs = fsspec.filesystem(parsed.scheme, **storage_kwargs) + except Exception: + # ValueError when protocol is not known. + # ImportError when protocol is known but package not installed. + # Other errors can be raised if args/kwargs are incompatible. + # Thus we should except broadly here. 
+ return None + + fsspec_handler = pyarrow.fs.FSSpecHandler + fs = pyarrow.fs.PyFileSystem(fsspec_handler(fsspec_fs)) + return fs + + def get_fs_and_path( uri: str, ) -> Tuple[Optional["pyarrow.fs.FileSystem"], Optional[str]]: if not pyarrow: return None, None - if _is_local_path(uri): - # Append protocol such that the downstream operations work + scheme = urllib.parse.urlparse(uri).scheme + if is_local_path(uri) and not scheme: + # Append local filesys scheme such that the downstream operations work # properly on Linux and Windows. uri = "file://" + pathlib.Path(uri).as_posix() @@ -157,68 +335,33 @@ def get_fs_and_path( fs = _cached_fs[cache_key] return fs, path - # In case of hdfs filesystem, if uri does not have the netloc part below will - # fail with hdfs access error. For example 'hdfs:///user_folder/...' will - # fail, while only 'hdfs://namenode_server/user_foler/...' will work - # we consider the two cases of uri: short_hdfs_uri or other_uri, - # other_uri includes long hdfs uri and other filesystem uri, like s3 or gcp - # filesystem. Two cases of imported module of fsspec: yes or no. So we need - # to handle 4 cases: - # (uri, fsspec) - # (short_hdfs_uri, yes) --> use fsspec - # (short_hdfs_uri, no) --> return None and avoid init pyarrow - # (other_uri, yes) --> try pyarrow, if throw use fsspec - # (other_uri, no) --> try pyarrow, if throw return None - short_hdfs_uri = parsed.scheme == "hdfs" and parsed.netloc == "" - try: - if short_hdfs_uri and not fsspec: - return None, None - if not short_hdfs_uri: - fs, path = pyarrow.fs.FileSystem.from_uri(uri) + # Prefer fsspec over native pyarrow. 
+ if fsspec: + fs = _get_fsspec_fs_and_path(uri) + if fs: _cached_fs[cache_key] = fs return fs, path - except (pyarrow.lib.ArrowInvalid, pyarrow.lib.ArrowNotImplementedError): - # Raised when URI not recognized - if not fsspec: - # Only return if fsspec is not installed - return None, None - - # Else, try to resolve protocol via fsspec - try: - fsspec_fs = fsspec.filesystem(parsed.scheme) - except ValueError: - # Raised when protocol not known - return None, None - - fsspec_handler = pyarrow.fs.FSSpecHandler - if parsed.scheme in ["gs", "gcs"]: - - # TODO(amogkam): Remove after https://github.com/fsspec/gcsfs/issues/498 is - # resolved. - try: - import gcsfs - # For minimal install that only needs python3-setuptools - if packaging.version.parse(gcsfs.__version__) > packaging.version.parse( - "2022.7.1" - ): - raise RuntimeError( - "`gcsfs` versions greater than '2022.7.1' are not " - f"compatible with pyarrow. You have gcsfs version " - f"{gcsfs.__version__}. Please downgrade your gcsfs " - f"version. See more details in " - f"https://github.com/fsspec/gcsfs/issues/498." - ) - except ImportError: - pass + # In case of hdfs filesystem, if uri does not have the netloc part below, it will + # fail with hdfs access error. For example 'hdfs:///user_folder/...' will + # fail, while only 'hdfs://namenode_server/user_foler/...' will work. + # Thus, if fsspec didn't return a filesystem, we return None. + hdfs_uri = parsed.scheme == "hdfs" + short_hdfs_uri = hdfs_uri and parsed.netloc == "" - # GS doesn't support `create_parents` arg in `create_dir()` - fsspec_handler = _CustomGCSHandler + if short_hdfs_uri: + return None, None - fs = pyarrow.fs.PyFileSystem(fsspec_handler(fsspec_fs)) - _cached_fs[cache_key] = fs + # If no fsspec filesystem was found, use pyarrow native filesystem. 
+ try: + fs, path = pyarrow.fs.FileSystem.from_uri(uri) + _cached_fs[cache_key] = fs + return fs, path + except (pyarrow.lib.ArrowInvalid, pyarrow.lib.ArrowNotImplementedError): + # Raised when URI not recognized + pass - return fs, path + return None, None def delete_at_uri(uri: str): @@ -284,10 +427,13 @@ def download_from_uri(uri: str, local_path: str, filelock: bool = True): f"Hint: {fs_hint(uri)}" ) - _local_path = Path(local_path) + _local_path = Path(local_path).resolve() exists_before = _local_path.exists() if is_directory(uri): _local_path.mkdir(parents=True, exist_ok=True) + else: + _local_path.parent.mkdir(parents=True, exist_ok=True) + try: if filelock: with TempFileLock(f"{os.path.normpath(local_path)}.lock"): @@ -331,14 +477,33 @@ def upload_to_uri( if not exclude: _ensure_directory(bucket_path, fs=fs) _pyarrow_fs_copy_files(local_path, bucket_path, destination_filesystem=fs) + elif fsspec: + # If fsspec is available, prefer it because it's more efficient than + # calling pyarrow.fs.copy_files multiple times + _upload_to_uri_with_exclude_fsspec( + local_path=local_path, fs=fs, bucket_path=bucket_path, exclude=exclude + ) else: # Walk the filetree and upload - _upload_to_uri_with_exclude( + _upload_to_uri_with_exclude_pyarrow( local_path=local_path, fs=fs, bucket_path=bucket_path, exclude=exclude ) -def _upload_to_uri_with_exclude( +def _upload_to_uri_with_exclude_fsspec( + local_path: str, fs: "pyarrow.fs", bucket_path: str, exclude: Optional[List[str]] +) -> None: + local_fs = _ExcludingLocalFilesystem(exclude=exclude) + handler = pyarrow.fs.FSSpecHandler(local_fs) + source_fs = pyarrow.fs.PyFileSystem(handler) + + _ensure_directory(bucket_path, fs=fs) + _pyarrow_fs_copy_files( + local_path, bucket_path, source_filesystem=source_fs, destination_filesystem=fs + ) + + +def _upload_to_uri_with_exclude_pyarrow( local_path: str, fs: "pyarrow.fs", bucket_path: str, exclude: Optional[List[str]] ) -> None: def _should_exclude(candidate: str) -> bool: diff 
--git a/python/ray/air/_internal/uri_utils.py b/python/ray/air/_internal/uri_utils.py index c6222198b137..fd836e794ad4 100644 --- a/python/ray/air/_internal/uri_utils.py +++ b/python/ray/air/_internal/uri_utils.py @@ -42,6 +42,14 @@ def parent(self) -> "URI": assert self._path.parent != ".", f"{str(self)} has no valid parent URI" return URI(self._get_str_representation(self._parsed, self._path.parent)) + @property + def scheme(self) -> str: + return self._parsed.scheme + + @property + def path(self) -> str: + return str(self._path) + def __truediv__(self, path_to_append): assert isinstance(path_to_append, str) return URI( @@ -59,3 +67,23 @@ def __repr__(self): def __str__(self): return self._get_str_representation(self._parsed, self._path) + + +def _join_path_or_uri(base_path: str, path_to_join: str) -> str: + """Joins paths to form either a URI (w/ possible URL params) or a local path. + + Example: + + >>> local_path = "/a/b" + >>> uri = "s3://bucket/a?scheme=http" + >>> path_to_join = "c/d" + >>> _join_path_or_uri(local_path, path_to_join) + '/a/b/c/d' + >>> _join_path_or_uri(uri, path_to_join) + 's3://bucket/a/c/d?scheme=http' + + """ + from ray.air._internal.remote_storage import is_local_path + + base_path_or_uri = Path(base_path) if is_local_path(base_path) else URI(base_path) + return str(base_path_or_uri / path_to_join) diff --git a/python/ray/air/_internal/usage.py b/python/ray/air/_internal/usage.py index 8d6068260eb9..96efac9ff3ef 100644 --- a/python/ray/air/_internal/usage.py +++ b/python/ray/air/_internal/usage.py @@ -1,15 +1,24 @@ -from typing import TYPE_CHECKING, Set, Union +import collections +import json +import os +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union +import urllib.parse +from ray.air._internal.remote_storage import _is_network_mount from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag if TYPE_CHECKING: from ray.train.trainer import BaseTrainer from ray.tune.schedulers import TrialScheduler from 
ray.tune.search import BasicVariantGenerator, Searcher + from ray.tune import Callback + from ray.tune import SyncConfig + AIR_TRAINERS = { + "AccelerateTrainer", "HorovodTrainer", - "HuggingFaceTrainer", + "TransformersTrainer", "LightGBMTrainer", "LightningTrainer", "MosaicTrainer", @@ -114,3 +123,160 @@ def tag_scheduler(scheduler: "TrialScheduler"): assert isinstance(scheduler, TrialScheduler) scheduler_name = _find_class_name(scheduler, "ray.tune.schedulers", TUNE_SCHEDULERS) record_extra_usage_tag(TagKey.TUNE_SCHEDULER, scheduler_name) + + +def tag_setup_wandb(): + record_extra_usage_tag(TagKey.AIR_SETUP_WANDB_INTEGRATION_USED, "1") + + +def tag_setup_mlflow(): + record_extra_usage_tag(TagKey.AIR_SETUP_MLFLOW_INTEGRATION_USED, "1") + + +def _count_callbacks(callbacks: Optional[List["Callback"]]) -> Dict[str, int]: + """Creates a map of callback class name -> count given a list of callbacks.""" + from ray.tune import Callback + from ray.tune.logger import LoggerCallback + from ray.tune.utils.callback import DEFAULT_CALLBACK_CLASSES + + from ray.air.integrations.wandb import WandbLoggerCallback + from ray.air.integrations.mlflow import MLflowLoggerCallback + from ray.air.integrations.comet import CometLoggerCallback + from ray.tune.logger.aim import AimLoggerCallback + + built_in_callbacks = ( + WandbLoggerCallback, + MLflowLoggerCallback, + CometLoggerCallback, + AimLoggerCallback, + ) + DEFAULT_CALLBACK_CLASSES + + callback_names = [callback_cls.__name__ for callback_cls in built_in_callbacks] + callback_counts = collections.defaultdict(int) + + callbacks = callbacks or [] + for callback in callbacks: + if not isinstance(callback, Callback): + # This will error later, but don't include this as custom usage. 
+ continue + + callback_name = callback.__class__.__name__ + + if callback_name in callback_names: + callback_counts[callback_name] += 1 + elif isinstance(callback, LoggerCallback): + callback_counts["CustomLoggerCallback"] += 1 + else: + callback_counts["CustomCallback"] += 1 + + return callback_counts + + +def tag_callbacks(callbacks: Optional[List["Callback"]]) -> bool: + """Records built-in callback usage via a JSON str representing a + dictionary mapping callback class name -> counts. + + User-defined callbacks will increment the count under the `CustomLoggerCallback` + or `CustomCallback` key depending on which of the provided interfaces they subclass. + NOTE: This will NOT track the name of the user-defined callback, + nor its implementation. + + This will NOT report telemetry if no callbacks are provided by the user. + + Returns: + bool: True if usage was recorded, False otherwise. + """ + if not callbacks: + # User didn't pass in any callbacks -> no usage recorded. + return False + + callback_counts = _count_callbacks(callbacks) + + if callback_counts: + callback_counts_str = json.dumps(callback_counts) + record_extra_usage_tag(TagKey.AIR_CALLBACKS, callback_counts_str) + + +def _get_tag_for_remote_path(remote_path: str) -> str: + scheme = urllib.parse.urlparse(remote_path).scheme + if scheme == "file": + # NOTE: We treat a file:// storage_path as a "remote" path, so this case + # differs from the local path only case. + # In particular, default syncing to head node is not enabled here. + tag = "local_uri" + elif scheme == "memory": + # NOTE: This is used in tests and does not make sense to actually use. + # This condition filters the tag out of the `custom` catch-all. 
+ tag = "memory" + elif scheme == "hdfs": + tag = "hdfs" + elif scheme in {"s3", "s3a"}: + tag = "s3" + elif scheme in {"gs", "gcs"}: + tag = "gs" + else: + tag = "custom_remote_storage" + return tag + + +def tag_ray_air_storage_config( + local_path: str, remote_path: Optional[str], sync_config: "SyncConfig" +) -> None: + """Records the storage storage configuration of an experiment. + + The storage configuration is set by `RunConfig(storage_path, sync_config)`. + + The possible configurations are: + - 'driver' = Default syncing to Tune driver node if no remote path is specified. + - 'local' = No synchronization at all. + - 'nfs' = Using a mounted shared network filesystem. + - ('s3', 'gs', 'hdfs', 'custom_remote_storage'): Various remote storage schemes. + - ('local_uri', 'memory'): Mostly used by internal testing by setting `storage_path` + to `file://` or `memory://`. + """ + if remote_path: + # HDFS or cloud storage + storage_config_tag = _get_tag_for_remote_path(remote_path) + elif _is_network_mount(local_path): + # NFS + storage_config_tag = "nfs" + elif sync_config.syncer is None: + # Syncing is disabled - results are only available on node-local storage + storage_config_tag = "local" + else: + # The driver node's local storage is the synchronization point. + storage_config_tag = "driver" + + record_extra_usage_tag(TagKey.AIR_STORAGE_CONFIGURATION, storage_config_tag) + + +def tag_ray_air_env_vars() -> bool: + """Records usage of environment variables exposed by the Ray AIR libraries. + + NOTE: This does not track the values of the environment variables, nor + does this track environment variables not explicitly included in the + `all_ray_air_env_vars` allow-list. + + Returns: + bool: True if at least one environment var is supplied by the user. 
+ """ + from ray.air.constants import AIR_ENV_VARS + from ray.tune.constants import TUNE_ENV_VARS + from ray.train.constants import TRAIN_ENV_VARS + + all_ray_air_env_vars = sorted( + set().union(AIR_ENV_VARS, TUNE_ENV_VARS, TRAIN_ENV_VARS) + ) + + user_supplied_env_vars = [] + + for env_var in all_ray_air_env_vars: + if env_var in os.environ: + user_supplied_env_vars.append(env_var) + + if user_supplied_env_vars: + env_vars_str = json.dumps(user_supplied_env_vars) + record_extra_usage_tag(TagKey.AIR_ENV_VARS, env_vars_str) + return True + + return False diff --git a/python/ray/air/_internal/util.py b/python/ray/air/_internal/util.py index c1c93d8a1c23..a4eba452e6de 100644 --- a/python/ray/air/_internal/util.py +++ b/python/ray/air/_internal/util.py @@ -3,10 +3,12 @@ from contextlib import closing import logging import queue +import shutil import threading from typing import Optional import numpy as np +from pathlib import Path import ray from ray.air.constants import _ERROR_REPORT_TIMEOUT @@ -119,3 +121,26 @@ def join(self, timeout=None): def _estimate_avail_object_store_memory() -> int: """Estimates total object store memory available in the cluster.""" return ray.available_resources()["object_store_memory"] + + +def _copy_dir_ignore_conflicts(src_dir: Path, dst_dir: Path): + """This is a workaround for python < 3.8 where shutil.copytree does not + support dirs_exist_ok=True. + + We will go through the content of the folder and manually copy ites, + while ignoring files that conflict. + + TODO(jungong): remove this workaround when we drop support for python < 3.8. + """ + for inner in src_dir.iterdir(): + dest = dst_dir / inner.name + if inner.is_dir(): + if not dest.exists(): + dest.mkdir(parents=True) + _copy_dir_ignore_conflicts(inner, dest) + else: + if not dest.exists(): + shutil.copy2(str(inner.absolute()), str(dest.absolute())) + else: + # Ignore and don't overwrite the existing file. 
+ pass diff --git a/python/ray/air/checkpoint.py b/python/ray/air/checkpoint.py index 60fefca9505e..e5f0905e424d 100644 --- a/python/ray/air/checkpoint.py +++ b/python/ray/air/checkpoint.py @@ -26,6 +26,7 @@ read_file_from_uri, upload_to_uri, ) +from ray.air._internal.util import _copy_dir_ignore_conflicts from ray.air.constants import PREPROCESSOR_KEY, CHECKPOINT_ID_ATTR from ray.util.annotations import DeveloperAPI, PublicAPI @@ -41,6 +42,8 @@ _BYTES_DATA_KEY = "bytes_data" _METADATA_KEY = "_metadata" _CHECKPOINT_DIR_PREFIX = "checkpoint_tmp_" +# The namespace is a constant UUID to prevent conflicts, as defined in RFC-4122 +_CHECKPOINT_UUID_URI_NAMESPACE = uuid.UUID("627fe696-f135-436f-bc4b-bda0306e0181") logger = logging.getLogger(__name__) @@ -213,7 +216,17 @@ def __init__( self._override_preprocessor: Optional["Preprocessor"] = None self._override_preprocessor_set = False - self._uuid = uuid.uuid4() + # When using a cloud URI, we make sure that the uuid is constant. + # This ensures we do not download the data multiple times on one node. + # Note that this is not a caching mechanism - instead, this + # only ensures that if there are several processes downloading + # from the same URI, only one process does the actual work + # while the rest waits (FileLock). This also means data will not be duplicated. 
+ self._uuid = ( + uuid.uuid4() + if not self._uri + else uuid.uuid5(_CHECKPOINT_UUID_URI_NAMESPACE, self._uri) + ) def __repr__(self): parameter, argument = self.get_internal_representation() @@ -547,21 +560,22 @@ def _to_directory(self, path: str, move_instead_of_copy: bool = False) -> None: if local_path: local_path_pathlib = Path(local_path).resolve() if local_path_pathlib != path_pathlib: - if path_pathlib.exists(): - shutil.rmtree(str(path_pathlib.absolute())) # If this exists on the local path, just copy over if move_instead_of_copy: os.makedirs(str(path_pathlib.absolute()), exist_ok=True) self._local_path = str(path_pathlib.absolute()) for inner in local_path_pathlib.iterdir(): + dest = path_pathlib / inner.name + if dest.exists(): + # Ignore files that already exist. + # For example, checkpoints from every rank may all have + # a same .is_checkpoint file. + continue shutil.move( str(inner.absolute()), str(path_pathlib.absolute()) ) else: - shutil.copytree( - str(local_path_pathlib.absolute()), - str(path_pathlib.absolute()), - ) + _copy_dir_ignore_conflicts(local_path_pathlib, path_pathlib) elif external_path: # If this exists on external storage (e.g. 
cloud), download download_from_uri(uri=external_path, local_path=path, filelock=False) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index bc77dce1e90d..c1fa930ea67d 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -18,6 +18,7 @@ ) from ray._private.storage import _get_storage_uri +from ray._private.thirdparty.tabulate.tabulate import tabulate from ray.air.constants import WILDCARD_KEY from ray.util.annotations import PublicAPI from ray.widgets import Template, make_table_html_repr @@ -30,6 +31,7 @@ from ray.tune.search.sample import Domain from ray.tune.stopper import Stopper from ray.tune.syncer import SyncConfig + from ray.tune.experimental.output import AirVerbosity from ray.tune.utils.log import Verbosity from ray.tune.execution.placement_groups import PlacementGroupFactory @@ -550,14 +552,6 @@ def __repr__(self): return _repr_dataclass(self) def _repr_html_(self): - try: - from tabulate import tabulate - except ImportError: - return ( - "Tabulate isn't installed. Run " - "`pip install tabulate` for rich notebook output." - ) - return Template("scrollableTable.html.j2").render( table=tabulate( { @@ -608,7 +602,15 @@ class CheckpointConfig: This attribute is only supported by trainers that don't take in custom training loops. Defaults to True for trainers that support it and False for generic function trainables. - + _checkpoint_keep_all_ranks: If True, will save checkpoints from all ranked + training workers. If False, only checkpoint from rank 0 worker is kept. + NOTE: This API is experimental and subject to change between minor + releases. + _checkpoint_upload_from_workers: If True, distributed workers + will upload their checkpoints to cloud directly. This is to avoid the + need for transferring large checkpoint files to the training worker + group coordinator for persistence. NOTE: This API is experimental and + subject to change between minor releases. 
""" num_to_keep: Optional[int] = None @@ -616,6 +618,8 @@ class CheckpointConfig: checkpoint_score_order: str = MAX checkpoint_frequency: int = 0 checkpoint_at_end: Optional[bool] = None + _checkpoint_keep_all_ranks: bool = False + _checkpoint_upload_from_workers: bool = False def __post_init__(self): if self.num_to_keep is not None and self.num_to_keep <= 0: @@ -638,14 +642,6 @@ def __repr__(self): return _repr_dataclass(self) def _repr_html_(self) -> str: - try: - from tabulate import tabulate - except ImportError: - return ( - "Tabulate isn't installed. Run " - "`pip install tabulate` for rich notebook output." - ) - if self.num_to_keep is None: num_to_keep_repr = "All" else: @@ -733,7 +729,10 @@ class RunConfig: a Jupyter notebook. verbose: 0, 1, 2, or 3. Verbosity mode. 0 = silent, 1 = only status updates, 2 = status and brief - results, 3 = status and detailed results. Defaults to 2. + results, 3 = status and detailed results. Defaults to 3. + If the ``RAY_AIR_NEW_OUTPUT=1`` environment variable is set, + uses the new context-aware verbosity settings: + 0 = silent, 1 = default, 2 = verbose. log_to_file: Log stdout and stderr to files in trial directories. If this is `False` (default), no files are written. 
If `true`, outputs are written to `trialdir/stdout` @@ -753,7 +752,7 @@ class RunConfig: sync_config: Optional["SyncConfig"] = None checkpoint_config: Optional[CheckpointConfig] = None progress_reporter: Optional["ProgressReporter"] = None - verbose: Union[int, "Verbosity"] = 3 + verbose: Optional[Union[int, "AirVerbosity", "Verbosity"]] = None log_to_file: Union[bool, str, Tuple[str, str]] = False # Deprecated @@ -762,6 +761,7 @@ class RunConfig: def __post_init__(self): from ray.tune.syncer import SyncConfig, Syncer from ray.tune.utils.util import _resolve_storage_path + from ray.tune.experimental.output import AirVerbosity, get_air_verbosity if not self.failure_config: self.failure_config = FailureConfig() @@ -827,6 +827,13 @@ def __post_init__(self): "Must specify a remote `storage_path` to use a custom `syncer`." ) + if self.verbose is None: + # Default `verbose` value. For new output engine, + # this is AirVerbosity.DEFAULT. + # For old output engine, this is Verbosity.V3_TRIAL_DETAILS + # Todo (krfricke): Currently uses number to pass test_configs::test_repr + self.verbose = get_air_verbosity(AirVerbosity.DEFAULT) or 3 + def __repr__(self): from ray.tune.syncer import SyncConfig @@ -840,14 +847,6 @@ def __repr__(self): ) def _repr_html_(self) -> str: - try: - from tabulate import tabulate - except ImportError: - return ( - "Tabulate isn't installed. Run " - "`pip install tabulate` for rich notebook output." - ) - reprs = [] if self.failure_config is not None: reprs.append( diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index 1accc998eebd..f4a85ec59d66 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -40,6 +40,15 @@ # training with Ray Train CHECKPOINT_ID_ATTR = "_current_checkpoint_id" +# Name of the marker dropped by the Trainable. If a worker detects +# the presence of the marker in the trial dir, it will use lazy +# checkpointing. 
+LAZY_CHECKPOINT_MARKER_FILE = ".lazy_checkpoint_marker" + +# ================================================== +# Environment Variables +# ================================================== + # Integer value which if set will copy files in reported AIR directory # checkpoints instead of moving them (if worker is on the same node as Trainable) COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV = ( @@ -51,7 +60,13 @@ # as Trainable) DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" -# Name of the marker dropped by the Trainable. If a worker detects -# the presence of the marker in the trial dir, it will use lazy -# checkpointing. -LAZY_CHECKPOINT_MARKER_FILE = ".lazy_checkpoint_marker" + +# NOTE: When adding a new environment variable, please track it in this list. +# TODO(ml-team): Most env var constants should get moved here. +AIR_ENV_VARS = { + COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, + DISABLE_LAZY_CHECKPOINTING_ENV, + "RAY_AIR_FULL_TRACEBACKS", + "RAY_AIR_NEW_OUTPUT", + "RAY_AIR_RICH_LAYOUT", +} diff --git a/python/ray/air/examples/dreambooth/dataset.py b/python/ray/air/examples/dreambooth/dataset.py index 30b485b31d7c..a20a1d8698a9 100644 --- a/python/ray/air/examples/dreambooth/dataset.py +++ b/python/ray/air/examples/dreambooth/dataset.py @@ -8,8 +8,8 @@ def get_train_dataset(args, image_resolution=512): - """Build a Ray Dataset for fine-tuning DreamBooth model.""" - # Load images into Ray Dataset + """Build a Dataset for fine-tuning DreamBooth model.""" + # Load images into Dataset instance_dataset = read_images(args.instance_images_dir) class_dataset = read_images(args.class_images_dir) diff --git a/python/ray/air/execution/_internal/actor_manager.py b/python/ray/air/execution/_internal/actor_manager.py index e28c6dd72677..c2af238f0229 100644 --- a/python/ray/air/execution/_internal/actor_manager.py +++ b/python/ray/air/execution/_internal/actor_manager.py @@ -360,6 +360,13 @@ def _try_start_actors(self, max_actors: Optional[int] 
= None) -> int: # Start Ray actor actor = remote_actor_cls.remote(**kwargs) + # Track + self._live_actors_to_ray_actors_resources[tracked_actor] = ( + actor, + acquired_resources, + ) + self._live_resource_cache = None + # Schedule ready future future = actor.__ray_ready__.remote() @@ -392,12 +399,6 @@ def on_error(exception: Exception): on_error=on_error, ) - self._live_actors_to_ray_actors_resources[tracked_actor] = ( - actor, - acquired_resources, - ) - self._live_resource_cache = None - self._enqueue_cached_actor_tasks(tracked_actor=tracked_actor) return started_actors @@ -698,6 +699,9 @@ def schedule_actor_task( args = args or tuple() kwargs = kwargs or {} + if tracked_actor.actor_id in self._failed_actor_ids: + return + tracked_actor_task = TrackedActorTask( tracked_actor=tracked_actor, on_result=on_result, on_error=on_error ) @@ -874,6 +878,3 @@ def cleanup(self): self._resource_manager.clear() self.__init__(resource_manager=self._resource_manager) - - def __del__(self): - self.cleanup() diff --git a/python/ray/air/execution/resources/request.py b/python/ray/air/execution/resources/request.py index 87455a6b3ad8..40dbd3858d85 100644 --- a/python/ray/air/execution/resources/request.py +++ b/python/ray/air/execution/resources/request.py @@ -154,7 +154,8 @@ def to_placement_group(self): def __eq__(self, other: "ResourceRequest"): return ( - self._bound == other._bound + isinstance(other, ResourceRequest) + and self._bound == other._bound and self.head_bundle_is_empty == other.head_bundle_is_empty ) diff --git a/python/ray/air/integrations/mlflow.py b/python/ray/air/integrations/mlflow.py index fc22e28fdf4a..aa671b24786a 100644 --- a/python/ray/air/integrations/mlflow.py +++ b/python/ray/air/integrations/mlflow.py @@ -1,12 +1,12 @@ import logging -import warnings from types import ModuleType from typing import Dict, Optional, Union +import warnings import ray from ray.air import session - from ray.air._internal.mlflow import _MLflowLoggerUtil +from 
ray.air._internal import usage as air_usage from ray.tune.logger import LoggerCallback from ray.tune.result import TIMESTEPS_TOTAL, TRAINING_ITERATION from ray.tune.experiment import Trial @@ -194,6 +194,10 @@ def train_fn(config): set_active=True, ) mlflow_util.log_params(_config) + + # Record `setup_mlflow` usage when everything has setup successfully. + air_usage.tag_setup_mlflow() + return mlflow_util._mlflow diff --git a/python/ray/air/integrations/wandb.py b/python/ray/air/integrations/wandb.py index e998a264785a..d0c40ee1d47f 100644 --- a/python/ray/air/integrations/wandb.py +++ b/python/ray/air/integrations/wandb.py @@ -13,6 +13,7 @@ import ray from ray import logger from ray.air import session +from ray.air._internal import usage as air_usage from ray.air.util.node import _force_on_current_node from ray.tune.logger import LoggerCallback @@ -207,6 +208,10 @@ def _setup_wandb( run = _wandb.init(**wandb_init_kwargs) _run_wandb_process_run_info_hook(run) + + # Record `setup_wandb` usage when everything has setup successfully. 
+ air_usage.tag_setup_wandb() + return run diff --git a/python/ray/air/tests/test_air_usage.py b/python/ray/air/tests/test_air_usage.py new file mode 100644 index 000000000000..040b1a615d4c --- /dev/null +++ b/python/ray/air/tests/test_air_usage.py @@ -0,0 +1,207 @@ +"""Unit tests for AIR telemetry.""" + +from collections import namedtuple +import json +import os + +import pytest +from unittest.mock import MagicMock, patch + +import ray +from ray import air, tune +from ray.air import session +from ray.air._internal import usage as air_usage +from ray.air.integrations import wandb, mlflow, comet +from ray.tune.callback import Callback +from ray.tune.logger import LoggerCallback +from ray.tune.logger.aim import AimLoggerCallback +from ray.tune.utils.callback import DEFAULT_CALLBACK_CLASSES +from ray._private.usage.usage_lib import TagKey + + +def _mock_record_from_module(module, monkeypatch): + recorded = {} + + def mock_record_extra_usage_tag(key: TagKey, value: str): + recorded[key] = value + + monkeypatch.setattr( + module, + "record_extra_usage_tag", + mock_record_extra_usage_tag, + ) + return recorded + + +@pytest.fixture +def mock_record(monkeypatch): + import ray.air._internal.usage + + yield _mock_record_from_module(ray.air._internal.usage, monkeypatch=monkeypatch) + + +def train_fn(config): + session.report({"score": 1}) + + +@pytest.fixture +def tuner(tmp_path): + yield tune.Tuner(train_fn, run_config=air.RunConfig(storage_path=str(tmp_path))) + + +@pytest.fixture(scope="module") +def ray_start_2_cpus(): + address_info = ray.init(num_cpus=2) + yield address_info + ray.shutdown() + + +# (nfs: bool, remote_path: str | None, syncing_disabled: bool, expected: str) +_StorageTestConfig = namedtuple( + "StorageTestConfig", ["nfs", "remote_path", "syncing_disabled", "expected"] +) + +_storage_test_configs = [ + # Local + _StorageTestConfig(False, None, False, "driver"), + _StorageTestConfig(False, None, True, "local"), + # Remote + _StorageTestConfig(False, 
"s3://mock/bucket?param=1", False, "s3"), + _StorageTestConfig(False, "gs://mock/bucket?param=1", False, "gs"), + _StorageTestConfig(False, "hdfs://mock/bucket?param=1", False, "hdfs"), + _StorageTestConfig(False, "file://mock/bucket?param=1", False, "local_uri"), + _StorageTestConfig(False, "memory://mock/bucket?param=1", False, "memory"), + _StorageTestConfig( + False, "custom://mock/bucket?param=1", False, "custom_remote_storage" + ), + # NFS + _StorageTestConfig(True, None, True, "nfs"), +] + + +@pytest.mark.parametrize( + "storage_test_config", + _storage_test_configs, + ids=[str(config) for config in _storage_test_configs], +) +def test_tag_ray_air_storage_config( + tmp_path, storage_test_config, mock_record, monkeypatch +): + if storage_test_config.nfs: + import ray.air._internal.remote_storage + + monkeypatch.setattr( + ray.air._internal.remote_storage, + "_get_network_mounts", + lambda: [str(tmp_path)], + ) + + local_path = str(tmp_path / "local_path") + sync_config = ( + tune.SyncConfig(syncer=None) + if storage_test_config.syncing_disabled + else tune.SyncConfig() + ) + + air_usage.tag_ray_air_storage_config( + local_path=local_path, + remote_path=storage_test_config.remote_path, + sync_config=sync_config, + ) + assert storage_test_config.expected == mock_record[TagKey.AIR_STORAGE_CONFIGURATION] + + +class _CustomLoggerCallback(LoggerCallback): + pass + + +class _CustomCallback(Callback): + pass + + +_TEST_CALLBACKS = [ + wandb.WandbLoggerCallback, + mlflow.MLflowLoggerCallback, + comet.CometLoggerCallback, + AimLoggerCallback, + _CustomLoggerCallback, + _CustomLoggerCallback, + _CustomCallback, +] + + +def test_tag_setup_wandb(mock_record): + from ray.air.integrations.wandb import _setup_wandb + + with patch.dict(os.environ, {wandb.WANDB_MODE_ENV_VAR: "disabled"}): + _setup_wandb(trial_id="a", trial_name="b", config={}, _wandb=MagicMock()) + assert mock_record[TagKey.AIR_SETUP_WANDB_INTEGRATION_USED] == "1" + + +def test_tag_setup_mlflow(mock_record, 
monkeypatch): + from ray.air.integrations.mlflow import setup_mlflow + + monkeypatch.setattr(ray.air.integrations.mlflow, "_MLflowLoggerUtil", MagicMock()) + setup_mlflow() + assert mock_record[TagKey.AIR_SETUP_MLFLOW_INTEGRATION_USED] == "1" + + +@pytest.mark.parametrize( + "callback_classes_expected", + [ + (None, None), + ([], None), + ([lambda: None], None), + ( + DEFAULT_CALLBACK_CLASSES, + {cls.__name__: 1 for cls in DEFAULT_CALLBACK_CLASSES}, + ), + ( + _TEST_CALLBACKS, + { + "WandbLoggerCallback": 1, + "MLflowLoggerCallback": 1, + "CometLoggerCallback": 1, + "AimLoggerCallback": 1, + "CustomLoggerCallback": 2, + "CustomCallback": 1, + }, + ), + ], +) +def test_tag_callbacks(mock_record, callback_classes_expected): + callback_classes, expected = callback_classes_expected + + callbacks = ( + [callback_cls() for callback_cls in callback_classes] + if callback_classes + else None + ) + + air_usage.tag_callbacks(callbacks) + + callback_usage_str = mock_record.pop(TagKey.AIR_CALLBACKS, None) + callback_counts = json.loads(callback_usage_str) if callback_usage_str else None + assert callback_counts == expected + + +def test_tag_env_vars(ray_start_2_cpus, mock_record, tuner): + """Test that env vars are recorded properly, and arbitrary user environment + variables are ignored.""" + env_vars_to_record = { + "RAY_AIR_LOCAL_CACHE_DIR": "~/ray_results", + "TUNE_DISABLE_AUTO_CALLBACK_SYNCER": "1", + } + untracked_env_vars = {"RANDOM_USER_ENV_VAR": "asdf"} + + with patch.dict(os.environ, {**env_vars_to_record, **untracked_env_vars}): + tuner.fit() + + recorded_env_vars = json.loads(mock_record[TagKey.AIR_ENV_VARS]) + assert sorted(env_vars_to_record) == sorted(recorded_env_vars) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/air/tests/test_checkpoint_manager.py b/python/ray/air/tests/test_checkpoint_manager.py index fa0e20b89473..eb50aa04fb74 100644 --- 
a/python/ray/air/tests/test_checkpoint_manager.py +++ b/python/ray/air/tests/test_checkpoint_manager.py @@ -11,7 +11,7 @@ def test_unlimited_persistent_checkpoints(): cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=None)) for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint({"data": i}, storage_mode=CheckpointStorage.PERSISTENT) ) @@ -22,7 +22,7 @@ def test_limited_persistent_checkpoints(): cpm = _CheckpointManager(checkpoint_strategy=CheckpointConfig(num_to_keep=2)) for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint({"data": i}, storage_mode=CheckpointStorage.PERSISTENT) ) @@ -41,7 +41,7 @@ def __post_init__(self): cpm = _CheckpointManager(checkpoint_strategy=_CheckpointConfig(num_to_keep=0)) for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint({"data": i}, storage_mode=CheckpointStorage.PERSISTENT) ) @@ -53,7 +53,7 @@ def test_dont_persist_memory_checkpoints(): cpm._persist_memory_checkpoints = False for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint({"data": i}, storage_mode=CheckpointStorage.MEMORY) ) @@ -65,7 +65,7 @@ def test_persist_memory_checkpoints(): cpm._persist_memory_checkpoints = True for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint({"data": i}, storage_mode=CheckpointStorage.MEMORY) ) @@ -83,7 +83,7 @@ def test_keep_best_checkpoints(): cpm._persist_memory_checkpoints = True for i in range(10): - cpm.register_checkpoint( + cpm.register_checkpoints( _TrackedCheckpoint( {"data": i}, storage_mode=CheckpointStorage.MEMORY, diff --git a/python/ray/air/tests/test_checkpoints.py b/python/ray/air/tests/test_checkpoints.py index 0b6aced414b6..6ae062e7b814 100644 --- a/python/ray/air/tests/test_checkpoints.py +++ b/python/ray/air/tests/test_checkpoints.py @@ -1,19 +1,25 @@ +import logging import os import pickle import re 
import shutil import tempfile import unittest +from contextlib import contextmanager from pathlib import Path from typing import Any import pytest +import boto3 import ray from ray.air._internal.remote_storage import _ensure_directory, delete_at_uri +from ray.air._internal.uri_utils import URI +from ray.air._internal.util import _copy_dir_ignore_conflicts from ray.air.checkpoint import _DICT_CHECKPOINT_ADDITIONAL_FILE_KEY, Checkpoint from ray.air.constants import MAX_REPR_LENGTH, PREPROCESSOR_KEY from ray.data import Preprocessor +from ray._private.test_utils import simulate_storage class DummyPreprocessor(Preprocessor): @@ -154,6 +160,34 @@ def test_directory_move_instead_of_copy(self): assert new_recovered_checkpoint.foo == "bar" assert not list(Path(path).glob("*")) + def test_copy_dir_ignore_conflicts(self): + tmpdir = Path(tempfile.mkdtemp()) + + src_dir = tmpdir / "src" + dst_dir = tmpdir / "dst" + + src_dir.mkdir() + dst_dir.mkdir() + + (src_dir / "foo.txt").touch() + (src_dir / "bar.txt").touch() + (src_dir / "a").mkdir() + (src_dir / "a" / "a.txt").touch() + (src_dir / "b").mkdir() + (src_dir / "b" / "b.txt").touch() + + # Has a file conflict. + (dst_dir / "foo.txt").touch() + # Has a directory conflict. + (dst_dir / "a").mkdir() + + _copy_dir_ignore_conflicts(src_dir, dst_dir) + + assert (dst_dir / "foo.txt").exists() + assert (dst_dir / "bar.txt").exists() + assert (dst_dir / "a" / "a.txt").exists() + assert (dst_dir / "b" / "b.txt").exists() + def test_uri(self): checkpoint = StubCheckpoint.from_dict({"spam": "ham"}) assert "foo" in checkpoint._SERIALIZED_ATTRS @@ -817,6 +851,80 @@ def testCheckpointUri(self): self.assertEqual(checkpoint.uri, "memory://some/location") +class URITestCheckpoint(Checkpoint): + def _to_directory(self, path: str, move_instead_of_copy: bool = False) -> None: + super()._to_directory(path, move_instead_of_copy) + # Drop a marker file with the current pid. 
+ # Only one file should be created, as only one task should + # download the data, with the rest waiting. + with open(Path(path, f"_pid_marker_{os.getpid()}"), "w"): + pass + + +@contextmanager +def mock_s3_bucket_uri(): + port = 5002 + region = "us-west-2" + with simulate_storage("s3", port=port, region=region) as s3_uri: + s3 = boto3.client( + "s3", region_name=region, endpoint_url=f"http://localhost:{port}" + ) + # Bucket name will be autogenerated/unique per test + bucket_name = URI(s3_uri).name + s3.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={"LocationConstraint": region}, + ) + # Disable server HTTP request logging + logging.getLogger("werkzeug").setLevel(logging.WARNING) + yield URI(s3_uri) + logging.getLogger("werkzeug").setLevel(logging.INFO) + + +@ray.remote +def download_uri_checkpoint(checkpoint: URITestCheckpoint): + with checkpoint.as_directory() as dir: + dir = Path(dir) + all_pid_marker_files = list(dir.glob("_pid_marker_*")) + # There should be only one file, as only one task should + # download. 
+ assert len(all_pid_marker_files) == 1 + assert (dir / "mock.file").exists() + + +class TestCheckpointURIConstantUUID(unittest.TestCase): + def setUp(self) -> None: + ray.shutdown() + ray.init(num_cpus=4) + + def tearDown(self) -> None: + ray.shutdown() + + def testCheckpointURIConstantUUID(self): + """Test that multiple workers using the same URI checkpoint + share the local directory, and that only one worker downloads + the data.""" + with mock_s3_bucket_uri() as base_uri, tempfile.TemporaryDirectory() as tmpdir: + checkpoint_dir = Path(tmpdir, "checkpoint") + os.makedirs(checkpoint_dir) + with open(checkpoint_dir / "mock.file", "w"): + pass + checkpoint_uri = str(base_uri / "model") + uri = Checkpoint.from_directory(checkpoint_dir).to_uri(checkpoint_uri) + + # Check that two separate checkpoints have the same uuid + checkpoint = URITestCheckpoint.from_uri(uri) + checkpoint2 = URITestCheckpoint.from_uri(uri) + assert checkpoint._uuid == checkpoint2._uuid + + # Create a separate checkpoint for each task + tasks = [ + download_uri_checkpoint.remote(URITestCheckpoint.from_uri(uri)) + for _ in range(4) + ] + ray.get(tasks) + + if __name__ == "__main__": import sys diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 0f1a919337d4..0723094a4f5b 100644 --- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -60,7 +60,7 @@ class TestWildcard(TestBasic): def test_basic(ray_start_4_cpus): - ds = ray.data.range_table(10) + ds = ray.data.range(10) # Single worker basic case. test = TestBasic( @@ -105,7 +105,7 @@ def test_basic(ray_start_4_cpus): def test_error(ray_start_4_cpus): - ds = ray.data.range_table(10) + ds = ray.data.range(10) # Missing required dataset. 
with pytest.raises(ValueError): @@ -136,7 +136,7 @@ def test_error(ray_start_4_cpus): def test_use_stream_api_config(ray_start_4_cpus): - ds = ray.data.range_table(10) + ds = ray.data.range(10) # Single worker basic case. test = TestBasic( @@ -160,14 +160,14 @@ def test_use_stream_api_config(ray_start_4_cpus): def test_fit_transform_config(ray_start_4_cpus): - ds = ray.data.range_table(10) + ds = ray.data.range(10) def drop_odd_pandas(batch): - return batch[batch["value"] % 2 == 0] + return batch[batch["id"] % 2 == 0] def drop_odd_numpy(batch): - arr = batch["value"] - return arr[arr % 2 == 0] + arr = batch["id"] + return {"id": arr[arr % 2 == 0]} prep_pandas = BatchMapper(drop_odd_pandas, batch_format="pandas") prep_numpy = BatchMapper(drop_odd_numpy, batch_format="numpy") @@ -232,7 +232,7 @@ def train_loop_per_worker(data_shard, check_results_fn): for _ in range(2): result = [] for batch in data_shard.iter_batches(): - for row in batch["value"]: + for row in batch["id"]: result.append(row) results.append(result) check_results_fn(data_shard, results) @@ -255,11 +255,11 @@ def checker(shard, results): assert "Stage 1 ReadRange->BatchMapper: 1/1 blocks executed " in stats, stats def rand(x): - x["value"] = x["value"].multiply(x["value"]) + x["id"] = x["id"].multiply(x["id"]) return x prep = BatchMapper(rand, batch_format="pandas") - ds = ray.data.range_table(5, parallelism=1) + ds = ray.data.range(5, parallelism=1) test = TestStream( checker, preprocessor=prep, @@ -271,11 +271,11 @@ def rand(x): def test_stream_finite_window_nocache_prep(ray_start_4_cpus): def rand(x): - x["value"] = [random.random() for _ in range(len(x))] + x["id"] = [random.random() for _ in range(len(x))] return x prep = BatchMapper(rand, batch_format="pandas") - ds = ray.data.range_table(5, parallelism=1) + ds = ray.data.range(5, parallelism=1) # Test 50% object store memory.. 
def checker(shard, results): @@ -305,12 +305,12 @@ def test_stream_transform_config(ray_start_4_cpus): def check_batch(batch): assert isinstance(batch, dict) - assert isinstance(batch["value"], np.ndarray) - assert len(batch["value"]) == batch_size + assert isinstance(batch["id"], np.ndarray) + assert len(batch["id"]) == batch_size return batch prep = BatchMapper(check_batch, batch_format="numpy", batch_size=2) - ds = ray.data.range_table(6, parallelism=1) + ds = ray.data.range(6, parallelism=1) test = TestStream( lambda *args: None, @@ -327,7 +327,7 @@ def checker(shard, results): stats = shard.stats() assert "RandomizeBlockOrder->RandomShuffle" in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestStream( checker, datasets={"train": ds}, @@ -341,7 +341,7 @@ def checker(shard, results): stats = shard.stats() assert "Stage 1 ReadRange->RandomShuffle" in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestBatch( checker, datasets={"train": ds}, @@ -357,7 +357,7 @@ def checker(shard, results): stats = shard.stats() assert "RandomizeBlockOrder: 5/5 blocks executed in" in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestStream( checker, datasets={"train": ds}, @@ -368,7 +368,7 @@ def checker(shard, results): stats = shard.stats() assert "RandomizeBlockOrder" not in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestStream( checker, datasets={"train": ds}, @@ -384,7 +384,7 @@ def checker(shard, results): stats = shard.stats() assert "RandomizeBlockOrder: 5/5 blocks executed" in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestBatch( checker, datasets={"train": ds}, @@ -399,7 +399,7 @@ def checker(shard, results): stats = shard.stats() assert "RandomizeBlockOrder: 5/5 blocks executed in" in stats, stats - ds = ray.data.range_table(5) + ds = ray.data.range(5) test = TestStream( checker, datasets={"train": ds}, @@ -426,7 +426,7 
@@ def check_error(shard, results): def test_deterministic_per_epoch_preprocessor( ray_start_4_cpus, max_object_store_memory_fraction ): - ds = ray.data.range_table(5) + ds = ray.data.range(5) def multiply(x): return x * 2 @@ -477,7 +477,7 @@ def checker(shard, results): def test_nondeterministic_per_epoch_preprocessor( ray_start_4_cpus, max_object_store_memory_fraction ): - ds = ray.data.range_table(5) + ds = ray.data.range(5) # Use randomized per-epoch preprocessor to check that it gets applied once # per epoch. @@ -504,7 +504,7 @@ def checker(shard, results): def test_validate_per_epoch_preprocessor(ray_start_4_cpus): - ds = ray.data.range_table(5) + ds = ray.data.range(5) def multiply(x): return x * 2 diff --git a/python/ray/air/tests/test_experiment_restore.py b/python/ray/air/tests/test_experiment_restore.py index 54ef313c6ee2..c6b4506a6a7b 100644 --- a/python/ray/air/tests/test_experiment_restore.py +++ b/python/ray/air/tests/test_experiment_restore.py @@ -1,4 +1,6 @@ import json +import os + import numpy as np import pandas as pd from pathlib import Path @@ -58,7 +60,6 @@ def test_experiment_restore(tmp_path, runner_type): - The test will stop the script with a SIGINT at a random time between 4-8 iterations after each restore. - Requirements: - Req 1: Reasonable runtime - The experiment should finish within 2 * 16 = 32 seconds. @@ -112,11 +113,14 @@ def test_experiment_restore(tmp_path, runner_type): "NUM_TRIALS": str(num_trials), "MAX_CONCURRENT_TRIALS": str(max_concurrent), "CSV_DATA_FILE": csv_file, + "TUNE_NEW_EXECUTION": os.environ.get("TUNE_NEW_EXECUTION", "1"), } # Pass criteria no_interrupts_runtime = 16.0 - passing_factor = 2 + # Todo(krfricke): See if we can improve the actor startup/shutdown time + # to reduce the passing factor again. 
+ passing_factor = 2.5 passing_runtime = no_interrupts_runtime * passing_factor _print_message( "Experiment should finish with a total runtime of\n" @@ -197,17 +201,19 @@ def test_experiment_restore(tmp_path, runner_type): ) test_end_time = time.monotonic() + # Req 1: runtime + assert total_runtime <= passing_runtime, ( + f"Expected runtime to be <= {passing_runtime}, but ran for: {total_runtime}. " + f"This means the experiment did not finish (iterations still running). Are " + f"there any performance regressions or expensive failure recoveries??" + ) + # The script shouldn't have errored. (It should have finished by this point.) assert return_code == 0, ( f"The script errored with return code: {return_code}.\n" - f"Check the `{_RUN_SCRIPT_FILENAME}` script for any issues." + f"Check the `{_RUN_SCRIPT_FILENAME}` script for any issues. " ) - # Req 1: runtime - assert ( - total_runtime <= passing_runtime - ), f"Expected runtime to be <= {passing_runtime}, but ran for: {total_runtime}" - # Req 2: training progress persisted # Check that progress increases monotonically (we never go backwards/start from 0) assert np.all(np.diff(progress_history) >= 0), ( diff --git a/python/ray/air/tests/test_integration_mlflow.py b/python/ray/air/tests/test_integration_mlflow.py index 7b6ea45c0642..85cab89080e1 100644 --- a/python/ray/air/tests/test_integration_mlflow.py +++ b/python/ray/air/tests/test_integration_mlflow.py @@ -7,6 +7,7 @@ from mlflow.tracking import MlflowClient +from ray._private.dict import flatten_dict from ray.train._internal.session import init_session from ray.tune.trainable import wrap_function from ray.tune.trainable.session import _shutdown as tune_session_shutdown @@ -367,7 +368,7 @@ def test_setup_fail(self): ) def test_log_params(self): - params = {"a": "a"} + params = {"a": "a", "x": {"y": "z"}} self.mlflow_util.setup_mlflow( tracking_uri=self.tracking_uri, experiment_name="new_experiment" ) @@ -376,21 +377,23 @@ def test_log_params(self): 
self.mlflow_util.log_params(params_to_log=params, run_id=run_id) run = self.mlflow_util._mlflow.get_run(run_id=run_id) - assert run.data.params == params + assert run.data.params == flatten_dict(params) params2 = {"b": "b"} self.mlflow_util.start_run(set_active=True) self.mlflow_util.log_params(params_to_log=params2, run_id=run_id) run = self.mlflow_util._mlflow.get_run(run_id=run_id) - assert run.data.params == { - **params, - **params2, - } + assert run.data.params == flatten_dict( + { + **params, + **params2, + } + ) self.mlflow_util.end_run() def test_log_metrics(self): - metrics = {"a": 1.0} + metrics = {"a": 1.0, "x": {"y": 2.0}} self.mlflow_util.setup_mlflow( tracking_uri=self.tracking_uri, experiment_name="new_experiment" ) @@ -399,15 +402,19 @@ def test_log_metrics(self): self.mlflow_util.log_metrics(metrics_to_log=metrics, run_id=run_id, step=0) run = self.mlflow_util._mlflow.get_run(run_id=run_id) - assert run.data.metrics == metrics + assert run.data.metrics == flatten_dict(metrics) metrics2 = {"b": 1.0} self.mlflow_util.start_run(set_active=True) self.mlflow_util.log_metrics(metrics_to_log=metrics2, run_id=run_id, step=0) - assert self.mlflow_util._mlflow.get_run(run_id=run_id).data.metrics == { - **metrics, - **metrics2, - } + assert self.mlflow_util._mlflow.get_run( + run_id=run_id + ).data.metrics == flatten_dict( + { + **metrics, + **metrics2, + } + ) self.mlflow_util.end_run() diff --git a/python/ray/air/tests/test_remote_storage.py b/python/ray/air/tests/test_remote_storage.py index cc3c98209a6f..99adcd8939f8 100644 --- a/python/ray/air/tests/test_remote_storage.py +++ b/python/ray/air/tests/test_remote_storage.py @@ -10,6 +10,7 @@ upload_to_uri, download_from_uri, get_fs_and_path, + _is_network_mount, ) from ray.tune.utils.file_transfer import _get_recursive_files_and_stats @@ -131,6 +132,46 @@ def test_upload_exclude_multimatch(temp_data_dirs): assert_file(False, tmp_target, "subdir_exclude/something/somewhere.txt") 
+@pytest.mark.parametrize("no_fsspec", [False, True]) +def test_upload_local_exclude_multi(temp_data_dirs, no_fsspec): + if no_fsspec: + with patch("ray.air._internal.remote_storage.fsspec", None): + return test_upload_local_exclude_multi(temp_data_dirs, no_fsspec=False) + + tmp_source, tmp_target = temp_data_dirs + + upload_to_uri(tmp_source, tmp_target, exclude=["*_exclude.txt", "*_exclude/*"]) + + assert_file(True, tmp_target, "level0.txt") + assert_file(False, tmp_target, "level0_exclude.txt") + assert_file(True, tmp_target, "subdir/level1.txt") + assert_file(False, tmp_target, "subdir/level1_exclude.txt") + assert_file(True, tmp_target, "subdir/nested/level2.txt") + assert_file(False, tmp_target, "subdir_nested_level2_exclude.txt") + assert_file(False, tmp_target, "subdir_exclude") + assert_file(False, tmp_target, "subdir_exclude/something/somewhere.txt") + + +@pytest.mark.parametrize("no_fsspec", [False, True]) +def test_upload_local_exclude_multimatch(temp_data_dirs, no_fsspec): + if no_fsspec: + with patch("ray.air._internal.remote_storage.fsspec", None): + return test_upload_local_exclude_multimatch(temp_data_dirs, no_fsspec=False) + + tmp_source, tmp_target = temp_data_dirs + + upload_to_uri(tmp_source, tmp_target, exclude=["*_exclude*"]) + + assert_file(True, tmp_target, "level0.txt") + assert_file(False, tmp_target, "level0_exclude.txt") + assert_file(True, tmp_target, "subdir/level1.txt") + assert_file(False, tmp_target, "subdir/level1_exclude.txt") + assert_file(True, tmp_target, "subdir/nested/level2.txt") + assert_file(False, tmp_target, "subdir_nested_level2_exclude.txt") + assert_file(False, tmp_target, "subdir_exclude") + assert_file(False, tmp_target, "subdir_exclude/something/somewhere.txt") + + def test_get_recursive_files_race_con(temp_data_dirs): tmp_source, _ = temp_data_dirs @@ -175,6 +216,25 @@ def test_get_fs_and_path(): assert find_error +def test_is_network_mount(tmp_path, monkeypatch): + """Test `_is_network_mount` storage utility.""" 
+ + with monkeypatch.context() as m: + import ray.air._internal.remote_storage + + m.setattr( + ray.air._internal.remote_storage, + "_get_network_mounts", + lambda: [str(tmp_path)], + ) + assert _is_network_mount(str(tmp_path / "a/b/c")) + + # Local paths should return False + assert not _is_network_mount(str(tmp_path / "ray_results")) + assert not _is_network_mount("~/ray_results") + assert not _is_network_mount("") # cwd + + if __name__ == "__main__": import sys diff --git a/python/ray/air/tests/test_util_torch_dist.py b/python/ray/air/tests/test_util_torch_dist.py new file mode 100644 index 000000000000..fea665fa6594 --- /dev/null +++ b/python/ray/air/tests/test_util_torch_dist.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest +import torch +import torch.distributed as dist + +import ray +from ray.air.util.torch_dist import ( + init_torch_dist_process_group, + shutdown_torch_dist_process_group, + TorchDistributedWorker, +) + + +def test_torch_process_group_gloo(): + @ray.remote + class TestWorker(TorchDistributedWorker): + def run(self): + tensor = torch.tensor([1.0]) + dist.all_reduce(tensor) + return tensor.numpy() + + workers = [TestWorker.remote() for _ in range(5)] + + init_torch_dist_process_group(workers, backend="gloo", init_method="env") + + reduced = ray.get([w.run.remote() for w in workers]) + + # One tensor from each worker. + assert len(reduced) == 5 + for r in reduced: + assert len(r) == 1 + assert r.dtype == np.float32 + # All-reduce. Each tensor contributed 1.0. 5 tensors in total. 
+ assert r[0] == 5.0 + + shutdown_torch_dist_process_group(workers) + + +def test_torch_process_group_nccl(): + @ray.remote(num_gpus=2) + class TestWorker(TorchDistributedWorker): + def __init__(self): + super().__init__() + self.dev = f"cuda:{ray.get_gpu_ids()[0]}" + + def run(self): + tensor = torch.tensor([1.0]).to(self.dev) + dist.all_reduce(tensor) + return tensor.cpu().numpy() + + workers = [TestWorker.remote() for _ in range(2)] + + init_torch_dist_process_group(workers, backend="nccl", init_method="env") + + reduced = ray.get([w.run.remote() for w in workers]) + + # One tensor from each worker (2 workers total). + assert len(reduced) == 2 + for r in reduced: + assert len(r) == 1 + assert r.dtype == np.float32 + # All-reduce. Each tensor contributed 1.0. 2 tensors in total. + assert r[0] == 2.0 + + shutdown_torch_dist_process_group(workers) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/air/util/check_ingest.py b/python/ray/air/util/check_ingest.py index 58e46559502b..219f5ffa22bf 100755 --- a/python/ray/air/util/check_ingest.py +++ b/python/ray/air/util/check_ingest.py @@ -31,7 +31,7 @@ class DummyTrainer(DataParallelTrainer): num_epochs: How many many times to iterate through the datasets for. prefetch_batches: The number of batches to prefetch ahead of the current block during the scan. This is the same as - :meth:`~ray.data.Datastream.iter_batches` + :meth:`~ray.data.Dataset.iter_batches` time_preprocessing_separately: Whether to time the preprocessing separately from actual iteration during training.
If set to True, preprocessing execution is fully executed before training begins and the preprocessing diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index cb34a85fb8ac..41bff8c40ffb 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -673,9 +673,7 @@ def from_numpy( # underlying scalar data type. # - shape: a variable-sized list array containing the shapes of each tensor # element. - if isinstance(arr, Iterable): - arr = list(arr) - elif not isinstance(arr, (list, tuple)): + if not isinstance(arr, (list, tuple, np.ndarray)): raise ValueError( "ArrowVariableShapedTensorArray can only be constructed from an " f"ndarray or a list/tuple of ndarrays, but got: {type(arr)}" @@ -716,13 +714,6 @@ def from_numpy( else: np_data_buffer = np.concatenate(raveled) dtype = np_data_buffer.dtype - if dtype.type is np.object_: - types_and_shapes = [(f"dtype={a.dtype}", f"shape={a.shape}") for a in arr] - raise ValueError( - "ArrowVariableShapedTensorArray only supports heterogeneous-shaped " - "tensor collections, not arbitrarily nested ragged tensors. Got " - f"arrays: {types_and_shapes}" - ) pa_dtype = pa.from_numpy_dtype(dtype) if pa.types.is_string(pa_dtype): if dtype.byteorder == ">" or ( diff --git a/python/ray/air/util/torch_dist.py b/python/ray/air/util/torch_dist.py new file mode 100644 index 000000000000..83bb48e66142 --- /dev/null +++ b/python/ray/air/util/torch_dist.py @@ -0,0 +1,186 @@ +"""This file is modeled after ray/python/ray/train/torch/config.py + +The logics are duplicated right now to allow maximum flexibility for +setting up PyTorch DDP process groups outside the context of Ray Train. +Eventually, these use cases should be consolidated. 
+""" + +from abc import ABC +from collections import defaultdict +from datetime import timedelta +import os +import torch +import torch.distributed as dist +from typing import Callable, List, T + +import ray +from ray.actor import ActorHandle +from ray.train._internal.utils import get_address_and_port +from ray.train.constants import DEFAULT_NCCL_SOCKET_IFNAME +from ray.train.torch.train_loop_utils import get_device + + +class TorchDistributedWorker(ABC): + """Defines the interfaces required by the init_torch_dist_process_group(). + + This is modeled after RayTrainerWorker, which allows arbitrary functions + to be executed on a remote DDP worker. + """ + + def execute(self, func: Callable[..., T], *args, **kwargs) -> T: + """Executes the input function and returns the output. + + Args: + func: The function to execute. + args, kwargs: The arguments to pass into func. + """ + return func(*args, **kwargs) + + +def _init_torch_distributed( + init_method: str, + backend: str, + rank: int, + world_size: int, + local_rank: int, + local_world_size: int, + master_addr: str, + master_port: str, + gpu_ids: List[int], +): + """Initialize torch distributed backend""" + if init_method == "env": + os.environ["MASTER_ADDR"] = str(master_addr) + os.environ["MASTER_PORT"] = str(master_port) + url = "env://" + elif init_method == "tcp": + url = f"tcp://{master_addr}:{master_port}" + else: + raise ValueError( + f"The provided init_method (" + f"{init_method}) is not supported. Must " + f"be either 'env' or 'tcp'." + ) + + if backend == "nccl": + # Same as in Ray Train + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" + # All workers on a same node should share the same set of + # visible GPUs. Otherwise they can't talk among themselves. 
+ os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gid) for gid in gpu_ids) + if "NCCL_SOCKET_IFNAME" not in os.environ: + os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_NCCL_SOCKET_IFNAME + + dist.init_process_group( + backend=backend, + init_method=url, + rank=rank, + world_size=world_size, + timeout=timedelta(seconds=1800), + ) + + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["LOCAL_WORLD_SIZE"] = str(local_world_size) + + +def _get_node_and_gpu_ids(): + """Returns the node_id and gpu_ids for this worker.""" + node_id = ray.get_runtime_context().get_node_id() + gpu_ids = ray.get_gpu_ids() + return node_id, gpu_ids + + +def init_torch_dist_process_group( + workers: List[ActorHandle], + backend: str = "gloo", + init_method: str = "env", +) -> List[int]: + """Initialize a torch distributed process group. + + Note: this util assumes that the order of the workers passed in + are their global ranks. + + Args: + workers: A list of TorchDistributedWorker actors. + backend: The torch distributed backend to use, + possible choices are "gloo" or "nccl". + init_method: The initialization method to use, + possible choices are "env" or "tcp". + + Returns: + Local ranks on their respective nodes for the list of workers. + """ + if not dist.is_available(): + raise RuntimeError("Distributed torch is not available.") + + # Build a map from node_id to workers on that node. + node_and_gpu_ids = ray.get( + [w.execute.remote(_get_node_and_gpu_ids) for w in workers] + ) + # All the workers on a specific node. + node_to_workers = defaultdict(list) + # All the gpu ids visible to all the workers on a specific node. + node_to_gpu_ids = defaultdict(set) + for i, (node_id, gpu_ids) in enumerate(node_and_gpu_ids): + node_to_workers[node_id].append(i) + # Force list. + if not isinstance(gpu_ids, list): + gpu_ids = [gpu_ids] + # It is possible for a worker to have access to multiple GPUs. 
+ for gpu_id in gpu_ids: + node_to_gpu_ids[node_id].add(gpu_id) + + # Assume the first worker is the master. + master_addr, master_port = ray.get(workers[0].execute.remote(get_address_and_port)) + + setup_futures = [] + world_size = len(workers) + local_ranks = [] + for rank, worker in enumerate(workers): + node_id = node_and_gpu_ids[rank][0] + local_rank = node_to_workers[node_id].index(rank) + local_world_size = len(node_to_workers[node_id]) + setup_futures.append( + worker.execute.remote( + _init_torch_distributed, + init_method=init_method, + backend=backend, + rank=rank, + world_size=world_size, + local_rank=local_rank, + local_world_size=local_world_size, + master_addr=master_addr, + master_port=master_port, + # NOTE: list() on a set does not guarantee sorted order; + # sort explicitly if CUDA_VISIBLE_DEVICES ordering matters. + gpu_ids=list(node_to_gpu_ids[node_id]), + ) + ) + local_ranks.append(local_rank) + + # Wait for all workers to join the process group. + ray.get(setup_futures) + + return local_ranks + + +def _shutdown_torch_distributed(): + """Shutdown torch distributed backend""" + dist.destroy_process_group() + + if not torch.cuda.is_available(): + return + + # Clean up cuda memory.
+ devices = get_device() + if not isinstance(devices, list): + devices = [devices] + for device in devices: + with torch.cuda.device(device): + torch.cuda.empty_cache() + + +def shutdown_torch_dist_process_group(workers: List[ActorHandle]): + ray.get([w.execute.remote(_shutdown_torch_distributed) for w in workers]) diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 76a03b99daa8..3390ea11af0c 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -4,12 +4,12 @@ import logging import os import time -from distutils.version import StrictVersion from functools import lru_cache, partial from typing import Any, Dict, List, Optional, Set, Tuple import boto3 import botocore +from packaging.version import Version from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import ( CloudwatchHelper as cwh, @@ -58,7 +58,7 @@ # todo: cli_logger should handle this assert properly # this should probably also happens somewhere else -assert StrictVersion(boto3.__version__) >= StrictVersion( +assert Version(boto3.__version__) >= Version( "1.4.8" ), "Boto3 version >= 1.4.8 required, try `pip install -U boto3`" diff --git a/python/ray/autoscaler/_private/gcp/node_provider.py b/python/ray/autoscaler/_private/gcp/node_provider.py index c45a762a5daa..f8a5d684d870 100644 --- a/python/ray/autoscaler/_private/gcp/node_provider.py +++ b/python/ray/autoscaler/_private/gcp/node_provider.py @@ -1,3 +1,4 @@ +import concurrent.futures import logging import time from functools import wraps @@ -179,23 +180,38 @@ def create_node(self, base_config: dict, tags: dict, count: int) -> Dict[str, di ) # type: List[Tuple[dict, str]] return {instance_id: result for result, instance_id in results} + def _thread_unsafe_terminate_node(self, node_id: str): + # Assumes the global lock is held for the duration of this operation. 
+ # The lock may be held by a different thread if in `terminate_nodes()` case. + logger.info("NodeProvider: {}: Terminating node".format(node_id)) + resource = self._get_resource_depending_on_node_name(node_id) + try: + result = resource.delete_instance( + node_id=node_id, + ) + except googleapiclient.errors.HttpError as http_error: + if http_error.resp.status == 404: + logger.warning( + f"Tried to delete the node with id {node_id} " + "but it was already gone." + ) + else: + raise http_error from None + return result + @_retry def terminate_node(self, node_id: str): with self.lock: - resource = self._get_resource_depending_on_node_name(node_id) - try: - result = resource.delete_instance( - node_id=node_id, - ) - except googleapiclient.errors.HttpError as http_error: - if http_error.resp.status == 404: - logger.warning( - f"Tried to delete the node with id {node_id} " - "but it was already gone." - ) - else: - raise http_error from None - return result + self._thread_unsafe_terminate_node(node_id) + + def terminate_nodes(self, node_ids: List[str]): + if not node_ids: + return None + + with self.lock, concurrent.futures.ThreadPoolExecutor() as executor: + result = executor.map(self._thread_unsafe_terminate_node, node_ids) + + return list(result) @_retry def _get_node(self, node_id: str) -> GCPNode: diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index 14faf14fa8e9..f15e109fc9d4 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -16,7 +16,6 @@ import ray._private.ray_constants as ray_constants import ray._private.utils from ray._private.event.event_logger import get_event_logger -from ray._private.gcs_pubsub import GcsPublisher from ray._private.ray_logging import setup_component_logger from ray._raylet import GcsClient from ray.autoscaler._private.autoscaler import StandardAutoscaler @@ -560,7 +559,7 @@ def _handle_failure(self, error): _internal_kv_put( 
ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True ) - gcs_publisher = GcsPublisher(address=self.gcs_address) + gcs_publisher = ray._raylet.GcsPublisher(address=self.gcs_address) from ray._private.utils import publish_error_to_driver publish_error_to_driver( diff --git a/python/ray/autoscaler/aws/BUILD b/python/ray/autoscaler/aws/BUILD index 4e733cfe26be..d7fff50db624 100644 --- a/python/ray/autoscaler/aws/BUILD +++ b/python/ray/autoscaler/aws/BUILD @@ -1,5 +1,11 @@ filegroup( - name = "example", - data = glob(["example-*.yaml"]), - visibility = ["//python/ray/tests:__pkg__"], + name = "example", + data = glob(["example-*.yaml"]), + visibility = ["//python/ray/tests:__pkg__"], +) + +filegroup( + name = "test_configs", + data = glob(["tests/*.yaml"]), + visibility = ["//release:__pkg__"], ) diff --git a/python/ray/autoscaler/aws/development-example.yaml b/python/ray/autoscaler/aws/development-example.yaml index 467e755247ee..832bc1e0118d 100644 --- a/python/ray/autoscaler/aws/development-example.yaml +++ b/python/ray/autoscaler/aws/development-example.yaml @@ -59,7 +59,7 @@ setup_commands: - git clone https://github.com/ray-project/ray || true - ray/ci/env/install-bazel.sh - cd ray/python/ray/dashboard/client; npm ci; npm run build - - pip install boto3==1.4.8 cython==0.29.32 aiohttp grpcio psutil setproctitle + - pip install boto3>=1.4.8 cython==0.29.32 aiohttp grpcio psutil setproctitle - cd ray/python; pip install -e . --verbose # Command to start ray on the head node. You don't need to change this. diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index 49df110fc64c..18d7b88ef5a7 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -80,6 +80,7 @@ available_node_types: - DeviceName: /dev/sda1 Ebs: VolumeSize: 140 + VolumeType: gp3 # Additional options in the boto docs. 
ray.worker.default: # The minimum number of worker nodes of this type to launch. diff --git a/python/ray/autoscaler/aws/example-gpu-docker.yaml b/python/ray/autoscaler/aws/example-gpu-docker.yaml index 85c65ae416dd..6daac090092c 100644 --- a/python/ray/autoscaler/aws/example-gpu-docker.yaml +++ b/python/ray/autoscaler/aws/example-gpu-docker.yaml @@ -120,7 +120,7 @@ setup_commands: [] # Custom commands that will be run on the head node after common setup. head_setup_commands: - - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions + - pip install boto3>=1.4.8 # 1.4.8 adds InstanceMarketOptions # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/aws/example-java.yaml b/python/ray/autoscaler/aws/example-java.yaml index 6563d6bf7be4..cc3cd47ac016 100644 --- a/python/ray/autoscaler/aws/example-java.yaml +++ b/python/ray/autoscaler/aws/example-java.yaml @@ -60,7 +60,7 @@ setup_commands: - python3 -m pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl # Custom commands that will be run on the head node after common setup. head_setup_commands: - - python3 -m pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions + - python3 -m pip install boto3>=1.4.8 # 1.4.8 adds InstanceMarketOptions # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. 
diff --git a/python/ray/autoscaler/aws/tests/aws_compute.yaml b/python/ray/autoscaler/aws/tests/aws_compute.yaml index 88a9dd055311..1ef4e02ba1e8 100644 --- a/python/ray/autoscaler/aws/tests/aws_compute.yaml +++ b/python/ray/autoscaler/aws/tests/aws_compute.yaml @@ -1,4 +1,4 @@ -cloud_id: cld_17WvYIBBkdgLwEUNcLeRAE +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 aws: diff --git a/python/ray/autoscaler/aws/tests/aws_launch_and_verify_cluster.py b/python/ray/autoscaler/aws/tests/aws_launch_and_verify_cluster.py deleted file mode 100644 index 17253f8c4fc3..000000000000 --- a/python/ray/autoscaler/aws/tests/aws_launch_and_verify_cluster.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -This script automates the process of launching and verifying a Ray cluster using a given -cluster configuration file. It also handles cluster cleanup before and after the -verification process. The script requires two command-line arguments: the path to the -cluster configuration file and an optional number of retries for the verification step. - -Usage: - python aws_launch_and_verify_cluster.py [retries] - -Example: - python aws_launch_and_verify_cluster.py /path/to/cluster_config.yaml 5 -""" -import os -import subprocess -import sys -import time -from pathlib import Path - -import boto3 - - -def check_arguments(args): - """ - Check command line arguments and return the cluster configuration file path and the - number of retries. - - Args: - args: The list of command line arguments. - - Returns: - A tuple containing the cluster config file path and the number of retries. - - Raises: - SystemExit: If an incorrect number of command line arguments is provided. - """ - if len(args) < 2: - print( - "Error: Please provide a path to the cluster configuration file as a " - "command line argument." - ) - sys.exit(1) - return args[1], int(args[2]) if len(args) >= 3 else 3 - - -def check_file(file_path): - """ - Check if the provided file path is valid and readable. 
- - Args: - file_path: The path of the file to check. - - Raises: - SystemExit: If the file is not readable or does not exist. - """ - if not file_path.is_file() or not os.access(file_path, os.R_OK): - print(f"Error: Cannot read cluster configuration file: {file_path}") - sys.exit(1) - - -def download_ssh_key(): - """Download the ssh key from the S3 bucket to the local machine.""" - print("======================================") - print("Downloading ssh key...") - # Create a Boto3 client to interact with S3 - s3_client = boto3.client("s3", region_name="us-west-2") - - # Set the name of the S3 bucket and the key to download - bucket_name = "oss-release-test-ssh-keys" - key_name = "ray-autoscaler_59_us-west-2.pem" - - # Download the key from the S3 bucket to a local file - local_key_path = os.path.expanduser(f"~/.ssh/{key_name}") - s3_client.download_file(bucket_name, key_name, local_key_path) - - # Set permissions on the key file - os.chmod(local_key_path, 0o400) - - -def cleanup_cluster(cluster_config): - """ - Clean up the cluster using the given cluster configuration file. - - Args: - cluster_config: The path of the cluster configuration file. - """ - print("======================================") - print("Cleaning up cluster...") - subprocess.run(["ray", "down", "-v", "-y", str(cluster_config)], check=True) - - -def run_ray_commands(cluster_config, retries): - """ - Run the necessary Ray commands to start a cluster, verify Ray is running, and clean - up the cluster. - - Args: - cluster_config: The path of the cluster configuration file. - retries: The number of retries for the verification step. 
- """ - print("======================================") - cleanup_cluster(cluster_config) - - print("======================================") - print("Starting new cluster...") - subprocess.run(["ray", "up", "-v", "-y", str(cluster_config)], check=True) - - print("======================================") - print("Verifying Ray is running...") - - success = False - count = 0 - while count < retries: - try: - subprocess.run( - [ - "ray", - "exec", - "-v", - str(cluster_config), - "python -c 'import ray; ray.init(\"localhost:6379\")'", - ], - check=True, - ) - success = True - break - except subprocess.CalledProcessError: - count += 1 - print(f"Verification failed. Retry attempt {count} of {retries}...") - time.sleep(5) - - if not success: - print("======================================") - print( - f"Error: Verification failed after {retries} attempts. Cleaning up cluster " - "before exiting..." - ) - cleanup_cluster(cluster_config) - print("======================================") - print("Exiting script.") - sys.exit(1) - - print("======================================") - print("Ray verification successful.") - - cleanup_cluster(cluster_config) - - print("======================================") - print("Finished executing script.") - - -if __name__ == "__main__": - cluster_config, retries = check_arguments(sys.argv) - cluster_config = Path(cluster_config) - check_file(cluster_config) - - print(f"Using cluster configuration file: {cluster_config}") - print(f"Number of retries for 'verify ray is running' step: {retries}") - - download_ssh_key() - run_ray_commands(cluster_config, retries) diff --git a/python/ray/autoscaler/gcp/BUILD b/python/ray/autoscaler/gcp/BUILD index 4e733cfe26be..c587b2d2fc80 100644 --- a/python/ray/autoscaler/gcp/BUILD +++ b/python/ray/autoscaler/gcp/BUILD @@ -3,3 +3,9 @@ filegroup( data = glob(["example-*.yaml"]), visibility = ["//python/ray/tests:__pkg__"], ) + +filegroup( + name = "test_configs", + data = glob(["tests/*.yaml"]), + 
visibility = ["//release:__pkg__"], +) \ No newline at end of file diff --git a/python/ray/autoscaler/gcp/example-minimal.yaml b/python/ray/autoscaler/gcp/example-minimal.yaml index c8914d7800e8..365d5df80e5c 100644 --- a/python/ray/autoscaler/gcp/example-minimal.yaml +++ b/python/ray/autoscaler/gcp/example-minimal.yaml @@ -1,13 +1,8 @@ -# A unique identifier for the head node and workers of this cluster. +auth: + ssh_user: ubuntu cluster_name: minimal - -# Cloud-provider specific configuration. provider: - type: gcp - region: us-west1 - availability_zone: us-west1-a - project_id: null # Globally unique project id - -# How Ray will authenticate with newly launched nodes. -auth: - ssh_user: ubuntu + availability_zone: us-west1-a + project_id: null # TODO: set your GCP project ID here + region: us-west1 + type: gcp diff --git a/python/ray/autoscaler/gcp/tests/gce_config.yaml b/python/ray/autoscaler/gcp/tests/gce_config.yaml new file mode 100644 index 000000000000..028cc7ce2a32 --- /dev/null +++ b/python/ray/autoscaler/gcp/tests/gce_config.yaml @@ -0,0 +1,11 @@ +base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} +debian_packages: [] + +python: + pip_packages: [] + conda_packages: [] + +post_build_cmds: + - pip3 uninstall -y ray && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - pip3 install -U ray[default] + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml new file mode 100644 index 000000000000..c6d1a6729fa0 --- /dev/null +++ b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml @@ -0,0 +1,27 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-c + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: n2-standard-32 # m5.8xlarge + +worker_node_types: [] + +gcp_advanced_configurations_json: + 
instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 + +#aws: +# BlockDeviceMappings: +# - DeviceName: /dev/sda1 +# Ebs: +# DeleteOnTermination: true +# VolumeSize: 500 diff --git a/python/ray/autoscaler/launch_and_verify_cluster.py b/python/ray/autoscaler/launch_and_verify_cluster.py new file mode 100644 index 000000000000..6e130d47c60a --- /dev/null +++ b/python/ray/autoscaler/launch_and_verify_cluster.py @@ -0,0 +1,223 @@ +""" +This script automates the process of launching and verifying a Ray cluster using a given +cluster configuration file. It also handles cluster cleanup before and after the +verification process. The script requires one command-line argument: the path to the +cluster configuration file. + +Usage: + python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES] + + +Example: + python launch_and_verify_cluster.py --retries 5 --no-config-cache + /path/to/cluster_config.yaml +""" +import argparse +import os +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +import boto3 +import yaml + + +def check_arguments(): + """ + Check command line arguments and return the cluster configuration file path, the + number of retries, and the value of the --no-config-cache flag. + + Returns: + A tuple containing the cluster config file path, the number of retries, and the + value of the --no-config-cache flag. 
+ """ + parser = argparse.ArgumentParser(description="Launch and verify a Ray cluster") + parser.add_argument( + "--no-config-cache", + action="store_true", + help="Pass the --no-config-cache flag to Ray CLI commands", + ) + parser.add_argument( + "--retries", + type=int, + default=3, + help="Number of retries for verifying Ray is running (default: 3)", + ) + parser.add_argument( + "cluster_config", type=str, help="Path to the cluster configuration file" + ) + args = parser.parse_args() + + return args.cluster_config, args.retries, args.no_config_cache + + +def check_file(file_path): + """ + Check if the provided file path is valid and readable. + + Args: + file_path: The path of the file to check. + + Raises: + SystemExit: If the file is not readable or does not exist. + """ + if not file_path.is_file() or not os.access(file_path, os.R_OK): + print(f"Error: Cannot read cluster configuration file: {file_path}") + sys.exit(1) + + +def download_ssh_key(): + """Download the ssh key from the S3 bucket to the local machine.""" + print("======================================") + print("Downloading ssh key...") + # Create a Boto3 client to interact with S3 + s3_client = boto3.client("s3", region_name="us-west-2") + + # Set the name of the S3 bucket and the key to download + bucket_name = "aws-cluster-launcher-test" + key_name = "ray-autoscaler_59_us-west-2.pem" + + # Download the key from the S3 bucket to a local file + local_key_path = os.path.expanduser(f"~/.ssh/{key_name}") + if not os.path.exists(os.path.dirname(local_key_path)): + os.makedirs(os.path.dirname(local_key_path)) + s3_client.download_file(bucket_name, key_name, local_key_path) + + # Set permissions on the key file + os.chmod(local_key_path, 0o400) + + +def cleanup_cluster(cluster_config): + """ + Clean up the cluster using the given cluster configuration file. + + Args: + cluster_config: The path of the cluster configuration file. 
+ """ + print("======================================") + print("Cleaning up cluster...") + subprocess.run(["ray", "down", "-v", "-y", str(cluster_config)], check=True) + + +def run_ray_commands(cluster_config, retries, no_config_cache): + """ + Run the necessary Ray commands to start a cluster, verify Ray is running, and clean + up the cluster. + + Args: + cluster_config: The path of the cluster configuration file. + retries: The number of retries for the verification step. + no_config_cache: Whether to pass the --no-config-cache flag to the ray CLI + commands. + """ + print("======================================") + cleanup_cluster(cluster_config) + + print("======================================") + print("Starting new cluster...") + cmd = ["ray", "up", "-v", "-y"] + if no_config_cache: + cmd.append("--no-config-cache") + cmd.append(str(cluster_config)) + subprocess.run(cmd, check=True) + + print("======================================") + print("Verifying Ray is running...") + + success = False + count = 0 + while count < retries: + try: + cmd = [ + "ray", + "exec", + "-v", + str(cluster_config), + "python -c 'import ray; ray.init(\"localhost:6379\")'", + ] + if no_config_cache: + cmd.append("--no-config-cache") + subprocess.run(cmd, check=True) + success = True + break + except subprocess.CalledProcessError: + count += 1 + print(f"Verification failed. Retry attempt {count} of {retries}...") + time.sleep(5) + + if not success: + print("======================================") + print( + f"Error: Verification failed after {retries} attempts. Cleaning up cluster " + "before exiting..." 
+ ) + cleanup_cluster(cluster_config) + print("======================================") + print("Exiting script.") + sys.exit(1) + + print("======================================") + print("Ray verification successful.") + + cleanup_cluster(cluster_config) + + print("======================================") + print("Finished executing script successfully.") + + +if __name__ == "__main__": + cluster_config, retries, no_config_cache = check_arguments() + cluster_config = Path(cluster_config) + check_file(cluster_config) + + print(f"Using cluster configuration file: {cluster_config}") + print(f"Number of retries for 'verify ray is running' step: {retries}") + print(f"Using --no-config-cache flag: {no_config_cache}") + + config_yaml = yaml.safe_load(cluster_config.read_text()) + provider_type = config_yaml.get("provider", {}).get("type") + if provider_type == "aws": + download_ssh_key() + run_ray_commands(cluster_config, retries, no_config_cache) + elif provider_type == "gcp": + print("======================================") + print("GCP provider detected. Skipping ssh key download step.") + # Get the active account email + account_email = ( + subprocess.run( + ["gcloud", "config", "get-value", "account"], + stdout=subprocess.PIPE, + check=True, + ) + .stdout.decode("utf-8") + .strip() + ) + print("Active account email:", account_email) + # Get the current project ID + project_id = ( + subprocess.run( + ["gcloud", "config", "get-value", "project"], + stdout=subprocess.PIPE, + check=True, + ) + .stdout.decode("utf-8") + .strip() + ) + print( + f"Injecting GCP project '{project_id}' into cluster configuration file..." 
+ ) + config_yaml["provider"]["project_id"] = project_id + + # Create a new temporary file and dump the updated configuration into it + with tempfile.NamedTemporaryFile(suffix=".yaml") as temp: + temp.write(yaml.dump(config_yaml).encode("utf-8")) + temp.flush() + cluster_config = Path(temp.name) + run_ray_commands(cluster_config, retries, no_config_cache) + + else: + print("======================================") + print("Provider type not recognized. Exiting script.") + sys.exit(1) diff --git a/python/ray/autoscaler/v2/BUILD b/python/ray/autoscaler/v2/BUILD new file mode 100644 index 000000000000..723d585ebc70 --- /dev/null +++ b/python/ray/autoscaler/v2/BUILD @@ -0,0 +1,22 @@ +# -------------------------------------------------------------------- +# Tests from the python/ray/autoscaler/v2/tests directory. +# Covers all tests starting with `test_`. +# Please keep these sorted alphabetically. +# -------------------------------------------------------------------- +load("//bazel:python.bzl", "py_test_module_list") + +py_test( + name = "test_instance_storage", + size = "small", + srcs = ["tests/test_instance_storage.py"], + tags = ["team:core"], + deps = ["//:ray_lib",], +) + +py_test( + name = "test_storage", + size = "small", + srcs = ["tests/test_storage.py"], + tags = ["team:core"], + deps = ["//:ray_lib",], +) \ No newline at end of file diff --git a/python/ray/autoscaler/v2/instance_manager/config.py b/python/ray/autoscaler/v2/instance_manager/config.py new file mode 100644 index 000000000000..48f81237e206 --- /dev/null +++ b/python/ray/autoscaler/v2/instance_manager/config.py @@ -0,0 +1,102 @@ +import copy +from typing import Any, Dict, List + +from ray.autoscaler._private.util import hash_runtime_conf +from ray.core.generated.instance_manager_pb2 import Instance + + +class NodeProviderConfig(object): + """ + NodeProviderConfig is the helper class to provide instance + related configs. 
+ """ + + def __init__(self, node_configs: Dict[str, Any]) -> None: + self._sync_continuously = False + self.update_configs(node_configs) + + def update_configs(self, node_configs: Dict[str, Any]) -> None: + self._node_configs = node_configs + self._calculate_hashes() + self._sync_continuously = self._node_configs.get( + "generate_file_mounts_contents_hash", True + ) + + def _calculate_hashes(self) -> None: + self._runtime_hash, self._file_mounts_contents_hash = hash_runtime_conf( + self._node_configs["file_mounts"], + self._node_configs["cluster_synced_files"], + [ + self._node_configs["worker_setup_commands"], + self._node_configs["worker_start_ray_commands"], + ], + generate_file_mounts_contents_hash=self._node_configs.get( + "generate_file_mounts_contents_hash", True + ), + ) + + def get_node_config(self, instance_type_name: str) -> Dict[str, Any]: + return copy.deepcopy( + self._node_configs["available_node_types"][instance_type_name][ + "node_config" + ] + ) + + def get_docker_config(self, instance_type_name: str) -> Dict[str, Any]: + if "docker" not in self._node_configs: + return {} + docker_config = copy.deepcopy(self._node_configs.get("docker", {})) + node_specific_docker_config = self._node_configs["available_node_types"][ + instance_type_name + ].get("docker", {}) + docker_config.update(node_specific_docker_config) + return docker_config + + def get_worker_start_ray_commands(self, instance: Instance) -> List[str]: + if ( + instance.num_successful_updates > 0 + and not self._node_config_provider.restart_only + ): + return [] + return self._node_configs["worker_start_ray_commands"] + + def get_worker_setup_commands(self, instance: Instance) -> List[str]: + if ( + instance.num_successful_updates > 0 + and self._node_config_provider.restart_only + ): + return [] + + return self._node_configs["available_node_types"][instance.name][ + "worker_setup_commands" + ] + + def get_node_type_specific_config( + self, instance_type_name: str, config_name: str + ) -> 
Any: + config = self._node_config_provider.get_config(config_name) + node_specific_config = self._node_configs["available_node_types"][ + instance_type_name + ] + if config_name in node_specific_config: + config = node_specific_config[config_name] + return config + + def get_config(self, config_name, default=None) -> Any: + return self._node_configs.get(config_name, default) + + @property + def restart_only(self) -> bool: + return self._node_configs.get("restart_only", False) + + @property + def no_restart(self) -> bool: + return self._node_configs.get("no_restart", False) + + @property + def runtime_hash(self) -> str: + return self._runtime_hash + + @property + def file_mounts_contents_hash(self) -> str: + return self._file_mounts_contents_hash diff --git a/python/ray/autoscaler/v2/instance_manager/instance_storage.py b/python/ray/autoscaler/v2/instance_manager/instance_storage.py new file mode 100644 index 000000000000..990cb1f00733 --- /dev/null +++ b/python/ray/autoscaler/v2/instance_manager/instance_storage.py @@ -0,0 +1,199 @@ +import logging +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass +from typing import Dict, List, Optional, Set, Tuple + +from ray.autoscaler.v2.instance_manager.storage import Storage, StoreStatus +from ray.core.generated.instance_manager_pb2 import Instance + +logger = logging.getLogger(__name__) + + +@dataclass +class InstanceUpdateEvent: + """Notifies the status change of an instance.""" + + instance_id: str + new_status: int + + +class InstanceUpdatedSuscriber(metaclass=ABCMeta): + """Subscribers to instance status changes.""" + + @abstractmethod + def notify(self, events: List[InstanceUpdateEvent]) -> None: + pass + + +class InstanceStorage: + """Instance storage stores the states of instances in the storage. 
It also + allows users to subscribe to instance status changes to trigger reconciliation + with cloud provider.""" + + def __init__( + self, + cluster_id: str, + storage: Storage, + status_change_subscriber: Optional[InstanceUpdatedSuscriber] = None, + ) -> None: + self._storage = storage + self._cluster_id = cluster_id + self._table_name = f"instance_table@{cluster_id}" + self._status_change_subscriber = status_change_subscriber + + def batch_upsert_instances( + self, + updates: List[Instance], + expected_storage_version: Optional[int] = None, + ) -> StoreStatus: + """Upsert instances into the storage. If the instance already exists, + it will be updated. Otherwise, it will be inserted. If the + expected_storage_version is specified, the update will fail if the + current storage version does not match the expected version. + + Note the version of the upserted instances will be set to the current + storage version. + + Args: + updates: A list of instances to be upserted. + expected_storage_version: The expected storage version. + + Returns: + StoreStatus: A tuple of (success, storage_version). 
+ """ + mutations = {} + version = self._storage.get_version() + # handle version mismatch + if expected_storage_version and expected_storage_version != version: + return StoreStatus(False, version) + + for instance in updates: + # the instance version is set to 0, it will be + # populated by the storage entry's verion on read + instance.version = 0 + mutations[instance.instance_id] = instance.SerializeToString() + + result, version = self._storage.batch_update( + self._table_name, mutations, {}, expected_storage_version + ) + + if result and self._status_change_subscriber: + self._status_change_subscriber.notify( + [ + InstanceUpdateEvent( + instance_id=instance.instance_id, + new_status=instance.status, + ) + for instance in updates + ], + ) + + return StoreStatus(result, version) + + def upsert_instance( + self, + instance: Instance, + expected_instance_version: Optional[int] = None, + expected_storage_verison: Optional[int] = None, + ) -> StoreStatus: + """Upsert an instance in the storage. + If the expected_instance_version is specified, the update will fail + if the current instance version does not match the expected version. + Similarly, if the expected_storage_version is + specified, the update will fail if the current storage version does not + match the expected version. + + Note the version of the upserted instances will be set to the current + storage version. + + Args: + instance: The instance to be updated. + expected_instance_version: The expected instance version. + expected_storage_version: The expected storage version. + + Returns: + StoreStatus: A tuple of (success, storage_version). 
+ """ + # the instance version is set to 0, it will be + # populated by the storage entry's verion on read + instance.version = 0 + result, version = self._storage.update( + self._table_name, + key=instance.instance_id, + value=instance.SerializeToString(), + expected_entry_version=expected_instance_version, + expected_storage_version=expected_storage_verison, + insert_only=False, + ) + + if result and self._status_change_subscriber: + self._status_change_subscriber.notify( + [ + InstanceUpdateEvent( + instance_id=instance.instance_id, + new_status=instance.status, + ) + ], + ) + + return StoreStatus(result, version) + + def get_instances( + self, instance_ids: List[str] = None, status_filter: Set[int] = None + ) -> Tuple[Dict[str, Instance], int]: + """Get instances from the storage. + + Args: + instance_ids: A list of instance ids to be retrieved. If empty, all + instances will be retrieved. + + Returns: + Tuple[Dict[str, Instance], int]: A tuple of (instances, version). + The instances is a dictionary of (instance_id, instance) pairs. + """ + instance_ids = instance_ids or [] + status_filter = status_filter or set() + pairs, version = self._storage.get(self._table_name, instance_ids) + instances = {} + for instance_id, (instance_data, entry_version) in pairs.items(): + instance = Instance() + instance.ParseFromString(instance_data) + instance.version = entry_version + if status_filter and instance.status not in status_filter: + continue + instances[instance_id] = instance + return instances, version + + def batch_delete_instances( + self, instance_ids: List[str], expected_storage_version: Optional[int] = None + ) -> StoreStatus: + """Delete instances from the storage. If the expected_version is + specified, the update will fail if the current storage version does not + match the expected version. + + Args: + to_delete: A list of instances to be deleted. + expected_version: The expected storage version. 
+ + Returns: + StoreStatus: A tuple of (success, storage_version). + """ + version = self._storage.get_version() + if expected_storage_version and expected_storage_version != version: + return StoreStatus(False, version) + + result = self._storage.batch_update( + self._table_name, {}, instance_ids, expected_storage_version + ) + + if result[0] and self._status_change_subscriber: + self._status_change_subscriber.notify( + [ + InstanceUpdateEvent( + instance_id=instance_id, + new_status=Instance.GARAGE_COLLECTED, + ) + for instance_id in instance_ids + ], + ) + return result diff --git a/python/ray/autoscaler/v2/instance_manager/node_provider.py b/python/ray/autoscaler/v2/instance_manager/node_provider.py new file mode 100644 index 000000000000..6d0a3c92c9c0 --- /dev/null +++ b/python/ray/autoscaler/v2/instance_manager/node_provider.py @@ -0,0 +1,136 @@ +import logging +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Set, override + +from ray.autoscaler._private.node_launcher import BaseNodeLauncher +from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1 +from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE +from ray.autoscaler.v2.instance_manager.config import NodeProviderConfig +from ray.core.generated.instance_manager_pb2 import Instance, InstanceType + +logger = logging.getLogger(__name__) + + +class NodeProvider(metaclass=ABCMeta): + """NodeProvider defines the interface for + interacting with cloud provider, such as AWS, GCP, Azure, etc. + """ + + @abstractmethod + def create_nodes(self, instance_type: InstanceType, count: int) -> List[str]: + """Create new nodes synchronously, returns all non-terminated nodes in the cluster. + Note that create_nodes could fail partially. 
+ """ + pass + + @abstractmethod + def async_terminate_nodes(self, cloud_instance_ids: List[str]) -> None: + """ + Terminate nodes asynchronously, returns immediately.""" + pass + + @abstractmethod + def get_non_terminated_nodes( + self, + ) -> Dict[str, Instance]: + """Get all non-terminated nodes in the cluster""" + pass + + @abstractmethod + def get_nodes_by_cloud_id( + self, + cloud_instance_ids: List[str], + ) -> Dict[str, Instance]: + """Get nodes by node ids, including terminated nodes""" + pass + + @abstractmethod + def is_readonly(self) -> bool: + return False + + +class NodeProviderAdapter(NodeProvider): + """ + Warps a NodeProviderV1 to a NodeProvider. + """ + + def __init__( + self, + provider: NodeProviderV1, + node_launcher: BaseNodeLauncher, + instance_config_provider: NodeProviderConfig, + ) -> None: + super().__init__() + self._provider = provider + self._node_launcher = node_launcher + self._config = instance_config_provider + + def _filter_instances( + self, + instances: Dict[str, Instance], + instance_ids_filter: Set[str], + instance_states_filter: Set[int], + ) -> Dict[str, Instance]: + filtered = {} + for instance_id, instance in instances.items(): + if instance_ids_filter and instance_id not in instance_ids_filter: + continue + if instance_states_filter and instance.state not in instance_states_filter: + continue + filtered[instance_id] = instance + return filtered + + @override + def create_nodes(self, instance_type: InstanceType, count: int) -> List[Instance]: + result = self._node_launcher.launch_node( + self._config.get_node_config(instance_type.name), + count, + instance_type.name, + ) + # TODO: we should handle failures where the instance type is + # not available + if result: + return [ + self._get_instance(cloud_instance_id) + for cloud_instance_id in result.keys() + ] + return [] + + @override + def async_terminate_nodes(self, clould_instance_ids: List[str]) -> None: + self._provider.terminate_node(clould_instance_ids) + + @override 
+ def is_readonly(self) -> bool: + return self._provider.is_readonly() + + @override + def get_non_terminated_nodes(self): + clould_instance_ids = self._provider.non_terminated_nodes() + return self.get_nodes_by_id(clould_instance_ids) + + @override + def get_nodes_by_cloud_id( + self, + cloud_instance_ids: List[str], + ) -> Dict[str, Instance]: + instances = {} + for cloud_instance_id in cloud_instance_ids: + instances[cloud_instance_id] = self._get_instance(cloud_instance_id) + return instances + + def _get_instance(self, cloud_instance_id: str) -> Instance: + instance = Instance() + instance.cloud_instance_id = cloud_instance_id + if self._provider.is_running(cloud_instance_id): + instance.state = Instance.STARTING + elif self._provider.is_terminated(cloud_instance_id): + instance.state = Instance.STOPPED + else: + instance.state = Instance.INSTANCE_STATUS_UNSPECIFIED + instance.interal_ip = self._provider.internal_ip(cloud_instance_id) + instance.external_ip = self._provider.external_ip(cloud_instance_id) + instance.instance_type = self._provider.node_tags(cloud_instance_id)[ + TAG_RAY_USER_NODE_TYPE + ] + return instance diff --git a/python/ray/autoscaler/v2/instance_manager/ray_installer.py b/python/ray/autoscaler/v2/instance_manager/ray_installer.py new file mode 100644 index 000000000000..f4f936cbe2b8 --- /dev/null +++ b/python/ray/autoscaler/v2/instance_manager/ray_installer.py @@ -0,0 +1,66 @@ +import logging + +from ray.autoscaler._private.updater import NodeUpdater +from ray.autoscaler._private.util import with_head_node_ip +from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1 +from ray.autoscaler.v2.instance_manager.config import NodeProviderConfig +from ray.core.generated.instance_manager_pb2 import Instance + +logger = logging.getLogger(__name__) + + +class RayInstaller(object): + """ + RayInstaller is responsible for installing ray on the target instance. 
+ """ + + def __init__( + self, + provider: NodeProviderV1, + config: NodeProviderConfig, + ) -> None: + self._provider = provider + self._config = config + + def install_ray(self, instance: Instance, head_node_ip: str) -> bool: + """ + Install ray on the target instance synchronously. + """ + + setup_commands = self._config.get_worker_setup_commands(instance) + ray_start_commands = self._config.get_worker_start_ray_commands(instance) + docker_config = self._config.get_docker_config(instance) + + logger.info( + f"Creating new (spawn_updater) updater thread for node" + f" {instance.cloud_instance_id}." + ) + updater = NodeUpdater( + node_id=instance.instance_id, + provider_config=self._config.get_config("provider"), + provider=self._provider, + auth_config=self._config.get_config("auth"), + cluster_name=self._config.get_config("cluster_name"), + file_mounts=self._config.get_config("file_mounts"), + initialization_commands=with_head_node_ip( + self.get_node_type_specific_config( + instance.instance_id, "initialization_commands" + ), + head_node_ip, + ), + setup_commands=with_head_node_ip(setup_commands, head_node_ip), + ray_start_commands=with_head_node_ip(ray_start_commands, head_node_ip), + runtime_hash=self._config.runtime_hash, + file_mounts_contents_hash=self._config.file_mounts_contents_hash, + is_head_node=False, + cluster_synced_files=self._config.get_config("cluster_synced_files"), + rsync_options={ + "rsync_exclude": self._config.get_config("rsync_exclude"), + "rsync_filter": self._config.get_config("rsync_filter"), + }, + use_internal_ip=True, + docker_config=docker_config, + node_resources=instance.node_resources, + ) + updater.run() + # TODO: handle failures diff --git a/python/ray/autoscaler/v2/instance_manager/storage.py b/python/ray/autoscaler/v2/instance_manager/storage.py new file mode 100644 index 000000000000..40447bf79696 --- /dev/null +++ b/python/ray/autoscaler/v2/instance_manager/storage.py @@ -0,0 +1,180 @@ +import copy +from abc import 
ABCMeta, abstractmethod +from collections import defaultdict, namedtuple +from threading import Lock +from typing import Dict, List, Optional, Tuple + +StoreStatus = namedtuple("StoreStatus", ["success", "version"]) +VersionedValue = namedtuple("VersionedValue", ["value", "version"]) + + +class Storage(metaclass=ABCMeta): + """Interface for a storage backend that stores the state of nodes in the cluster. + + The storage is thread-safe. + + The storage is versioned, which means that each successful stage change to the + storage will bump the version number. The version number can be used to + implement optimistic concurrency control. + + Each entry in the storage table is also versioned. The version number of an entry + is the last version number when the entry is updated. + """ + + @abstractmethod + def batch_update( + self, + table: str, + mutation: Optional[Dict[str, str]] = None, + deletion: Optional[List[str]] = None, + expected_storage_version: Optional[int] = None, + ) -> StoreStatus: + """Batch update the storage table. This method is atomic. + + Args: + table: The name of the table. + mutation: A dictionary of key-value pairs to be updated. + deletion: A list of keys to be deleted. + expected_storage_version: The expected storage version. The + update will fail if the version does not match the + current storage version. + + Returns: + StoreStatus: A tuple of (success, version). If the update is + successful, returns (True, new_version). + Otherwise, returns (False, current_version). + """ + raise NotImplementedError("batch_update() has to be implemented") + + @abstractmethod + def update( + self, + table: str, + key: str, + value: str, + expected_entry_version: Optional[int] = None, + insert_only: bool = False, + ) -> StoreStatus: + """Update a single entry in the storage table. + + Args: + table: The name of the table. + key: The key of the entry. + value: The value of the entry. + expected_entry_version: The expected version of the entry. 
+ The update will fail if the version does not match the current + version of the entry. + insert_only: If True, the update will + fail if the entry already exists. + Returns: + StoreStatus: A tuple of (success, version). If the update is + successful, returns (True, new_version). Otherwise, + returns (False, current_version). + """ + raise NotImplementedError("update() has to be implemented") + + @abstractmethod + def get_all(self, table: str) -> Tuple[Dict[str, Tuple[str, int]], int]: + raise NotImplementedError("get_all() has to be implemented") + + @abstractmethod + def get( + self, table: str, keys: List[str] + ) -> Tuple[Dict[str, Tuple[str, int]], int]: + """Get a list of entries from the storage table. + + Args: + table: The name of the table. + keys: A list of keys to be retrieved. If the list is empty, + all entries in the table will be returned. + + Returns: + Tuple[Dict[str, VersionedValue], int]: A tuple of + (entries, storage_version). The entries is a dictionary of + (key, (value, entry_version)) pairs. The entry_version is the + version of the entry when it was last updated. The + storage_version is the current storage version. + """ + raise NotImplementedError("get() has to be implemented") + + @abstractmethod + def get_version(self) -> int: + """Get the current storage version. + + Returns: + int: The current storage version. + """ + raise NotImplementedError("get_version() has to be implemented") + + +class InMemoryStorage(Storage): + """An in-memory implementation of the Storage interface. 
This implementation + is not durable""" + + def __init__(self): + self._version = 0 + self._tables = defaultdict(dict) + self._lock = Lock() + + def batch_update( + self, + table: str, + mutation: Dict[str, str] = None, + deletion: List[str] = None, + expected_version: Optional[int] = None, + ) -> StoreStatus: + mutation = mutation if mutation else {} + deletion = deletion if deletion else [] + with self._lock: + if expected_version is not None and expected_version != self._version: + return StoreStatus(False, self._version) + self._version += 1 + key_value_pairs_with_version = { + key: VersionedValue(value, self._version) + for key, value in mutation.items() + } + self._tables[table].update(key_value_pairs_with_version) + for deleted_key in deletion: + self._tables[table].pop(deleted_key, None) + return StoreStatus(True, self._version) + + def update( + self, + table: str, + key: str, + value: str, + expected_entry_version: Optional[int] = None, + expected_storage_version: Optional[int] = None, + insert_only: bool = False, + ) -> StoreStatus: + with self._lock: + if ( + expected_storage_version is not None + and expected_storage_version != self._version + ): + return StoreStatus(False, self._version) + if insert_only and key in self._tables[table]: + return StoreStatus(False, self._version) + _, version = self._tables[table].get(key, (None, -1)) + if expected_entry_version is not None and expected_entry_version != version: + return StoreStatus(False, self._version) + self._version += 1 + self._tables[table][key] = VersionedValue(value, self._version) + return StoreStatus(True, self._version) + + def get_all(self, table: str) -> Tuple[Dict[str, VersionedValue], int]: + with self._lock: + return (copy.deepcopy(self._tables[table]), self._version) + + def get(self, table: str, keys: List[str]) -> Tuple[Dict[str, VersionedValue], int]: + if not keys: + return self.get_all(table) + with self._lock: + result = {} + for key in keys: + if key in self._tables.get(table, 
{}): + result[key] = self._tables[table][key] + return StoreStatus(result, self._version) + + def get_version(self) -> int: + return self._version diff --git a/python/ray/autoscaler/v2/tests/test_instance_storage.py b/python/ray/autoscaler/v2/tests/test_instance_storage.py new file mode 100644 index 000000000000..881520ea6cf4 --- /dev/null +++ b/python/ray/autoscaler/v2/tests/test_instance_storage.py @@ -0,0 +1,290 @@ +# coding: utf-8 +import copy +import os +import sys + +import pytest # noqa + +from ray.autoscaler.v2.instance_manager.instance_storage import ( + InstanceStorage, + InstanceUpdatedSuscriber, + InstanceUpdateEvent, +) +from ray.autoscaler.v2.instance_manager.storage import InMemoryStorage +from ray.core.generated.instance_manager_pb2 import Instance + + +class DummySubscriber(InstanceUpdatedSuscriber): + def __init__(self): + self.events = [] + + def notify(self, events): + self.events.extend(events) + + +def create_instance( + instance_id, status=Instance.INSTANCE_STATUS_UNSPECIFIED, version=0 +): + return Instance(instance_id=instance_id, status=status, version=version) + + +def test_upsert(): + subscriber = DummySubscriber() + + storage = InstanceStorage( + cluster_id="test_cluster", + storage=InMemoryStorage(), + status_change_subscriber=subscriber, + ) + instance1 = create_instance("instance1") + instance2 = create_instance("instance2") + instance3 = create_instance("instance3") + + assert (True, 1) == storage.batch_upsert_instances( + [instance1, instance2], + expected_storage_version=None, + ) + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + instance1.version = 1 + instance2.version = 1 + entries, storage_version = storage.get_instances() + + assert storage_version == 1 + assert entries == { + "instance1": instance1, + "instance2": instance2, + } + + assert (False, 1) == storage.batch_upsert_instances( + 
[create_instance("instance1"), create_instance("instance2")], + expected_storage_version=0, + ) + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + instance2.status = Instance.IDLE + assert (True, 2) == storage.batch_upsert_instances( + [instance3, instance2], + expected_storage_version=1, + ) + + instance1.version = 1 + instance2.version = 2 + instance3.version = 2 + entries, storage_version = storage.get_instances() + + assert storage_version == 2 + assert entries == { + "instance1": instance1, + "instance2": instance2, + "instance3": instance3, + } + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance3", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.IDLE), + ] + + +def test_update(): + subscriber = DummySubscriber() + + storage = InstanceStorage( + cluster_id="test_cluster", + storage=InMemoryStorage(), + status_change_subscriber=subscriber, + ) + instance1 = create_instance("instance1") + instance2 = create_instance("instance2") + + assert (True, 1) == storage.upsert_instance(instance=instance1) + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + assert (True, 2) == storage.upsert_instance(instance=instance2) + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + assert ( + { + "instance1": create_instance("instance1", version=1), + "instance2": create_instance("instance2", version=2), + }, + 2, + ) == storage.get_instances() + + # failed because instance version is not correct + assert (False, 2) == storage.upsert_instance( + 
instance=instance1, + expected_instance_version=0, + ) + + # failed because storage version is not correct + assert (False, 2) == storage.upsert_instance( + instance=instance1, + expected_storage_verison=0, + ) + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + assert (True, 3) == storage.upsert_instance( + instance=instance2, + expected_storage_verison=2, + ) + + assert ( + { + "instance1": create_instance("instance1", version=1), + "instance2": create_instance("instance2", version=3), + }, + 3, + ) == storage.get_instances() + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + assert (True, 4) == storage.upsert_instance( + instance=instance1, + expected_instance_version=1, + ) + + assert ( + { + "instance1": create_instance("instance1", version=4), + "instance2": create_instance("instance2", version=3), + }, + 4, + ) == storage.get_instances() + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + ] + + +def test_delete(): + subscriber = DummySubscriber() + + storage = InstanceStorage( + cluster_id="test_cluster", + storage=InMemoryStorage(), + status_change_subscriber=subscriber, + ) + instance1 = create_instance("instance1") + instance2 = create_instance("instance2") + instance3 = create_instance("instance3") + + assert (True, 1) == storage.batch_upsert_instances( + [instance1, instance2, instance3], + expected_storage_version=None, + ) + + 
assert (False, 1) == storage.batch_delete_instances( + instance_ids=["instance1"], expected_storage_version=0 + ) + assert (True, 2) == storage.batch_delete_instances(instance_ids=["instance1"]) + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance3", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance1", Instance.GARAGE_COLLECTED), + ] + + assert ( + { + "instance2": create_instance("instance2", version=1), + "instance3": create_instance("instance3", version=1), + }, + 2, + ) == storage.get_instances() + + assert (True, 3) == storage.batch_delete_instances( + instance_ids=["instance2"], expected_storage_version=2 + ) + + assert ( + { + "instance3": create_instance("instance3", version=1), + }, + 3, + ) == storage.get_instances() + + assert subscriber.events == [ + InstanceUpdateEvent("instance1", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance2", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance3", Instance.INSTANCE_STATUS_UNSPECIFIED), + InstanceUpdateEvent("instance1", Instance.GARAGE_COLLECTED), + InstanceUpdateEvent("instance2", Instance.GARAGE_COLLECTED), + ] + + +def test_get_instances(): + storage = InstanceStorage( + cluster_id="test_cluster", + storage=InMemoryStorage(), + ) + instance1 = create_instance("instance1", version=1) + instance2 = create_instance("instance2", status=Instance.RUNNING, version=1) + instance3 = create_instance("instance3", status=Instance.IDLE, version=1) + + assert (True, 1) == storage.batch_upsert_instances( + [copy.deepcopy(instance1), copy.deepcopy(instance2), copy.deepcopy(instance3)], + expected_storage_version=None, + ) + + assert ( + { + "instance1": instance1, + "instance2": instance2, + "instance3": instance3, + }, + 1, + ) == storage.get_instances() + + assert ( + { + "instance1": instance1, + 
"instance2": instance2, + }, + 1, + ) == storage.get_instances(instance_ids=["instance1", "instance2"]) + + assert ({"instance2": instance2}, 1) == storage.get_instances( + instance_ids=["instance1", "instance2"], status_filter={Instance.RUNNING} + ) + + assert ( + { + "instance2": instance2, + }, + 1, + ) == storage.get_instances(status_filter={Instance.RUNNING}) + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/autoscaler/v2/tests/test_storage.py b/python/ray/autoscaler/v2/tests/test_storage.py new file mode 100644 index 000000000000..92231c89402f --- /dev/null +++ b/python/ray/autoscaler/v2/tests/test_storage.py @@ -0,0 +1,87 @@ +# coding: utf-8 +import os +import sys + +import pytest # noqa + +from ray.autoscaler.v2.instance_manager.storage import ( + InMemoryStorage, + StoreStatus, + VersionedValue, +) + + +@pytest.mark.parametrize("storage", [InMemoryStorage()]) +def test_storage(storage): + assert storage.get_version() == 0 + assert storage.get_all(table="test_table") == ({}, 0) + assert storage.get(table="test_table", keys=[]) == ({}, 0) + assert storage.get(table="test_table", keys=["key1"]) == ({}, 0) + + assert storage.batch_update( + table="test_table", mutation={"key1": "value1"} + ) == StoreStatus( + True, + 1, + ) + + assert storage.get_version() == 1 + + assert storage.get_all(table="test_table") == ( + {"key1": VersionedValue("value1", 1)}, + 1, + ) + assert storage.get(table="test_table", keys=[]) == ( + {"key1": VersionedValue("value1", 1)}, + 1, + ) + + assert storage.batch_update( + table="test_table", mutation={"key1": "value2"}, expected_version=0 + ) == StoreStatus(False, 1) + + assert storage.batch_update( + table="test_table", mutation={"key1": "value2"}, expected_version=1 + ) == StoreStatus(True, 2) + + assert storage.get_all(table="test_table") == ( + {"key1": 
VersionedValue("value2", 2)}, + 2, + ) + + assert storage.batch_update( + table="test_table", + mutation={"key2": "value3", "key3": "value4"}, + deletion=["key1"], + expected_version=2, + ) == StoreStatus(True, 3) + + assert storage.get_all(table="test_table") == ( + {"key2": VersionedValue("value3", 3), "key3": VersionedValue("value4", 3)}, + 3, + ) + + assert storage.get(table="test_table", keys=["key2", "key1"]) == ( + {"key2": VersionedValue("value3", 3)}, + 3, + ) + + assert storage.update( + table="test_table", key="key2", value="value5" + ) == StoreStatus(True, 4) + assert storage.update( + table="test_table", key="key2", value="value5", insert_only=True + ) == StoreStatus(False, 4) + assert storage.update( + table="test_table", key="key2", value="value5", expected_entry_version=3 + ) == StoreStatus(False, 4) + assert storage.update( + table="test_table", key="key2", value="value6", expected_entry_version=4 + ) == StoreStatus(True, 5) + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/dag/tests/test_class_dag.py b/python/ray/dag/tests/test_class_dag.py index bb5d72760c2e..55cff1e540c4 100644 --- a/python/ray/dag/tests/test_class_dag.py +++ b/python/ray/dag/tests/test_class_dag.py @@ -115,7 +115,7 @@ def test_actor_method_options(shared_ray_instance): def test_basic_actor_dag_constructor_invalid_options(shared_ray_instance): with pytest.raises( - ValueError, match=r".*only accepts None, 0 or a positive number.*" + ValueError, match=r".*quantity of resource num_cpus cannot be negative.*" ): a1 = Actor.options(num_cpus=-1).bind(10) invalid_dag = a1.get.bind() diff --git a/python/ray/dag/tests/test_function_dag.py b/python/ray/dag/tests/test_function_dag.py index 3d891ad1deab..2c577f10448d 100644 --- a/python/ray/dag/tests/test_function_dag.py +++ b/python/ray/dag/tests/test_function_dag.py @@ -112,7 
+112,7 @@ def b(x): # Ensure current DAG is executable assert ray.get(dag.execute()) == 4 with pytest.raises( - ValueError, match=r".*only accepts None, 0 or a positive number.*" + ValueError, match=r".*quantity of resource num_cpus cannot be negative.*" ): invalid_dag = b.options(num_cpus=-1).bind(a_ref) ray.get(invalid_dag.execute()) diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD index 9e2e0fa4f9dd..1007bc1333a0 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD @@ -42,6 +42,22 @@ py_test( deps = ["//:ray_lib", ":conftest"], ) +py_test( + name = "test_numpy_support", + size = "small", + srcs = ["tests/test_numpy_support.py"], + tags = ["team:data", "exclusive"], + deps = ["//:ray_lib", ":conftest"], +) + +py_test( + name = "test_nonstrict_mode", + size = "small", + srcs = ["tests/test_nonstrict_mode.py"], + tags = ["team:data", "exclusive"], + deps = ["//:ray_lib", ":conftest"], +) + py_test( name = "test_sql", size = "small", diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index c47e95f52557..f42c113f2414 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -1,12 +1,11 @@ # Short term workaround for https://github.com/ray-project/ray/issues/32435 -# Datastream has a hard dependency on pandas, so it doesn't need to be delayed. +# Dataset has a hard dependency on pandas, so it doesn't need to be delayed. 
import pandas # noqa from ray.data._internal.compute import ActorPoolStrategy from ray.data._internal.progress_bar import set_progress_bars from ray.data._internal.execution.interfaces import ExecutionOptions, ExecutionResources -from ray.data.dataset import Dataset -from ray.data.datastream import Datastream +from ray.data.dataset import Dataset, Schema from ray.data.context import DatasetContext, DataContext from ray.data.iterator import DatasetIterator, DataIterator from ray.data.dataset_pipeline import DatasetPipeline @@ -54,8 +53,7 @@ __all__ = [ "ActorPoolStrategy", - "Datastream", - "Dataset", # Backwards compatibility alias. + "Dataset", "DataContext", "DatasetContext", # Backwards compatibility alias. "DataIterator", @@ -65,6 +63,7 @@ "ExecutionOptions", "ExecutionResources", "ReadTask", + "Schema", "from_dask", "from_items", "from_arrow", diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index 82f65a07ac89..1627da13e3bd 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -19,17 +19,21 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray._private.utils import _get_pyarrow_version from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow +from ray.data._internal.numpy_support import ( + convert_udf_returns_to_numpy, + is_valid_udf_return, +) from ray.data._internal.table_block import ( TableBlockAccessor, TableBlockBuilder, ) +from ray.data._internal.util import _truncated_repr from ray.data.aggregate import AggregateFn from ray.data.block import ( Block, BlockAccessor, BlockExecStats, BlockMetadata, - KeyFn, KeyType, U, ) @@ -68,7 +72,7 @@ def get_concat_and_sort_transform(context: DataContext) -> Callable: class ArrowRow(TableRow): """ - Row of a tabular Datastream backed by a Arrow Table block. + Row of a tabular Dataset backed by a Arrow Table block. 
""" def __getitem__(self, key: str) -> Any: @@ -105,7 +109,7 @@ def __len__(self): return self._row.num_columns -class ArrowBlockBuilder(TableBlockBuilder[T]): +class ArrowBlockBuilder(TableBlockBuilder): def __init__(self): if pyarrow is None: raise ImportError("Run `pip install pyarrow` for Arrow support") @@ -153,8 +157,7 @@ def from_bytes(cls, data: bytes) -> "ArrowBlockAccessor": @staticmethod def numpy_to_block( - batch: Union[np.ndarray, Dict[str, np.ndarray]], - passthrough_arrow_not_implemented_errors: bool = False, + batch: Union[np.ndarray, Dict[str, np.ndarray], Dict[str, list]], ) -> "pyarrow.Table": import pyarrow as pa @@ -163,27 +166,20 @@ def numpy_to_block( if isinstance(batch, np.ndarray): batch = {TENSOR_COLUMN_NAME: batch} elif not isinstance(batch, collections.abc.Mapping) or any( - not isinstance(col, np.ndarray) for col in batch.values() + not is_valid_udf_return(col) for col in batch.values() ): raise ValueError( "Batch must be an ndarray or dictionary of ndarrays when converting " - f"a numpy batch to a block, got: {type(batch)}" + f"a numpy batch to a block, got: {type(batch)} " + f"({_truncated_repr(batch)})" ) new_batch = {} for col_name, col in batch.items(): + # Coerce to np.ndarray format if possible. + col = convert_udf_returns_to_numpy(col) # Use Arrow's native *List types for 1-dimensional ndarrays. if col.dtype.type is np.object_ or col.ndim > 1: - try: - col = ArrowTensorArray.from_numpy(col) - except pa.ArrowNotImplementedError as e: - if passthrough_arrow_not_implemented_errors: - raise e - raise ValueError( - "Failed to convert multi-dimensional ndarray of dtype " - f"{col.dtype} to our tensor extension since this dtype is not " - "supported by Arrow. If encountering this due to string data, " - 'cast the ndarray to a string dtype, e.g. a.astype("U").' 
- ) from e + col = ArrowTensorArray.from_numpy(col) new_batch[col_name] = col return pa.Table.from_pydict(new_batch) @@ -293,7 +289,7 @@ def num_rows(self) -> int: def size_bytes(self) -> int: return self._table.nbytes - def _zip(self, acc: BlockAccessor) -> "Block[T]": + def _zip(self, acc: BlockAccessor) -> "Block": r = self.to_arrow() s = acc.to_arrow() for col_name in s.column_names: @@ -310,7 +306,7 @@ def _zip(self, acc: BlockAccessor) -> "Block[T]": return r @staticmethod - def builder() -> ArrowBlockBuilder[T]: + def builder() -> ArrowBlockBuilder: return ArrowBlockBuilder() @staticmethod @@ -328,7 +324,7 @@ def take( """ return transform_pyarrow.take_table(self._table, indices) - def select(self, columns: List[KeyFn]) -> "pyarrow.Table": + def select(self, columns: List[str]) -> "pyarrow.Table": if not all(isinstance(col, str) for col in columns): raise ValueError( "Columns must be a list of column name strings when aggregating on " @@ -341,7 +337,7 @@ def _sample(self, n_samples: int, key: "SortKeyT") -> "pyarrow.Table": table = self._table.select([k[0] for k in key]) return transform_pyarrow.take_table(table, indices) - def count(self, on: KeyFn) -> Optional[U]: + def count(self, on: str) -> Optional[U]: """Count the number of non-null values in the provided column.""" import pyarrow.compute as pac @@ -358,7 +354,7 @@ def count(self, on: KeyFn) -> Optional[U]: return pac.count(col).as_py() def _apply_arrow_compute( - self, compute_fn: Callable, on: KeyFn, ignore_nulls: bool + self, compute_fn: Callable, on: str, ignore_nulls: bool ) -> Optional[U]: """Helper providing null handling around applying an aggregation to a column.""" import pyarrow as pa @@ -378,29 +374,29 @@ def _apply_arrow_compute( else: return compute_fn(col, skip_nulls=ignore_nulls).as_py() - def sum(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def sum(self, on: str, ignore_nulls: bool) -> Optional[U]: import pyarrow.compute as pac return self._apply_arrow_compute(pac.sum, on, 
ignore_nulls) - def min(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def min(self, on: str, ignore_nulls: bool) -> Optional[U]: import pyarrow.compute as pac return self._apply_arrow_compute(pac.min, on, ignore_nulls) - def max(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def max(self, on: str, ignore_nulls: bool) -> Optional[U]: import pyarrow.compute as pac return self._apply_arrow_compute(pac.max, on, ignore_nulls) - def mean(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def mean(self, on: str, ignore_nulls: bool) -> Optional[U]: import pyarrow.compute as pac return self._apply_arrow_compute(pac.mean, on, ignore_nulls) def sum_of_squared_diffs_from_mean( self, - on: KeyFn, + on: str, ignore_nulls: bool, mean: Optional[U] = None, ) -> Optional[U]: @@ -422,7 +418,7 @@ def sum_of_squared_diffs_from_mean( def sort_and_partition( self, boundaries: List[T], key: "SortKeyT", descending: bool - ) -> List["Block[T]"]: + ) -> List["Block"]: if len(key) > 1: raise NotImplementedError( "sorting by multiple columns is not supported yet" @@ -461,7 +457,7 @@ def sort_and_partition( partitions.append(table.slice(last_idx)) return partitions - def combine(self, key: KeyFn, aggs: Tuple[AggregateFn]) -> Block[ArrowRow]: + def combine(self, key: str, aggs: Tuple[AggregateFn]) -> Block: """Combine rows with the same key into an accumulator. This assumes the block is already sorted by key in ascending order. 
@@ -490,7 +486,7 @@ def iter_groups() -> Iterator[Tuple[KeyType, Block]]: return start = end = 0 - iter = self.iter_rows() + iter = self.iter_rows(public_row_format=False) next_row = None while True: try: @@ -540,8 +536,8 @@ def _munge_conflict(name, count): @staticmethod def merge_sorted_blocks( - blocks: List[Block[T]], key: "SortKeyT", _descending: bool - ) -> Tuple[Block[T], BlockMetadata]: + blocks: List[Block], key: "SortKeyT", _descending: bool + ) -> Tuple[Block, BlockMetadata]: stats = BlockExecStats.builder() blocks = [b for b in blocks if b.num_rows > 0] if len(blocks) == 0: @@ -553,11 +549,11 @@ def merge_sorted_blocks( @staticmethod def aggregate_combined_blocks( - blocks: List[Block[ArrowRow]], - key: KeyFn, + blocks: List[Block], + key: str, aggs: Tuple[AggregateFn], finalize: bool, - ) -> Tuple[Block[ArrowRow], BlockMetadata]: + ) -> Tuple[Block, BlockMetadata]: """Aggregate sorted, partially combined blocks with the same key range. This assumes blocks are already sorted by key in ascending order, @@ -584,7 +580,11 @@ def aggregate_combined_blocks( ) iter = heapq.merge( - *[ArrowBlockAccessor(block).iter_rows() for block in blocks], key=key_fn + *[ + ArrowBlockAccessor(block).iter_rows(public_row_format=False) + for block in blocks + ], + key=key_fn, ) next_row = None builder = ArrowBlockBuilder() diff --git a/python/ray/data/_internal/block_batching/block_batching.py b/python/ray/data/_internal/block_batching/block_batching.py index 6fc4990f3674..cf1f89aee7e9 100644 --- a/python/ray/data/_internal/block_batching/block_batching.py +++ b/python/ray/data/_internal/block_batching/block_batching.py @@ -15,7 +15,7 @@ ActorBlockPrefetcher, ) from ray.data._internal.memory_tracing import trace_deallocation -from ray.data._internal.stats import DatasetPipelineStats, DatastreamStats +from ray.data._internal.stats import DatasetPipelineStats, DatasetStats from ray.data.block import Block, DataBatch from ray.data.context import DataContext from ray.types 
import ObjectRef @@ -26,7 +26,7 @@ def batch_block_refs( block_refs: Iterator[ObjectRef[Block]], *, - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, prefetch_blocks: int = 0, clear_block_after_read: bool = False, batch_size: Optional[int] = None, @@ -42,8 +42,8 @@ def batch_block_refs( This takes a block iterator and creates batch_size batches, slicing, unioning, shuffling, prefetching, and formatting blocks as needed. - This is used by both Datastream.iter_batches()/DatasetPipeline.iter_batches() - and Datastream.map_batches()/DatasetPipeline.map_batches(). + This is used by both Dataset.iter_batches()/DatasetPipeline.iter_batches() + and Dataset.map_batches()/DatasetPipeline.map_batches(). Args: block_refs: An iterator over block object references. @@ -114,7 +114,7 @@ def batch_block_refs( def batch_blocks( blocks: Iterator[Block], *, - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, batch_size: Optional[int] = None, batch_format: str = "default", drop_last: bool = False, @@ -164,7 +164,7 @@ def _prefetch_blocks( prefetcher: BlockPrefetcher, num_blocks_to_prefetch: int, eager_free: bool = False, - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, ) -> Iterator[ObjectRef[Block]]: """Given an iterable of Block Object References, returns an iterator over these object reference while prefetching `num_block_to_prefetch` @@ -174,7 +174,7 @@ def _prefetch_blocks( block_ref_iter: An iterator over block object references. num_blocks_to_prefetch: The number of blocks to prefetch ahead of the current block during the scan. - stats: Datastream stats object used to store block wait time. + stats: Dataset stats object used to store block wait time. 
""" if num_blocks_to_prefetch == 0: for block_ref in block_ref_iter: diff --git a/python/ray/data/_internal/block_batching/iter_batches.py b/python/ray/data/_internal/block_batching/iter_batches.py index e3c2e8b56e06..e406bcd79f12 100644 --- a/python/ray/data/_internal/block_batching/iter_batches.py +++ b/python/ray/data/_internal/block_batching/iter_batches.py @@ -19,7 +19,7 @@ make_async_gen, ) from ray.data._internal.memory_tracing import trace_deallocation -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats from ray.data.context import DataContext from contextlib import nullcontext @@ -27,7 +27,7 @@ def iter_batches( block_refs: Iterator[Tuple[ObjectRef[Block], BlockMetadata]], *, - stats: Optional[DatastreamStats] = None, + stats: Optional[DatasetStats] = None, clear_block_after_read: bool = False, batch_size: Optional[int] = None, batch_format: Optional[str] = "default", @@ -74,7 +74,7 @@ def iter_batches( Args: block_refs: An iterator over block object references and their corresponding metadata. - stats: DatastreamStats object to record timing and other statistics. + stats: DatasetStats object to record timing and other statistics. clear_block_after_read: Whether to clear the block from object store manually (i.e. without waiting for Python's automatic GC) after it is read. Doing so will reclaim memory faster and hence reduce the @@ -176,7 +176,7 @@ def _async_iter_batches( def _format_in_threadpool( batch_iter: Iterator[Batch], - stats: DatastreamStats, + stats: DatasetStats, batch_format: Optional[str], collate_fn: Optional[Callable[[DataBatch], Any]], num_threadpool_workers: int, @@ -185,7 +185,7 @@ def _format_in_threadpool( Args: logical_batch_iterator: An iterator over logical batches. - stats: DatastreamStats object to record timing and other statistics. + stats: DatasetStats object to record timing and other statistics. batch_format: The format in which to return each batch. 
Specify "default" to use the current block format (promoting Arrow to pandas automatically), "pandas" to diff --git a/python/ray/data/_internal/block_batching/util.py b/python/ray/data/_internal/block_batching/util.py index a82b46ae43a7..bdf9807b5253 100644 --- a/python/ray/data/_internal/block_batching/util.py +++ b/python/ray/data/_internal/block_batching/util.py @@ -1,7 +1,7 @@ import logging -import queue import threading from typing import Any, Callable, Iterator, List, Optional, Tuple, TypeVar, Union +from collections import deque from contextlib import nullcontext import ray @@ -14,7 +14,7 @@ CollatedBatch, BlockPrefetcher, ) -from ray.data._internal.stats import DatasetPipelineStats, DatastreamStats +from ray.data._internal.stats import DatasetPipelineStats, DatasetStats from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy T = TypeVar("T") @@ -39,7 +39,7 @@ def _calculate_ref_hits(refs: List[ObjectRef[Any]]) -> Tuple[int, int, int]: def resolve_block_refs( block_ref_iter: Iterator[ObjectRef[Block]], - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, ) -> Iterator[Block]: """Resolves the block references for each logical batch. @@ -71,7 +71,7 @@ def resolve_block_refs( def blocks_to_batches( block_iter: Iterator[Block], - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, batch_size: Optional[int] = None, drop_last: bool = False, shuffle_buffer_min_size: Optional[int] = None, @@ -86,7 +86,7 @@ def blocks_to_batches( Args: block_iter: An iterator over blocks. - stats: Datastream stats object used to store block batching time. + stats: Dataset stats object used to store block batching time. batch_size: Record batch size, or None to let the system pick. drop_last: Whether to drop the last batch if it's incomplete. 
shuffle_buffer_min_size: If non-None, the data will be randomly shuffled @@ -143,7 +143,7 @@ def get_iter_next_batch_s_timer(): def format_batches( block_iter: Iterator[Batch], batch_format: Optional[str], - stats: Optional[Union[DatastreamStats, DatasetPipelineStats]] = None, + stats: Optional[Union[DatasetStats, DatasetPipelineStats]] = None, ) -> Iterator[Batch]: """Given an iterator of blocks, returns an iterator of formatted batches. @@ -166,7 +166,7 @@ def format_batches( def collate( batch_iter: Iterator[Batch], collate_fn: Optional[Callable[[DataBatch], Any]], - stats: Optional[DatastreamStats] = None, + stats: Optional[DatasetStats] = None, ) -> Iterator[CollatedBatch]: """Returns an iterator with the provided collate_fn applied to items of the batch iterator. @@ -230,7 +230,7 @@ class Sentinel: def __init__(self, thread_index: int): self.thread_index = thread_index - output_queue = queue.Queue(1) + output_queue = Queue(1) # Because pulling from the base iterator cannot happen concurrently, # we must execute the expensive computation in a separate step which @@ -238,11 +238,14 @@ def __init__(self, thread_index: int): def execute_computation(thread_index: int): try: for item in fn(thread_safe_generator): - output_queue.put(item, block=True) - output_queue.put(Sentinel(thread_index), block=True) + if output_queue.put(item): + # Return early when it's instructed to do so. + return + output_queue.put(Sentinel(thread_index)) except Exception as e: - output_queue.put(e, block=True) + output_queue.put(e) + # Use separate threads to produce output batches. threads = [ threading.Thread(target=execute_computation, args=(i,), daemon=True) for i in range(num_workers) @@ -251,25 +254,31 @@ def execute_computation(thread_index: int): for thread in threads: thread.start() + # Use main thread to consume output batches. 
num_threads_finished = 0 - while True: - next_item = output_queue.get(block=True) - if isinstance(next_item, Exception): - output_queue.task_done() - raise next_item - if isinstance(next_item, Sentinel): - output_queue.task_done() - logger.debug(f"Thread {next_item.thread_index} finished.") - num_threads_finished += 1 - threads[next_item.thread_index].join() - else: - yield next_item - output_queue.task_done() - if num_threads_finished >= num_workers: - break - - -PREFETCHER_ACTOR_NAMESPACE = "ray.datastream" + try: + while True: + next_item = output_queue.get() + if isinstance(next_item, Exception): + raise next_item + if isinstance(next_item, Sentinel): + logger.debug(f"Thread {next_item.thread_index} finished.") + num_threads_finished += 1 + else: + yield next_item + if num_threads_finished >= num_workers: + break + finally: + # Cooperatively exit all producer threads. + # This is to avoid these daemon threads hanging there with holding batches in + # memory, which can cause GRAM OOM easily. This can happen when caller breaks + # in the middle of iteration. + num_threads_alive = num_workers - num_threads_finished + if num_threads_alive > 0: + output_queue.release(num_threads_alive) + + +PREFETCHER_ACTOR_NAMESPACE = "ray.dataset" class WaitBlockPrefetcher(BlockPrefetcher): @@ -291,7 +300,7 @@ def __init__(self): @staticmethod def _get_or_create_actor_prefetcher() -> "ActorHandle": node_id = ray.get_runtime_context().get_node_id() - actor_name = f"datastream-block-prefetcher-{node_id}" + actor_name = f"dataset-block-prefetcher-{node_id}" return _BlockPretcher.options( scheduling_strategy=NodeAffinitySchedulingStrategy(node_id, soft=False), name=actor_name, @@ -309,3 +318,66 @@ class _BlockPretcher: def prefetch(self, *blocks) -> None: pass + + +class Queue: + """A thread-safe queue implementation for multiple producers and consumers. + + Provide `release()` to exit producer threads cooperatively for resource release. 
+ """ + + def __init__(self, queue_size: int): + # The queue shared across multiple producer threads. + self._queue = deque() + # The boolean varilable to indicate whether producer threads should exit. + self._threads_exit = False + # The semaphore for producer threads to put item into queue. + self._producer_semaphore = threading.Semaphore(queue_size) + # The semaphore for consumer threads to get item from queue. + self._consumer_semaphore = threading.Semaphore(0) + # The mutex lock to guard access of `self._queue` and `self._threads_exit`. + self._mutex = threading.Lock() + + def put(self, item: Any) -> bool: + """Put an item into the queue. + + Block if necessary until a free slot is available in queue. + This method is called by producer threads. + + Returns: + True if the caller thread should exit immediately. + """ + self._producer_semaphore.acquire() + with self._mutex: + if self._threads_exit: + return True + else: + self._queue.append(item) + self._consumer_semaphore.release() + return False + + def get(self) -> Any: + """Remove and return an item from the queue. + + Block if necessary until an item is available in queue. + This method is called by consumer threads. + """ + self._consumer_semaphore.acquire() + with self._mutex: + next_item = self._queue.popleft() + self._producer_semaphore.release() + return next_item + + def release(self, num_threads: int): + """Release `num_threads` of producers so they would exit cooperatively.""" + with self._mutex: + self._threads_exit = True + for _ in range(num_threads): + # NOTE: After Python 3.9+, Semaphore.release(n) can be used to + # release all threads at once. 
+ self._producer_semaphore.release() + + def qsize(self): + """Return the size of the queue.""" + with self._mutex: + return len(self._queue) diff --git a/python/ray/data/_internal/block_builder.py b/python/ray/data/_internal/block_builder.py index 0d64ddadb26f..27787f088a2c 100644 --- a/python/ray/data/_internal/block_builder.py +++ b/python/ray/data/_internal/block_builder.py @@ -7,7 +7,7 @@ class BlockBuilder(Generic[T]): """A builder class for blocks.""" @staticmethod - def for_block(block: Block) -> "BlockBuilder[T]": + def for_block(block: Block) -> "BlockBuilder": return BlockAccessor.for_block(block).builder() def add(self, item: T) -> None: diff --git a/python/ray/data/_internal/block_list.py b/python/ray/data/_internal/block_list.py index 06dd3cf5cad4..b046c24e4cc7 100644 --- a/python/ray/data/_internal/block_list.py +++ b/python/ray/data/_internal/block_list.py @@ -58,8 +58,8 @@ def _check_if_cleared(self) -> None: """Raise an error if this BlockList has been previously cleared.""" if self.is_cleared(): raise ValueError( - "This Datastream's blocks have been moved, which means that you " - "can no longer use this Datastream." + "This Dataset's blocks have been moved, which means that you " + "can no longer use this Dataset." 
) def split(self, split_size: int) -> List["BlockList"]: diff --git a/python/ray/data/_internal/compute.py b/python/ray/data/_internal/compute.py index 6e74fbcf6c3b..a97e54127dec 100644 --- a/python/ray/data/_internal/compute.py +++ b/python/ray/data/_internal/compute.py @@ -10,14 +10,14 @@ from ray.data._internal.progress_bar import ProgressBar from ray.data._internal.remote_fn import cached_remote_fn from ray.data.block import ( - BatchUDF, + UserDefinedFunction, Block, BlockAccessor, BlockExecStats, BlockMetadata, BlockPartition, CallableClass, - RowUDF, + StrictModeError, ) from ray.data.context import DEFAULT_SCHEDULING_STRATEGY, DataContext from ray.types import ObjectRef @@ -36,15 +36,12 @@ # TODO(Clark): Once Ray only supports Python 3.8+, use protocol to constrain block # transform type. # Callable[[Block, ...], Iterable[Block]] - # Callable[[Block, BatchUDF, ...], Iterable[Block]], + # Callable[[Block, UserDefinedFunction, ...], Iterable[Block]], Callable[[Iterable[Block], TaskContext], Iterable[Block]], - Callable[[Iterable[Block], TaskContext, Union[BatchUDF, RowUDF]], Iterable[Block]], + Callable[[Iterable[Block], TaskContext, UserDefinedFunction], Iterable[Block]], Callable[..., Iterable[Block]], ] -# UDF on a batch or row. -UDF = Union[BatchUDF, RowUDF] - @DeveloperAPI class ComputeStrategy: @@ -68,7 +65,7 @@ def _apply( clear_input_blocks: bool, name: Optional[str] = None, target_block_size: Optional[int] = None, - fn: Optional[UDF] = None, + fn: Optional[UserDefinedFunction] = None, fn_args: Optional[Iterable[Any]] = None, fn_kwargs: Optional[Dict[str, Any]] = None, fn_constructor_args: Optional[Iterable[Any]] = None, @@ -83,7 +80,7 @@ def _apply( context = DataContext.get_current() - # Handle empty datastreams. + # Handle empty datasets. 
if block_list.initial_num_blocks() == 0: return block_list @@ -177,15 +174,15 @@ def _apply( ) def __eq__(self, other: Any) -> bool: - return isinstance(other, TaskPoolStrategy) + return isinstance(other, TaskPoolStrategy) or other == "tasks" @PublicAPI class ActorPoolStrategy(ComputeStrategy): - """Specify the compute strategy for a Datastream transform. + """Specify the compute strategy for a Dataset transform. ActorPoolStrategy specifies that an autoscaling pool of actors should be used - for a given Datastream transform. This is useful for stateful setup of callable + for a given Dataset transform. This is useful for stateful setup of callable classes. For a fixed-sized pool of size ``n``, specify ``compute=ActorPoolStrategy(size=n)``. @@ -209,7 +206,7 @@ def __init__( max_size: Optional[int] = None, max_tasks_in_flight_per_actor: Optional[int] = None, ): - """Construct ActorPoolStrategy for a Datastream transform. + """Construct ActorPoolStrategy for a Dataset transform. Args: size: Specify a fixed size actor pool of this size. It is an error to @@ -222,12 +219,18 @@ def __init__( computation and avoiding actor startup delays, but will also increase queueing delay. """ + ctx = DataContext.get_current() if legacy_min_size is not None or legacy_max_size is not None: - # TODO: make this an error in Ray 2.5. - logger.warning( - "DeprecationWarning: ActorPoolStrategy will require min_size and " - "max_size to be explicit kwargs in a future release" - ) + if ctx.strict_mode: + raise StrictModeError( + "In Ray 2.5, ActorPoolStrategy requires min_size and " + "max_size to be explicit kwargs." 
+ ) + else: + logger.warning( + "DeprecationWarning: ActorPoolStrategy will require min_size and " + "max_size to be explicit kwargs in a future release" + ) if legacy_min_size is not None: min_size = legacy_min_size if legacy_max_size is not None: @@ -270,13 +273,13 @@ def _apply( clear_input_blocks: bool, name: Optional[str] = None, target_block_size: Optional[int] = None, - fn: Optional[UDF] = None, + fn: Optional[UserDefinedFunction] = None, fn_args: Optional[Iterable[Any]] = None, fn_kwargs: Optional[Dict[str, Any]] = None, fn_constructor_args: Optional[Iterable[Any]] = None, fn_constructor_kwargs: Optional[Dict[str, Any]] = None, ) -> BlockList: - """Note: this is not part of the Datastream public API.""" + """Note: this is not part of the Dataset public API.""" assert not DataContext.get_current().new_execution_backend, "Legacy backend off" if fn_args is None: fn_args = tuple() @@ -495,7 +498,15 @@ def __eq__(self, other: Any) -> bool: def get_compute(compute_spec: Union[str, ComputeStrategy]) -> ComputeStrategy: - if not compute_spec or compute_spec == "tasks": + ctx = DataContext.get_current() + if ctx.strict_mode and not isinstance( + compute_spec, (TaskPoolStrategy, ActorPoolStrategy) + ): + raise StrictModeError( + "In Ray 2.5, the compute spec must be either " + f"TaskPoolStrategy or ActorPoolStategy, was: {compute_spec}." 
+ ) + elif not compute_spec or compute_spec == "tasks": return TaskPoolStrategy() elif compute_spec == "actors": return ActorPoolStrategy() @@ -516,7 +527,7 @@ def is_task_compute(compute_spec: Union[str, ComputeStrategy]) -> bool: def _map_block_split( block_fn: BlockTransform, input_files: List[str], - fn: Optional[UDF], + fn: Optional[UserDefinedFunction], num_blocks: int, *blocks_and_fn_args: Union[Block, Any], **fn_kwargs, @@ -544,7 +555,7 @@ def _map_block_split( def _map_block_nosplit( block_fn: BlockTransform, input_files: List[str], - fn: Optional[UDF], + fn: Optional[UserDefinedFunction], num_blocks: int, *blocks_and_fn_args: Union[Block, Any], **fn_kwargs, diff --git a/python/ray/data/_internal/datastream_logger.py b/python/ray/data/_internal/dataset_logger.py similarity index 81% rename from python/ray/data/_internal/datastream_logger.py rename to python/ray/data/_internal/dataset_logger.py index 8ed7dde89602..47e014c77ff4 100644 --- a/python/ray/data/_internal/datastream_logger.py +++ b/python/ray/data/_internal/dataset_logger.py @@ -5,16 +5,16 @@ from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL -class DatastreamLogger: - """Logger for Ray Datastreams which writes logs to a separate log file - at `DatastreamLogger.DEFAULT_DATASET_LOG_PATH`. Can optionally turn off +class DatasetLogger: + """Logger for Ray Datasets which writes logs to a separate log file + at `DatasetLogger.DEFAULT_DATASET_LOG_PATH`. Can optionally turn off logging to stdout to reduce clutter (but always logs to the aformentioned - Datastreams-specific log file). + Datasets-specific log file). After initialization, always use the `get_logger()` method to correctly set whether to log to stdout. 
Example usage: ``` - logger = DatastreamLogger(__name__) + logger = DatasetLogger(__name__) logger.get_logger().info("This logs to file and stdout") logger.get_logger(log_to_stdout=False).info("This logs to file only) logger.get_logger().warning("Can call the usual Logger methods") @@ -24,7 +24,7 @@ class DatastreamLogger: DEFAULT_DATASET_LOG_PATH = "logs/ray-data.log" def __init__(self, log_name: str): - """Initialize DatastreamLogger for a given `log_name`. + """Initialize DatasetLogger for a given `log_name`. Args: log_name: Name of logger (usually passed into `logging.getLogger(...)`) @@ -40,7 +40,7 @@ def __init__(self, log_name: str): def _initialize_logger(self) -> logging.Logger: """Internal method to initialize the logger and the extra file handler - for writing to the Datastream log file. Not intended (nor necessary) + for writing to the Dataset log file. Not intended (nor necessary) to call explicitly. Assumes that `ray.init()` has already been called prior to calling this method; otherwise raises a `ValueError`.""" @@ -62,20 +62,20 @@ def _initialize_logger(self) -> logging.Logger: # If ray.init() is called and the global node session directory path # is valid, we can create the additional handler to write to the - # Datastream log file. If this is not the case (e.g. when used in Ray + # Dataset log file. If this is not the case (e.g. when used in Ray # Client), then we skip initializing the FileHandler. 
global_node = ray._private.worker._global_node if global_node is not None: - # Add a FileHandler to write to the specific Ray Datastreams log file - # at `DatastreamLogger.DEFAULT_DATASET_LOG_PATH`, using the standard + # Add a FileHandler to write to the specific Ray Datasets log file + # at `DatasetLogger.DEFAULT_DATASET_LOG_PATH`, using the standard # default logger format used by the root logger session_dir = global_node.get_session_dir_path() - datastreams_log_path = os.path.join( + datasets_log_path = os.path.join( session_dir, - DatastreamLogger.DEFAULT_DATASET_LOG_PATH, + DatasetLogger.DEFAULT_DATASET_LOG_PATH, ) file_log_formatter = logging.Formatter(fmt=LOGGER_FORMAT) - file_log_handler = logging.FileHandler(datastreams_log_path) + file_log_handler = logging.FileHandler(datasets_log_path) file_log_handler.setLevel(LOGGER_LEVEL.upper()) file_log_handler.setFormatter(file_log_formatter) logger.addHandler(file_log_handler) @@ -85,10 +85,10 @@ def get_logger(self, log_to_stdout: bool = True) -> logging.Logger: """ Returns the underlying Logger, with the `propagate` attribute set to the same value as `log_to_stdout`. For example, when - `log_to_stdout = False`, we do not want the `DatastreamLogger` to + `log_to_stdout = False`, we do not want the `DatasetLogger` to propagate up to the base Logger which writes to stdout. - This is a workaround needed due to the DatastreamLogger wrapper object + This is a workaround needed due to the DatasetLogger wrapper object not having access to the log caller's scope in Python <3.8. 
In the future, with Python 3.8 support, we can use the `stacklevel` arg, which allows the logger to fetch the correct calling file/line and diff --git a/python/ray/data/_internal/delegating_block_builder.py b/python/ray/data/_internal/delegating_block_builder.py index e5fb44155459..47baf887caf8 100644 --- a/python/ray/data/_internal/delegating_block_builder.py +++ b/python/ray/data/_internal/delegating_block_builder.py @@ -4,14 +4,14 @@ import numpy as np import ray -from ray.data.block import Block, DataBatch, T, BlockAccessor +from ray.data.block import Block, DataBatch, BlockAccessor from ray.data._internal.block_builder import BlockBuilder from ray.data._internal.simple_block import SimpleBlockBuilder from ray.data._internal.arrow_block import ArrowRow, ArrowBlockBuilder from ray.data._internal.pandas_block import PandasRow, PandasBlockBuilder -class DelegatingBlockBuilder(BlockBuilder[T]): +class DelegatingBlockBuilder(BlockBuilder): def __init__(self): self._builder = None self._empty_block = None @@ -28,7 +28,7 @@ def add(self, item: Any) -> None: check.build() self._builder = ArrowBlockBuilder() except (TypeError, pyarrow.lib.ArrowInvalid): - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: # Can also handle nested Python objects, which Arrow cannot. self._builder = PandasBlockBuilder() diff --git a/python/ray/data/_internal/execution/autoscaling_requester.py b/python/ray/data/_internal/execution/autoscaling_requester.py index 92f6a5dd690c..7d14d6c6920d 100644 --- a/python/ray/data/_internal/execution/autoscaling_requester.py +++ b/python/ray/data/_internal/execution/autoscaling_requester.py @@ -19,7 +19,7 @@ @ray.remote(num_cpus=0, max_restarts=-1, max_task_retries=-1) class AutoscalingRequester: - """Actor to make resource requests to autoscaler for the datastreams. + """Actor to make resource requests to autoscaler for the datasets. 
The resource requests are set to timeout after RESOURCE_REQUEST_TIMEOUT seconds. For those live requests, we keep track of the last request made for each execution, diff --git a/python/ray/data/_internal/execution/bulk_executor.py b/python/ray/data/_internal/execution/bulk_executor.py index 7c3b2d9d8e73..3355a063b2bf 100644 --- a/python/ray/data/_internal/execution/bulk_executor.py +++ b/python/ray/data/_internal/execution/bulk_executor.py @@ -9,12 +9,12 @@ RefBundle, PhysicalOperator, ) -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer from ray.data._internal.progress_bar import ProgressBar -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats -logger = DatastreamLogger(__name__) +logger = DatasetLogger(__name__) class BulkExecutor(Executor): @@ -28,11 +28,11 @@ def __init__(self, options: ExecutionOptions): # Bulk executor always preserves order. 
options.preserve_order = True super().__init__(options) - self._stats: Optional[DatastreamStats] = DatastreamStats(stages={}, parent=None) + self._stats: Optional[DatasetStats] = DatasetStats(stages={}, parent=None) self._executed = False def execute( - self, dag: PhysicalOperator, initial_stats: Optional[DatastreamStats] = None + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None ) -> Iterator[RefBundle]: """Synchronously executes the DAG via bottom-up recursive traversal.""" @@ -84,7 +84,7 @@ def execute_recursive(op: PhysicalOperator) -> List[RefBundle]: return OutputIterator(execute_recursive(dag)) - def get_stats(self) -> DatastreamStats: + def get_stats(self) -> DatasetStats: return self._stats diff --git a/python/ray/data/_internal/execution/interfaces.py b/python/ray/data/_internal/execution/interfaces.py index cd06591bdd0b..eee7cf1c46dd 100644 --- a/python/ray/data/_internal/execution/interfaces.py +++ b/python/ray/data/_internal/execution/interfaces.py @@ -8,7 +8,7 @@ from ray.data._internal.logical.interfaces import Operator from ray.data._internal.memory_tracing import trace_deallocation from ray.data._internal.progress_bar import ProgressBar -from ray.data._internal.stats import DatastreamStats, StatsDict +from ray.data._internal.stats import DatasetStats, StatsDict from ray.data.block import Block, BlockMetadata from ray.data.context import DataContext from ray.types import ObjectRef @@ -233,6 +233,10 @@ class TaskContext: # TODO(chengsu): clean it up from TaskContext with new optimizer framework. sub_progress_bar_dict: Optional[Dict[str, ProgressBar]] = None + # The underlying function called in a MapOperator; this is used when fusing + # an AllToAllOperator with an upstream MapOperator. + upstream_map_transform_fn: Optional["MapTransformFn"] = None + # Block transform function applied by task and actor pools in MapOperator. 
MapTransformFn = Callable[[Iterable[Block], TaskContext], Iterable[Block]] @@ -250,7 +254,7 @@ class PhysicalOperator(Operator): output stream of RefBundles. Physical operators are stateful and non-serializable; they live on the driver side - of the Datastream only. + of the Dataset only. Here's a simple example of implementing a basic "Map" operator: @@ -281,21 +285,27 @@ def __init__(self, name: str, input_dependencies: List["PhysicalOperator"]): for x in input_dependencies: assert isinstance(x, PhysicalOperator), x self._inputs_complete = not input_dependencies + self._dependents_complete = False self._started = False def __reduce__(self): raise ValueError("Operator is not serializable.") def completed(self) -> bool: - """Return True when this operator is done and all outputs are taken.""" + """Return True when this operator is completed. + + An operator is completed if any of the following conditions are met: + - All upstream operators are completed and all outputs are taken. + - All downstream operators are completed. + """ return ( self._inputs_complete and len(self.get_work_refs()) == 0 and not self.has_next() - ) + ) or self._dependents_complete def get_stats(self) -> StatsDict: - """Return recorded execution stats for use with DatastreamStats.""" + """Return recorded execution stats for use with DatasetStats.""" raise NotImplementedError def get_metrics(self) -> Dict[str, int]: @@ -345,6 +355,13 @@ def should_add_input(self) -> bool: """ return True + def need_more_inputs(self) -> bool: + """Return true if the operator still needs more inputs. + + Once this return false, it should never return true again. + """ + return True + def add_input(self, refs: RefBundle, input_index: int) -> None: """Called when an upstream result is available. @@ -367,6 +384,13 @@ def inputs_done(self) -> None: """ self._inputs_complete = True + def all_dependents_complete(self) -> None: + """Called when all downstream operators have completed(). 
+ + After this is called, the operator is marked as completed. + """ + self._dependents_complete = True + def has_next(self) -> bool: """Returns when a downstream output is available. @@ -468,7 +492,7 @@ def notify_resource_usage( class OutputIterator(Iterator[RefBundle]): """Iterator used to access the output of an Executor execution. - This is a blocking iterator. Datastreams guarantees that all its iterators are + This is a blocking iterator. Datasets guarantees that all its iterators are thread-safe (i.e., multiple threads can block on them at the same time). """ @@ -483,7 +507,7 @@ def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: Args: output_split_idx: The output split index to get results for. This arg is - only allowed for iterators created by `Datastream.streaming_split()`. + only allowed for iterators created by `Dataset.streaming_split()`. Raises: StopIteration if there are no more outputs to return. @@ -509,13 +533,13 @@ def __init__(self, options: ExecutionOptions): self._options = options def execute( - self, dag: PhysicalOperator, initial_stats: Optional[DatastreamStats] = None + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None ) -> OutputIterator: """Start execution. Args: dag: The operator graph to execute. - initial_stats: The DatastreamStats to prepend to the stats returned by the + initial_stats: The DatasetStats to prepend to the stats returned by the executor. These stats represent actions done to compute inputs. """ raise NotImplementedError @@ -527,7 +551,7 @@ def shutdown(self): """ pass - def get_stats(self) -> DatastreamStats: + def get_stats(self) -> DatasetStats: """Return stats for the execution so far. 
This is generally called after `execute` has completed, but may be called diff --git a/python/ray/data/_internal/execution/legacy_compat.py b/python/ray/data/_internal/execution/legacy_compat.py index f3f28c01f8eb..907c368a2ffa 100644 --- a/python/ray/data/_internal/execution/legacy_compat.py +++ b/python/ray/data/_internal/execution/legacy_compat.py @@ -8,24 +8,26 @@ import ray from ray.data._internal.logical.optimizers import get_execution_plan -from ray.data._internal.usage import record_operators_usage +from ray.data._internal.logical.util import record_operators_usage from ray.data.context import DataContext from ray.types import ObjectRef -from ray.data.block import Block, BlockMetadata, List +from ray.data.block import Block, BlockMetadata, CallableClass, List from ray.data.datasource import ReadTask -from ray.data._internal.stats import StatsDict, DatastreamStats -from ray.data._internal.stage_impl import RandomizeBlocksStage +from ray.data._internal.stats import StatsDict, DatasetStats +from ray.data._internal.stage_impl import ( + RandomizeBlocksStage, + LimitStage, +) from ray.data._internal.block_list import BlockList from ray.data._internal.lazy_block_list import LazyBlockList from ray.data._internal.compute import ( get_compute, - CallableClass, - TaskPoolStrategy, ActorPoolStrategy, ) from ray.data._internal.memory_tracing import trace_allocation from ray.data._internal.plan import ExecutionPlan, OneToOneStage, AllToAllStage, Stage from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.execution.operators.limit_operator import LimitOperator from ray.data._internal.execution.operators.all_to_all_operator import AllToAllOperator from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer from ray.data._internal.execution.interfaces import ( @@ -34,18 +36,22 @@ RefBundle, TaskContext, ) +from ray.data._internal.util import validate_compute from ray.data._internal.execution.util 
import make_callable_class_concurrent +# Warn about tasks larger than this. +TASK_SIZE_WARN_THRESHOLD_BYTES = 100000 + def execute_to_legacy_block_iterator( executor: Executor, plan: ExecutionPlan, allow_clear_input_blocks: bool, - datastream_uuid: str, + dataset_uuid: str, ) -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]: """Same as execute_to_legacy_bundle_iterator but returning blocks and metadata.""" bundle_iter = execute_to_legacy_bundle_iterator( - executor, plan, allow_clear_input_blocks, datastream_uuid + executor, plan, allow_clear_input_blocks, dataset_uuid ) for bundle in bundle_iter: for block, metadata in bundle.blocks: @@ -56,7 +62,7 @@ def execute_to_legacy_bundle_iterator( executor: Executor, plan: ExecutionPlan, allow_clear_input_blocks: bool, - datastream_uuid: str, + dataset_uuid: str, dag_rewrite=None, ) -> Iterator[RefBundle]: """Execute a plan with the new executor and return a bundle iterator. @@ -65,10 +71,10 @@ def execute_to_legacy_bundle_iterator( executor: The executor to use. plan: The legacy plan to execute. allow_clear_input_blocks: Whether the executor may consider clearing blocks. - datastream_uuid: UUID of the datastream for this execution. + dataset_uuid: UUID of the dataset for this execution. dag_rewrite: Callback that can be used to mutate the DAG prior to execution. This is currently used as a legacy hack to inject the OutputSplit operator - for `Datastream.streaming_split()`. + for `Dataset.streaming_split()`. Returns: The output as a bundle iterator. @@ -90,7 +96,7 @@ def execute_to_legacy_block_list( executor: Executor, plan: ExecutionPlan, allow_clear_input_blocks: bool, - datastream_uuid: str, + dataset_uuid: str, preserve_order: bool, ) -> BlockList: """Execute a plan with the new executor and translate it into a legacy block list. @@ -99,7 +105,7 @@ def execute_to_legacy_block_list( executor: The executor to use. plan: The legacy plan to execute. 
allow_clear_input_blocks: Whether the executor may consider clearing blocks. - datastream_uuid: UUID of the datastream for this execution. + dataset_uuid: UUID of the dataset for this execution. preserve_order: Whether to preserve order in execution. Returns: @@ -114,7 +120,7 @@ def execute_to_legacy_block_list( bundles = executor.execute(dag, initial_stats=stats) block_list = _bundles_to_block_list(bundles) # Set the stats UUID after execution finishes. - _set_stats_uuid_recursive(executor.get_stats(), datastream_uuid) + _set_stats_uuid_recursive(executor.get_stats(), dataset_uuid) return block_list @@ -123,7 +129,7 @@ def _get_execution_dag( plan: ExecutionPlan, allow_clear_input_blocks: bool, preserve_order: bool, -) -> Tuple[PhysicalOperator, DatastreamStats]: +) -> Tuple[PhysicalOperator, DatasetStats]: """Get the physical operators DAG from a plan.""" # Record usage of logical operators if available. if hasattr(plan, "_logical_plan") and plan._logical_plan is not None: @@ -145,7 +151,7 @@ def _get_execution_dag( return dag, stats -def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatastreamStats: +def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatasetStats: assert DataContext.get_current().optimizer_enabled if plan._snapshot_blocks is not None and not plan._snapshot_blocks.is_cleared(): return plan._snapshot_stats @@ -154,7 +160,7 @@ def _get_initial_stats_from_plan(plan: ExecutionPlan) -> DatastreamStats: def _to_operator_dag( plan: ExecutionPlan, allow_clear_input_blocks: bool -) -> Tuple[PhysicalOperator, DatastreamStats]: +) -> Tuple[PhysicalOperator, DatasetStats]: """Translate a plan into an operator DAG for the new execution backend.""" blocks, stats, stages = plan._optimize() @@ -188,6 +194,24 @@ def _blocks_to_input_buffer(blocks: BlockList, owns_blocks: bool) -> PhysicalOpe read_tasks = blocks._tasks remote_args = blocks._remote_args assert all(isinstance(t, ReadTask) for t in read_tasks), read_tasks + + # Defensively compute the 
size of the block as the max size reported by the + # datasource and the actual read task size. This is to guard against issues + # with bad metadata reporting. + def cleaned_metadata(read_task): + block_meta = read_task.get_metadata() + task_size = len(cloudpickle.dumps(read_task)) + if block_meta.size_bytes is None or task_size > block_meta.size_bytes: + if task_size > TASK_SIZE_WARN_THRESHOLD_BYTES: + print( + f"WARNING: the read task size ({task_size} bytes) is larger " + "than the reported output size of the task " + f"({block_meta.size_bytes} bytes). This may be a size " + "reporting bug in the datasource being read from." + ) + block_meta.size_bytes = task_size + return block_meta + inputs = InputDataBuffer( [ RefBundle( @@ -196,13 +220,7 @@ def _blocks_to_input_buffer(blocks: BlockList, owns_blocks: bool) -> PhysicalOpe # This isn't a proper block, but it's what we are doing # in the legacy code. ray.put(read_task), - BlockMetadata( - num_rows=1, - size_bytes=len(cloudpickle.dumps(read_task)), - schema=None, - input_files=[], - exec_stats=None, - ), + cleaned_metadata(read_task), ) ], owns_blocks=True, @@ -248,16 +266,11 @@ def _stage_to_operator(stage: Stage, input_op: PhysicalOperator) -> PhysicalOper if isinstance(stage, OneToOneStage): compute = get_compute(stage.compute) + validate_compute(stage.fn, compute) block_fn = stage.block_fn if stage.fn: if isinstance(stage.fn, CallableClass): - if isinstance(compute, TaskPoolStrategy): - raise ValueError( - "``compute`` must be specified when using a callable class, " - "and must specify the actor compute strategy. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." 
- ) assert isinstance(compute, ActorPoolStrategy) fn_constructor_args = stage.fn_constructor_args or () @@ -300,6 +313,8 @@ def do_map(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: min_rows_per_bundle=stage.target_block_size, ray_remote_args=stage.ray_remote_args, ) + elif isinstance(stage, LimitStage): + return LimitOperator(stage.limit, input_op) elif isinstance(stage, AllToAllStage): fn = stage.fn block_udf = stage.block_udf @@ -363,8 +378,8 @@ def _block_list_to_bundles(blocks: BlockList, owns_blocks: bool) -> List[RefBund return output -def _set_stats_uuid_recursive(stats: DatastreamStats, datastream_uuid: str) -> None: - if not stats.datastream_uuid: - stats.datastream_uuid = datastream_uuid +def _set_stats_uuid_recursive(stats: DatasetStats, dataset_uuid: str) -> None: + if not stats.dataset_uuid: + stats.dataset_uuid = dataset_uuid for parent in stats.parents or []: - _set_stats_uuid_recursive(parent, datastream_uuid) + _set_stats_uuid_recursive(parent, dataset_uuid) diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py index 1156bc09bf72..bc55b0d9503d 100644 --- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -6,7 +6,7 @@ from ray.data.block import Block, BlockMetadata, _CallableClassProtocol from ray.data.context import DataContext, DEFAULT_SCHEDULING_STRATEGY from ray.data._internal.compute import ActorPoolStrategy -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger from ray.data._internal.execution.interfaces import ( RefBundle, ExecutionResources, @@ -24,7 +24,7 @@ from ray.types import ObjectRef from ray._raylet import ObjectRefGenerator -logger = DatastreamLogger(__name__) +logger = DatasetLogger(__name__) # Higher values here are better for 
prefetching and locality. It's ok for this to be # fairly high since streaming backpressure prevents us from overloading actors. @@ -67,7 +67,7 @@ def __init__( min_rows_per_bundle: The number of rows to gather per batch passed to the transform_fn, or None to use the block size. Setting the batch size is important for the performance of GPU-accelerated transform functions. - The actual rows passed may be less if the datastream is small. + The actual rows passed may be less if the dataset is small. ray_remote_args: Customize the ray remote args for this op's tasks. """ super().__init__( @@ -75,6 +75,7 @@ def __init__( ) self._init_fn = init_fn self._ray_remote_args = self._apply_default_remote_args(self._ray_remote_args) + self._min_rows_per_bundle = min_rows_per_bundle # Create autoscaling policy from compute strategy. self._autoscaling_policy = autoscaling_policy @@ -249,6 +250,32 @@ def shutdown(self): self._actor_pool.kill_all_actors() super().shutdown() + # Warn if the user specified a batch or block size that prevents full + # parallelization across the actor pool. We only know this information after + # execution has completed. + total_rows = sum([m.num_rows for m in self._output_metadata]) + min_workers = self._autoscaling_policy.min_workers + max_desired_batch_size = total_rows // min_workers + if ( + self._min_rows_per_bundle is not None + and self._min_rows_per_bundle > max_desired_batch_size + ): + # The user specified a batch size, but it was probably too large. + logger.get_logger().warning( + "To ensure full parallelization across an actor pool of size " + f"{min_workers}, the specified batch size " + f"should be at most {max_desired_batch_size}. Your configured batch " + f"size for this operator was {self._min_rows_per_bundle}." + ) + elif len(self._output_metadata) < min_workers: + # The user created a stream that has too few blocks to begin with. 
+ logger.get_logger().warning( + "To ensure full parallelization across an actor pool of size " + f"{min_workers}, the Dataset should consist of at least " + f"{min_workers} distinct blocks. Consider increasing " + "the parallelism when creating the Dataset." + ) + def get_work_refs(self) -> List[ray.ObjectRef]: # Work references that we wish the executor to wait on includes both task # futures AND worker ready futures. diff --git a/python/ray/data/_internal/execution/operators/input_data_buffer.py b/python/ray/data/_internal/execution/operators/input_data_buffer.py index fb0d686333db..b81f1031deae 100644 --- a/python/ray/data/_internal/execution/operators/input_data_buffer.py +++ b/python/ray/data/_internal/execution/operators/input_data_buffer.py @@ -11,7 +11,7 @@ class InputDataBuffer(PhysicalOperator): """Defines the input data for the operator DAG. - For example, this may hold cached blocks from a previous Datastream execution, or + For example, this may hold cached blocks from a previous Dataset execution, or the arguments for read tasks. 
""" diff --git a/python/ray/data/_internal/execution/operators/limit_operator.py b/python/ray/data/_internal/execution/operators/limit_operator.py new file mode 100644 index 000000000000..80d66a57131a --- /dev/null +++ b/python/ray/data/_internal/execution/operators/limit_operator.py @@ -0,0 +1,108 @@ +import ray +import copy +from collections import deque +from ray.data.block import ( + Block, + BlockAccessor, + BlockMetadata, +) +from ray.data._internal.stats import StatsDict +from ray.data._internal.execution.interfaces import ( + PhysicalOperator, + RefBundle, +) +from ray.data._internal.remote_fn import cached_remote_fn +from ray.types import ObjectRef +from typing import ( + Deque, + List, + Optional, + Tuple, +) + + +class LimitOperator(PhysicalOperator): + """Physical operator for limit.""" + + def __init__( + self, + limit: int, + input_op: PhysicalOperator, + ): + self._limit = limit + self._consumed_rows = 0 + self._buffer: Deque[RefBundle] = deque() + self._name = f"Limit[limit={limit}]" + self._output_metadata: List[BlockMetadata] = [] + self._cur_output_bundles = 0 + super().__init__(self._name, [input_op]) + if self._limit <= 0: + self.inputs_done() + + def _limit_reached(self) -> bool: + return self._consumed_rows >= self._limit + + def need_more_inputs(self) -> bool: + return not self._limit_reached() + + def add_input(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0, input_index + if self._limit_reached(): + return + out_blocks: List[ObjectRef[Block]] = [] + out_metadata: List[BlockMetadata] = [] + for block, metadata in refs.blocks: + num_rows = metadata.num_rows + assert num_rows is not None + if self._consumed_rows + num_rows <= self._limit: + out_blocks.append(block) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows += num_rows + else: + # Slice the last block. 
+ def slice_fn(block, metadata, num_rows) -> Tuple[Block, BlockMetadata]: + block = BlockAccessor.for_block(block).slice(0, num_rows, copy=True) + metadata = copy.deepcopy(metadata) + metadata.num_rows = num_rows + metadata.size_bytes = BlockAccessor.for_block(block).size_bytes() + return block, metadata + + block, metadata_ref = cached_remote_fn(slice_fn, num_returns=2).remote( + block, + metadata, + self._limit - self._consumed_rows, + ) + out_blocks.append(block) + metadata = ray.get(metadata_ref) + out_metadata.append(metadata) + self._output_metadata.append(metadata) + self._consumed_rows = self._limit + break + self._cur_output_bundles += 1 + out_refs = RefBundle( + list(zip(out_blocks, out_metadata)), + owns_blocks=refs.owns_blocks, + ) + self._buffer.append(out_refs) + if self._limit_reached(): + self.inputs_done() + + def has_next(self) -> bool: + return len(self._buffer) > 0 + + def get_next(self) -> RefBundle: + return self._buffer.popleft() + + def get_stats(self) -> StatsDict: + return {self._name: self._output_metadata} + + def num_outputs_total(self) -> Optional[int]: + # Before inputs are completed (either because the limit is reached or + # because the inputs operators are done), we don't know how many output + # bundles we will have. + if self._inputs_complete: + return self._cur_output_bundles + else: + return None diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 5c580b5c7eac..d76b6a32af93 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -95,7 +95,7 @@ def create( min_rows_per_bundle: The number of rows to gather per batch passed to the transform_fn, or None to use the block size. Setting the batch size is important for the performance of GPU-accelerated transform functions. - The actual rows passed may be less if the datastream is small. 
+ The actual rows passed may be less if the dataset is small. ray_remote_args: Customize the ray remote args for this op's tasks. """ if compute_strategy is None: diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py index fe649b7fafb2..a5bdaa622cbc 100644 --- a/python/ray/data/_internal/execution/operators/output_splitter.py +++ b/python/ray/data/_internal/execution/operators/output_splitter.py @@ -21,7 +21,7 @@ class OutputSplitter(PhysicalOperator): The output bundles of this operator will have a `bundle.output_split_idx` attr set to an integer from [0..n-1]. This operator tries to divide the rows evenly across output splits. If the `equal` option is set, the operator will furthermore - guarantee an exact split of rows across outputs, truncating the Datastream. + guarantee an exact split of rows across outputs, truncating the Dataset. Implementation wise, this operator keeps an internal buffer of bundles. The buffer has a minimum size calculated to enable a good locality hit rate, as well as ensure diff --git a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py index d72f1afe6df0..89d51d7857ac 100644 --- a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py @@ -38,7 +38,7 @@ def __init__( min_rows_per_bundle: The number of rows to gather per batch passed to the transform_fn, or None to use the block size. Setting the batch size is important for the performance of GPU-accelerated transform functions. - The actual rows passed may be less if the datastream is small. + The actual rows passed may be less if the dataset is small. ray_remote_args: Customize the ray remote args for this op's tasks. 
""" super().__init__( diff --git a/python/ray/data/_internal/execution/operators/zip_operator.py b/python/ray/data/_internal/execution/operators/zip_operator.py index db2287042810..3238948ff3e4 100644 --- a/python/ray/data/_internal/execution/operators/zip_operator.py +++ b/python/ray/data/_internal/execution/operators/zip_operator.py @@ -121,7 +121,7 @@ def _zip( total_right_rows = sum(right_block_rows) if total_left_rows != total_right_rows: raise ValueError( - "Cannot zip datastreams of different number of rows: " + "Cannot zip datasets of different number of rows: " f"{total_left_rows}, {total_right_rows}" ) diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index 9c3676eb65d6..d4ced149c440 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -6,7 +6,7 @@ import ray from ray.data.context import DataContext -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger from ray.data._internal.execution.interfaces import ( Executor, ExecutionOptions, @@ -30,9 +30,9 @@ get_or_create_autoscaling_requester_actor, ) from ray.data._internal.progress_bar import ProgressBar -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats -logger = DatastreamLogger(__name__) +logger = DatasetLogger(__name__) # Set this environment variable for detailed scheduler debugging logs. DEBUG_TRACE_SCHEDULING = "RAY_DATA_TRACE_SCHEDULING" in os.environ @@ -41,19 +41,22 @@ # progress bar seeming to stall for very large scale workloads. PROGRESS_BAR_UPDATE_INTERVAL = 50 +# Visible for testing. +_num_shutdown = 0 + class StreamingExecutor(Executor, threading.Thread): - """A streaming Datastream executor. + """A streaming Dataset executor. - This implementation executes Datastream DAGs in a fully streamed way. 
It runs + This implementation executes Dataset DAGs in a fully streamed way. It runs by setting up the operator topology, and then routing blocks through operators in a way that maximizes throughput under resource constraints. """ def __init__(self, options: ExecutionOptions): self._start_time: Optional[float] = None - self._initial_stats: Optional[DatastreamStats] = None - self._final_stats: Optional[DatastreamStats] = None + self._initial_stats: Optional[DatasetStats] = None + self._final_stats: Optional[DatasetStats] = None self._global_info: Optional[ProgressBar] = None self._execution_id = uuid.uuid4().hex @@ -70,16 +73,17 @@ def __init__(self, options: ExecutionOptions): self._output_node: Optional[OpState] = None Executor.__init__(self, options) - threading.Thread.__init__(self) + threading.Thread.__init__(self, daemon=True) def execute( - self, dag: PhysicalOperator, initial_stats: Optional[DatastreamStats] = None + self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None ) -> Iterator[RefBundle]: """Executes the DAG using a streaming execution strategy. We take an event-loop approach to scheduling. We block on the next scheduling event using `ray.wait`, updating operator state and dispatching new tasks. """ + self._initial_stats = initial_stats self._start_time = time.perf_counter() @@ -88,8 +92,9 @@ def execute( logger.get_logger().info("Execution config: %s", self._options) if not self._options.verbose_progress: logger.get_logger().info( - "Tip: To enable per-operator progress reporting, set " - "RAY_DATA_VERBOSE_PROGRESS=1." + "Tip: For detailed progress reporting, run " + "`ray.data.DataContext.get_current()." + "execution_options.verbose_progress = True`" ) # Setup the streaming DAG topology and start the runner thread. @@ -115,7 +120,10 @@ def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: # Translate the special sentinel values for MaybeRefBundle into # exceptions. 
if item is None: - raise StopIteration + if self._outer._shutdown: + raise StopIteration(f"{self._outer} is shutdown.") + else: + raise StopIteration elif isinstance(item, Exception): raise item else: @@ -132,9 +140,14 @@ def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: return StreamIterator(self) def shutdown(self): + context = DataContext.get_current() + global _num_shutdown + with self._shutdown_lock: if self._shutdown: return + logger.get_logger().info(f"Shutting down {self}.") + _num_shutdown += 1 self._shutdown = True # Give the scheduling loop some time to finish processing. self.join(timeout=2.0) @@ -143,7 +156,6 @@ def shutdown(self): stats_summary_string = self._final_stats.to_summary().to_string( include_parent=False ) - context = DataContext.get_current() logger.get_logger(log_to_stdout=context.enable_auto_log_stats).info( stats_summary_string, ) @@ -184,9 +196,9 @@ def get_stats(self): else: return self._generate_stats() - def _generate_stats(self) -> DatastreamStats: + def _generate_stats(self) -> DatasetStats: """Create a new stats object reflecting execution status so far.""" - stats = self._initial_stats or DatastreamStats(stages={}, parent=None) + stats = self._initial_stats or DatasetStats(stages={}, parent=None) for op in self._topology: if isinstance(op, InputDataBuffer): continue diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py index 24d815bd4e7c..05d32a2c2103 100644 --- a/python/ray/data/_internal/execution/streaming_executor_state.py +++ b/python/ray/data/_internal/execution/streaming_executor_state.py @@ -121,6 +121,7 @@ def __init__(self, op: PhysicalOperator, inqueues: List[Deque[MaybeRefBundle]]): self.progress_bar = None self.num_completed_tasks = 0 self.inputs_done_called = False + self.dependents_completed_called = False def initialize_progress_bars(self, index: int, verbose_progress: bool) -> int: """Create progress 
bars at the given index (line offset in console). @@ -334,16 +335,31 @@ def process_completed_tasks(topology: Topology) -> None: # Call inputs_done() on ops where no more inputs are coming. for op, op_state in topology.items(): + if op_state.inputs_done_called: + continue inputs_done = all( [ dep.completed() and not topology[dep].outqueue for dep in op.input_dependencies ] ) - if inputs_done and not op_state.inputs_done_called: + if inputs_done: op.inputs_done() op_state.inputs_done_called = True + # Traverse the topology in reverse topological order. + # For each op, if all of its downstream operators don't need any more inputs, + # call all_dependents_complete() to also complete this op. + for op, op_state in reversed(list(topology.items())): + if op_state.dependents_completed_called: + continue + dependents_completed = len(op.output_dependencies) > 0 and all( + not dep.need_more_inputs() for dep in op.output_dependencies + ) + if dependents_completed: + op.all_dependents_complete() + op_state.dependents_completed_called = True + def select_operator_to_run( topology: Topology, @@ -372,7 +388,12 @@ def select_operator_to_run( ops = [] for op, state in topology.items(): under_resource_limits = _execution_allowed(op, cur_usage, limits) - if state.num_queued() > 0 and op.should_add_input() and under_resource_limits: + if ( + op.need_more_inputs() + and state.num_queued() > 0 + and op.should_add_input() + and under_resource_limits + ): ops.append(op) # Update the op in all cases to enable internal autoscaling, etc. op.notify_resource_usage(state.num_queued(), under_resource_limits) @@ -396,7 +417,11 @@ def select_operator_to_run( and all(op.num_active_work_refs() == 0 for op in topology) ): # The topology is entirely idle, so choose from all ready ops ignoring limits. - ops = [op for op, state in topology.items() if state.num_queued() > 0] + ops = [ + op + for op, state in topology.items() + if op.need_more_inputs() and state.num_queued() > 0 + ] # Nothing to run. 
if not ops: diff --git a/python/ray/data/_internal/execution/util.py b/python/ray/data/_internal/execution/util.py index 8674ed60cccf..4a87f12c0a4a 100644 --- a/python/ray/data/_internal/execution/util.py +++ b/python/ray/data/_internal/execution/util.py @@ -1,22 +1,24 @@ from concurrent.futures import ThreadPoolExecutor -from typing import List, TYPE_CHECKING +from typing import List, Any, TYPE_CHECKING import ray -from ray.data.block import Block, BlockAccessor, CallableClass +from ray.data.block import BlockAccessor, CallableClass if TYPE_CHECKING: from ray.data._internal.execution.interfaces import RefBundle -def make_ref_bundles(simple_data: List[Block]) -> List["RefBundle"]: +def make_ref_bundles(simple_data: List[List[Any]]) -> List["RefBundle"]: """Create ref bundles from a list of block data. One bundle is created for each input block. """ from ray.data._internal.execution.interfaces import RefBundle + import pandas as pd output = [] for block in simple_data: + block = pd.DataFrame({"id": block}) output.append( RefBundle( [ diff --git a/python/ray/data/_internal/fast_repartition.py b/python/ray/data/_internal/fast_repartition.py index 461dd8d683df..06c53877cace 100644 --- a/python/ray/data/_internal/fast_repartition.py +++ b/python/ray/data/_internal/fast_repartition.py @@ -8,16 +8,16 @@ from ray.data._internal.progress_bar import ProgressBar from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.shuffle_and_partition import _ShufflePartitionOp -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats def fast_repartition(blocks, num_blocks, ctx: Optional[TaskContext] = None): - from ray.data.datastream import Datastream + from ray.data.dataset import Dataset, Schema - wrapped_ds = Datastream( + wrapped_ds = Dataset( ExecutionPlan( blocks, - DatastreamStats(stages={}, parent=None), + DatasetStats(stages={}, parent=None), run_by_consumer=blocks._owned_by_consumer, ), 0, @@ -59,8 
+59,10 @@ def fast_repartition(blocks, num_blocks, ctx: Optional[TaskContext] = None): owned_by_consumer = blocks._owned_by_consumer # Schema is safe to fetch here since we have already called - # get_internal_block_refs and executed the datastream. + # get_internal_block_refs and executed the dataset. schema = wrapped_ds.schema(fetch_if_missing=True) + if isinstance(schema, Schema): + schema = schema.base_schema # Early-release memory. del splits, blocks, wrapped_ds @@ -84,8 +86,8 @@ def fast_repartition(blocks, num_blocks, ctx: Optional[TaskContext] = None): if schema is None: raise ValueError( - "Datastream is empty or cleared, can't determine the format of " - "the datastream." + "Dataset is empty or cleared, can't determine the format of " + "the dataset." ) elif isinstance(schema, type): builder = SimpleBlockBuilder() diff --git a/python/ray/data/_internal/iterator/iterator_impl.py b/python/ray/data/_internal/iterator/iterator_impl.py index c97131d920fd..2dc69a34f649 100644 --- a/python/ray/data/_internal/iterator/iterator_impl.py +++ b/python/ray/data/_internal/iterator/iterator_impl.py @@ -2,54 +2,52 @@ from ray.types import ObjectRef from ray.data.block import Block, BlockMetadata -from ray.data.context import DataContext from ray.data.iterator import DataIterator -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats if TYPE_CHECKING: import pyarrow - from ray.data import Datastream + from ray.data import Dataset class DataIteratorImpl(DataIterator): def __init__( self, - base_datastream: "Datastream", + base_dataset: "Dataset", ): - self._base_datastream = base_datastream - self._base_context = DataContext.get_current() + self._base_dataset = base_dataset def __repr__(self) -> str: - return f"DataIterator({self._base_datastream})" + return f"DataIterator({self._base_dataset})" def _to_block_iterator( self, ) -> Tuple[ Iterator[Tuple[ObjectRef[Block], BlockMetadata]], - Optional[DatastreamStats], + 
Optional[DatasetStats], bool, ]: - ds = self._base_datastream + ds = self._base_dataset block_iterator, stats, executor = ds._plan.execute_to_iterator() ds._current_executor = executor return block_iterator, stats, False def stats(self) -> str: - return self._base_datastream.stats() + return self._base_dataset.stats() def schema(self) -> Union[type, "pyarrow.lib.Schema"]: - return self._base_datastream.schema() + return self._base_dataset.schema() def __getattr__(self, name): - if name == "_base_datastream": + if name == "_base_dataset": raise AttributeError() - if hasattr(self._base_datastream, name) and not name.startswith("_"): + if hasattr(self._base_dataset, name) and not name.startswith("_"): # Raise error for backwards compatibility. # TODO: remove this method in 2.6. raise DeprecationWarning( "session.get_dataset_shard returns a ray.data.DataIterator " - "instead of a Datastream/DatasetPipeline as of Ray v2.3. " + "instead of a Dataset/DatasetPipeline as of Ray v2.3. " "Use iter_torch_batches(), to_tf(), or iter_batches() to " "iterate over one epoch. 
See " "https://docs.ray.io/en/latest/data/api/dataset_iterator.html " diff --git a/python/ray/data/_internal/iterator/pipelined_iterator.py b/python/ray/data/_internal/iterator/pipelined_iterator.py index 591964fc72e9..011fea818d78 100644 --- a/python/ray/data/_internal/iterator/pipelined_iterator.py +++ b/python/ray/data/_internal/iterator/pipelined_iterator.py @@ -3,7 +3,7 @@ from ray.types import ObjectRef from ray.data.block import Block, BlockMetadata, DataBatch from ray.data.iterator import DataIterator -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats if TYPE_CHECKING: import pyarrow @@ -21,7 +21,7 @@ def __init__( def __repr__(self) -> str: return f"DataIterator({self._base_dataset_pipeline})" - def _get_next_datastream(self) -> "DatasetPipeline": + def _get_next_dataset(self) -> "DatasetPipeline": if self._epoch_iterator is None: self._epoch_iterator = self._base_dataset_pipeline.iter_epochs() @@ -32,18 +32,18 @@ def _to_block_iterator( self, ) -> Tuple[ Iterator[Tuple[ObjectRef[Block], BlockMetadata]], - Optional[DatastreamStats], + Optional[DatasetStats], bool, ]: - epoch_pipeline = self._get_next_datastream() + epoch_pipeline = self._get_next_dataset() - # Peek the first datastream from the pipeline to see if blocks are owned + # Peek the first dataset from the pipeline to see if blocks are owned # by consumer. If so, the blocks are safe to be eagerly cleared after use # because memories are not shared across different consumers. This will # improve the memory efficiency. - if epoch_pipeline._first_datastream is not None: + if epoch_pipeline._first_dataset is not None: blocks_owned_by_consumer = ( - epoch_pipeline._first_datastream._plan.execute()._owned_by_consumer + epoch_pipeline._first_dataset._plan.execute()._owned_by_consumer ) else: blocks_owned_by_consumer = ( @@ -96,7 +96,7 @@ def __getattr__(self, name): # TODO: remove this method in 2.6. 
raise DeprecationWarning( "session.get_dataset_shard returns a ray.data.DataIterator " - "instead of a Datastream/DatasetPipeline as of Ray v2.3. " + "instead of a Dataset/DatasetPipeline as of Ray v2.3. " "Use iter_torch_batches(), to_tf(), or iter_batches() to " "iterate over one epoch. See " "https://docs.ray.io/en/latest/data/api/dataset_iterator.html " diff --git a/python/ray/data/_internal/iterator/stream_split_iterator.py b/python/ray/data/_internal/iterator/stream_split_iterator.py index 6fb3a23a1b93..db3dec49fe5a 100644 --- a/python/ray/data/_internal/iterator/stream_split_iterator.py +++ b/python/ray/data/_internal/iterator/stream_split_iterator.py @@ -23,14 +23,14 @@ ) from ray.data._internal.execution.operators.output_splitter import OutputSplitter from ray.data._internal.execution.interfaces import NodeIdStr, RefBundle -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats from ray.types import ObjectRef from ray.util.debug import log_once from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy if TYPE_CHECKING: import pyarrow - from ray.data import Datastream + from ray.data import Dataset logger = logging.getLogger(__name__) @@ -43,14 +43,14 @@ class StreamSplitDataIterator(DataIterator): @staticmethod def create( - base_datastream: "Datastream", + base_dataset: "Dataset", n: int, equal: bool, locality_hints: Optional[List[NodeIdStr]], ) -> List["StreamSplitDataIterator"]: - """Create a split iterator from the given base Datastream and options. + """Create a split iterator from the given base Dataset and options. - See also: `Datastream.streaming_split`. + See also: `Dataset.streaming_split`. 
""" ctx = DataContext.get_current() @@ -60,19 +60,17 @@ def create( scheduling_strategy=NodeAffinitySchedulingStrategy( ray.get_runtime_context().get_node_id(), soft=False ), - ).remote(ctx, base_datastream, n, equal, locality_hints) + ).remote(ctx, base_dataset, n, equal, locality_hints) - return [ - StreamSplitDataIterator(base_datastream, coord_actor, i) for i in range(n) - ] + return [StreamSplitDataIterator(base_dataset, coord_actor, i) for i in range(n)] def __init__( self, - base_datastream: "Datastream", + base_dataset: "Dataset", coord_actor: ray.actor.ActorHandle, output_split_idx: int, ): - self._base_datastream = base_datastream + self._base_dataset = base_dataset self._coord_actor = coord_actor self._output_split_idx = output_split_idx @@ -80,7 +78,7 @@ def _to_block_iterator( self, ) -> Tuple[ Iterator[Tuple[ObjectRef[Block], BlockMetadata]], - Optional[DatastreamStats], + Optional[DatasetStats], bool, ]: def gen_blocks() -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]: @@ -106,11 +104,11 @@ def gen_blocks() -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]: def stats(self) -> str: """Implements DataIterator.""" - return self._base_datastream.stats() + return self._base_dataset.stats() def schema(self) -> Union[type, "pyarrow.lib.Schema"]: """Implements DataIterator.""" - return self._base_datastream.schema() + return self._base_dataset.schema() @ray.remote(num_cpus=0) @@ -124,7 +122,7 @@ class SplitCoordinator: def __init__( self, ctx: DataContext, - datastream: "Datastream", + dataset: "Dataset", n: int, equal: bool, locality_hints: Optional[List[NodeIdStr]], @@ -135,7 +133,7 @@ def __init__( logger.info(f"Auto configuring locality_with_output={locality_hints}") DataContext._set_current(ctx) - self._base_datastream = datastream + self._base_dataset = dataset self._n = n self._equal = equal self._locality_hints = locality_hints @@ -155,9 +153,9 @@ def add_split_op(dag): output_iterator = execute_to_legacy_bundle_iterator( executor, - 
datastream._plan, + dataset._plan, True, - datastream._plan._datastream_uuid, + dataset._plan._dataset_uuid, dag_rewrite=add_split_op, ) yield output_iterator @@ -186,7 +184,7 @@ def get( if epoch_id != self._cur_epoch: raise ValueError( - "Invalid iterator: the datastream has moved on to another epoch." + "Invalid iterator: the dataset has moved on to another epoch." ) try: diff --git a/python/ray/data/_internal/lazy_block_list.py b/python/ray/data/_internal/lazy_block_list.py index 23331ddd1773..bd3513dd3d90 100644 --- a/python/ray/data/_internal/lazy_block_list.py +++ b/python/ray/data/_internal/lazy_block_list.py @@ -7,7 +7,7 @@ from ray.data._internal.progress_bar import ProgressBar from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.memory_tracing import trace_allocation -from ray.data._internal.stats import DatastreamStats, _get_or_create_stats_actor +from ray.data._internal.stats import DatasetStats, _get_or_create_stats_actor from ray.data._internal.util import _split_list from ray.data.block import ( Block, @@ -26,7 +26,7 @@ class LazyBlockList(BlockList): """A BlockList that submits tasks lazily on-demand. This BlockList is used for implementing read operations (e.g., to avoid - needing to read all files of a Datastream when the user is just wanting to + needing to read all files of a Dataset when the user is just wanting to .take() the first few rows or view the schema). """ @@ -58,7 +58,7 @@ def __init__( in cached_metadata represents the list of output blocks metadata per the read task. One task can produce multiple output blocks. ray_remote_args: Ray remote arguments for the read tasks. - stats_uuid: UUID for the datastream stats, used to group and fetch read task + stats_uuid: UUID for the dataset stats, used to group and fetch read task stats. If not provided, a new UUID will be created. 
""" self._tasks = tasks @@ -119,10 +119,10 @@ def get_metadata(self, fetch_if_missing: bool = False) -> List[BlockMetadata]: _, metadata = self._get_blocks_with_metadata() return metadata - def stats(self) -> DatastreamStats: - """Create DatastreamStats for this LazyBlockList.""" - return DatastreamStats( - # Make a copy of metadata, as the DatastreamStats may mutate it in-place. + def stats(self) -> DatasetStats: + """Create DatasetStats for this LazyBlockList.""" + return DatasetStats( + # Make a copy of metadata, as the DatasetStats may mutate it in-place. stages={"Read": self.get_metadata(fetch_if_missing=False).copy()}, parent=None, needs_stats_actor=True, @@ -315,7 +315,7 @@ def _get_blocks_with_metadata( if context.block_splitting_enabled: # If block splitting is enabled, fetch the partitions through generator. read_progress_bar = ProgressBar("Read progress", total=len(block_refs)) - # Handle duplicates (e.g. due to unioning the same datastream). + # Handle duplicates (e.g. due to unioning the same dataset). unique_refs = list(set(block_refs)) generators = read_progress_bar.fetch_until_complete(unique_refs) @@ -341,7 +341,7 @@ def _get_blocks_with_metadata( return [], [] read_progress_bar = ProgressBar("Read progress", total=len(meta_refs)) # Fetch the metadata in bulk. - # Handle duplicates (e.g. due to unioning the same datastream). + # Handle duplicates (e.g. due to unioning the same dataset). unique_meta_refs = set(meta_refs) metadata = read_progress_bar.fetch_until_complete(list(unique_meta_refs)) ref_to_data = { @@ -359,7 +359,7 @@ def compute_first_block(self): """Kick off computation for the first block in the list. This is useful if looking to support rapid lightweight interaction with a small - amount of the datastream. + amount of the dataset. 
""" if self._tasks: self._get_or_compute(0) @@ -385,7 +385,7 @@ def ensure_metadata_for_first_block(self) -> Optional[BlockMetadata]: try: block_partition_ref, metadata_ref = next(self._iter_block_partition_refs()) except (StopIteration, ValueError): - # Datastream is empty (no blocks) or was manually cleared. + # Dataset is empty (no blocks) or was manually cleared. pass else: # This blocks until the underlying read task is finished. diff --git a/python/ray/data/_internal/logical/interfaces.py b/python/ray/data/_internal/logical/interfaces.py index 0bd800a78681..7fd5b4662193 100644 --- a/python/ray/data/_internal/logical/interfaces.py +++ b/python/ray/data/_internal/logical/interfaces.py @@ -7,14 +7,16 @@ class Operator: """Abstract class for operators. - Operators live on the driver side of the Datastream only. + Operators live on the driver side of the Dataset only. """ def __init__(self, name: str, input_dependencies: List["Operator"]): self._name = name self._input_dependencies = input_dependencies + self._output_dependencies = [] for x in input_dependencies: assert isinstance(x, Operator), x + x._output_dependencies.append(self) @property def name(self) -> str: @@ -28,6 +30,14 @@ def input_dependencies(self) -> List["Operator"]: ), "Operator.__init__() was not called." return self._input_dependencies + @property + def output_dependencies(self) -> List["Operator"]: + """List of operators that consume outputs from this operator.""" + assert hasattr( + self, "_output_dependencies" + ), "Operator.__init__() was not called." 
+ return self._output_dependencies + def post_order_iter(self) -> Iterator["Operator"]: """Depth-first traversal of this operator and its input dependencies.""" for op in self.input_dependencies: diff --git a/python/ray/data/_internal/logical/operators/all_to_all_operator.py b/python/ray/data/_internal/logical/operators/all_to_all_operator.py index 9dacd39ad5ec..a22851396a84 100644 --- a/python/ray/data/_internal/logical/operators/all_to_all_operator.py +++ b/python/ray/data/_internal/logical/operators/all_to_all_operator.py @@ -2,7 +2,6 @@ from ray.data._internal.logical.interfaces import LogicalOperator from ray.data.aggregate import AggregateFn -from ray.data.block import KeyFn class AbstractAllToAll(LogicalOperator): @@ -20,7 +19,7 @@ def __init__( """ Args: name: Name for this operator. This is the name that will appear when - inspecting the logical plan of a Datastream. + inspecting the logical plan of a Dataset. input_op: The operator preceding this operator in the plan DAG. The outputs of `input_op` will be the inputs to this operator. 
num_outputs: The number of expected output bundles outputted by this @@ -53,12 +52,13 @@ class RandomShuffle(AbstractAllToAll): def __init__( self, input_op: LogicalOperator, + name: str = "RandomShuffle", seed: Optional[int] = None, num_outputs: Optional[int] = None, ray_remote_args: Optional[Dict[str, Any]] = None, ): super().__init__( - "RandomShuffle", + name, input_op, num_outputs=num_outputs, ray_remote_args=ray_remote_args, @@ -89,7 +89,7 @@ class Sort(AbstractAllToAll): def __init__( self, input_op: LogicalOperator, - key: Optional[KeyFn], + key: Optional[str], descending: bool, ): super().__init__( @@ -106,7 +106,7 @@ class Aggregate(AbstractAllToAll): def __init__( self, input_op: LogicalOperator, - key: Optional[KeyFn], + key: Optional[str], aggs: List[AggregateFn], ): super().__init__( diff --git a/python/ray/data/_internal/logical/operators/limit_operator.py b/python/ray/data/_internal/logical/operators/limit_operator.py new file mode 100644 index 000000000000..c7d9690ad8b7 --- /dev/null +++ b/python/ray/data/_internal/logical/operators/limit_operator.py @@ -0,0 +1,16 @@ +from ray.data._internal.logical.interfaces import LogicalOperator + + +class Limit(LogicalOperator): + """Logical operator for limit.""" + + def __init__( + self, + input_op: LogicalOperator, + limit: int, + ): + super().__init__( + "Limit", + [input_op], + ) + self._limit = limit diff --git a/python/ray/data/_internal/logical/operators/map_operator.py b/python/ray/data/_internal/logical/operators/map_operator.py index 133e2ff59d1a..d3cd66080119 100644 --- a/python/ray/data/_internal/logical/operators/map_operator.py +++ b/python/ray/data/_internal/logical/operators/map_operator.py @@ -1,11 +1,8 @@ from typing import Any, Dict, Iterable, Optional, Union from ray.data._internal.logical.interfaces import LogicalOperator -from ray.data._internal.compute import ( - UDF, - ComputeStrategy, -) -from ray.data.block import BatchUDF, RowUDF +from ray.data._internal.compute import 
ComputeStrategy, TaskPoolStrategy +from ray.data.block import UserDefinedFunction from ray.data.context import DEFAULT_BATCH_SIZE @@ -23,7 +20,7 @@ def __init__( """ Args: name: Name for this operator. This is the name that will appear when - inspecting the logical plan of a Datastream. + inspecting the logical plan of a Dataset. input_op: The operator preceding this operator in the plan DAG. The outputs of `input_op` will be the inputs to this operator. ray_remote_args: Args to provide to ray.remote. @@ -41,7 +38,7 @@ def __init__( self, name: str, input_op: LogicalOperator, - fn: UDF, + fn: UserDefinedFunction, fn_args: Optional[Iterable[Any]] = None, fn_kwargs: Optional[Dict[str, Any]] = None, fn_constructor_args: Optional[Iterable[Any]] = None, @@ -53,7 +50,7 @@ def __init__( """ Args: name: Name for this operator. This is the name that will appear when - inspecting the logical plan of a Datastream. + inspecting the logical plan of a Dataset. input_op: The operator preceding this operator in the plan DAG. The outputs of `input_op` will be the inputs to this operator. fn: User-defined function to be called. 
@@ -75,7 +72,7 @@ def __init__( self._fn_constructor_args = fn_constructor_args self._fn_constructor_kwargs = fn_constructor_kwargs self._target_block_size = target_block_size - self._compute = compute or "tasks" + self._compute = compute or TaskPoolStrategy() class MapBatches(AbstractUDFMap): @@ -84,7 +81,7 @@ class MapBatches(AbstractUDFMap): def __init__( self, input_op: LogicalOperator, - fn: BatchUDF, + fn: UserDefinedFunction, batch_size: Optional[int] = DEFAULT_BATCH_SIZE, batch_format: Optional[str] = "default", zero_copy_batch: bool = False, @@ -119,7 +116,7 @@ class MapRows(AbstractUDFMap): def __init__( self, input_op: LogicalOperator, - fn: RowUDF, + fn: UserDefinedFunction, compute: Optional[Union[str, ComputeStrategy]] = None, ray_remote_args: Optional[Dict[str, Any]] = None, ): @@ -138,7 +135,7 @@ class Filter(AbstractUDFMap): def __init__( self, input_op: LogicalOperator, - fn: RowUDF, + fn: UserDefinedFunction, compute: Optional[Union[str, ComputeStrategy]] = None, ray_remote_args: Optional[Dict[str, Any]] = None, ): @@ -157,7 +154,7 @@ class FlatMap(AbstractUDFMap): def __init__( self, input_op: LogicalOperator, - fn: RowUDF, + fn: UserDefinedFunction, compute: Optional[Union[str, ComputeStrategy]] = None, ray_remote_args: Optional[Dict[str, Any]] = None, ): diff --git a/python/ray/data/_internal/logical/operators/write_operator.py b/python/ray/data/_internal/logical/operators/write_operator.py index f85b513e37f1..83b997933eb1 100644 --- a/python/ray/data/_internal/logical/operators/write_operator.py +++ b/python/ray/data/_internal/logical/operators/write_operator.py @@ -2,6 +2,7 @@ from ray.data._internal.logical.interfaces import LogicalOperator from ray.data._internal.logical.operators.map_operator import AbstractMap +from ray.data._internal.compute import TaskPoolStrategy from ray.data.datasource.datasource import Datasource @@ -23,6 +24,6 @@ def __init__( self._datasource = datasource self._write_args = write_args # Always use task to write. 
- self._compute = "tasks" + self._compute = TaskPoolStrategy() # Take the input blocks unchanged while writing. self._target_block_size = float("inf") diff --git a/python/ray/data/_internal/logical/rules/operator_fusion.py b/python/ray/data/_internal/logical/rules/operator_fusion.py index ea8e91dc6b59..41b61538bb5a 100644 --- a/python/ray/data/_internal/logical/rules/operator_fusion.py +++ b/python/ray/data/_internal/logical/rules/operator_fusion.py @@ -1,11 +1,24 @@ -from typing import Iterator +from typing import Iterator, List, Tuple +from ray.data._internal.logical.operators.all_to_all_operator import Repartition +from ray.data._internal.execution.operators.map_operator import MapOperator +from ray.data._internal.logical.operators.all_to_all_operator import ( + AbstractAllToAll, + RandomShuffle, +) +from ray.data._internal.stats import StatsDict from ray.data.block import Block # TODO(Clark): Remove compute dependency once we delete the legacy compute. from ray.data._internal.compute import is_task_compute, CallableClass, get_compute -from ray.data._internal.execution.interfaces import PhysicalOperator, TaskContext +from ray.data._internal.execution.interfaces import ( + PhysicalOperator, + RefBundle, + TaskContext, +) from ray.data._internal.logical.interfaces import Rule, PhysicalPlan +from ray.data._internal.execution.operators.all_to_all_operator import AllToAllOperator +from ray.data._internal.logical.operators.map_operator import AbstractUDFMap # Scheduling strategy can be inherited from upstream operator if not specified. @@ -17,35 +30,72 @@ class OperatorFusionRule(Rule): def apply(self, plan: PhysicalPlan) -> PhysicalPlan: self._op_map = plan.op_map.copy() - # Do DFS fusion. - root = self._apply(plan.dag) - return PhysicalPlan(root, self._op_map) + # Do DFS fusion on compatible pairwise operators in two passes. + # In the first pass, only fuse back-to-back map operators together. 
+ fused_dag = self._fuse_map_operators_in_dag(plan.dag) - def _apply(self, op: PhysicalOperator) -> PhysicalOperator: - """Performs DFS fusion of linear chains of physical map operators, provided that - they are pairwise-compatible. + # Now that we have fused together all back-to-back map operators, + # we fuse together MapOperator -> AllToAllOperator pairs. + fused_dag = self._fuse_all_to_all_operators_in_dag(fused_dag) - Args: - op: The op that we're trying to fuse with its input. + return PhysicalPlan(fused_dag, self._op_map) + + def _fuse_map_operators_in_dag(self, dag: PhysicalOperator) -> MapOperator: + """Starting at the given operator, traverses up the DAG of operators + and recursively fuses compatible MapOperator -> MapOperator pairs. + Returns the current (root) operator after completing upstream operator fusions. """ - upstream_ops = op.input_dependencies - # Fuse with upstream ops while possible. - while len(upstream_ops) == 1 and self._can_fuse(op, upstream_ops[0]): + upstream_ops = dag.input_dependencies + while ( + len(upstream_ops) == 1 + and isinstance(dag, MapOperator) + and isinstance(upstream_ops[0], MapOperator) + and self._can_fuse(dag, upstream_ops[0]) + ): # Fuse operator with its upstream op. - op = self._fuse(op, upstream_ops[0]) - upstream_ops = op.input_dependencies - # Can no longer fuse with upstream ops, proceed up the DAG. - op._input_dependencies = [ - self._apply(upstream_op) for upstream_op in upstream_ops + dag = self._get_fused_map_operator(dag, upstream_ops[0]) + upstream_ops = dag.input_dependencies + + # Done fusing back-to-back map operators together here, + # move up the DAG to find the next map operators to fuse. 
+ dag._input_dependencies = [ + self._fuse_map_operators_in_dag(upstream_op) for upstream_op in upstream_ops ] - return op + return dag + + def _fuse_all_to_all_operators_in_dag( + self, dag: AllToAllOperator + ) -> AllToAllOperator: + """Starting at the given operator, traverses up the DAG of operators + and recursively fuses compatible MapOperator -> AllToAllOperator pairs. + Returns the current (root) operator after completing upstream operator fusions. + """ + upstream_ops = dag.input_dependencies + while ( + len(upstream_ops) == 1 + and isinstance(dag, AllToAllOperator) + and isinstance(upstream_ops[0], MapOperator) + and self._can_fuse(dag, upstream_ops[0]) + ): + # Fuse operator with its upstream op. + dag = self._get_fused_all_to_all_operator(dag, upstream_ops[0]) + upstream_ops = dag.input_dependencies + + # Done fusing MapOperator -> AllToAllOperator together here, + # move up the DAG to find the next pair of operators to fuse. + dag._input_dependencies = [ + self._fuse_all_to_all_operators_in_dag(upstream_op) + for upstream_op in upstream_ops + ] + return dag def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool: """Returns whether the provided downstream operator can be fused with the given upstream operator. We currently support fusing two operators if the following are all true: - * They are both MapOperators. + * We are fusing either MapOperator -> MapOperator or + MapOperator -> AllToAllOperator. * They either use the same compute configuration, or the upstream operator uses a task pool while the downstream operator uses an actor pool. * If both operators involve callable classes, the callable classes are @@ -56,8 +106,13 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool: from ray.data._internal.logical.operators.map_operator import AbstractMap from ray.data._internal.logical.operators.map_operator import AbstractUDFMap - # We only support fusing MapOperators. 
- if not isinstance(down_op, MapOperator) or not isinstance(up_op, MapOperator): + # We currently only support fusing for the following cases: + # - MapOperator -> MapOperator + # - MapOperator -> AllToAllOperator + # (only RandomShuffle and Repartition LogicalOperators are currently supported) + if not isinstance(down_op, (MapOperator, AllToAllOperator)) or not isinstance( + up_op, MapOperator + ): return False down_logical_op = self._op_map[down_op] @@ -68,17 +123,26 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool: if not down_logical_op._input_dependencies: return False - # We only support fusing AbstractMap -> AbstractMap operators. - if not isinstance(down_logical_op, AbstractMap) or not isinstance( - up_logical_op, AbstractMap - ): + # We currently only support fusing for the following cases: + # - AbstractMap -> AbstractMap + # - AbstractMap -> RandomShuffle + # - AbstractMap -> Repartition (shuffle=True) + if not isinstance( + down_logical_op, (AbstractMap, RandomShuffle, Repartition) + ) or not isinstance(up_logical_op, AbstractMap): + return False + + # Do not fuse Repartition operator if shuffle is disabled + # (i.e. using split shuffle). + if isinstance(down_logical_op, Repartition) and not down_logical_op._shuffle: return False # Allow fusing tasks->actors if the resources are compatible (read->map), but # not the other way around. The latter (downstream op) will be used as the # compute if fused. if ( - is_task_compute(down_logical_op._compute) + isinstance(down_logical_op, AbstractUDFMap) + and is_task_compute(down_logical_op._compute) and isinstance(up_logical_op, AbstractUDFMap) and get_compute(up_logical_op._compute) != get_compute(down_logical_op._compute) @@ -116,12 +180,13 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool: # Otherwise, ops are compatible for fusion. 
return True - def _fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator): - """Fuse the downstream operator with its upstream operator.""" - from ray.data._internal.execution.operators.map_operator import MapOperator - from ray.data._internal.logical.operators.map_operator import AbstractUDFMap - - assert self._can_fuse(down_op, up_op) + def _get_fused_map_operator( + self, down_op: MapOperator, up_op: MapOperator + ) -> MapOperator: + assert self._can_fuse(down_op, up_op), ( + "Current rule supports fusing MapOperator->MapOperator, but received: " + f"{type(up_op).__name__} -> {type(down_op).__name__}" + ) # Fuse operator names. name = up_op.name + "->" + down_op.name @@ -147,9 +212,11 @@ def _fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator): down_transform_fn = down_op.get_transformation_fn() up_transform_fn = up_op.get_transformation_fn() - def transform_fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: + def fused_map_transform_fn( + blocks: Iterator[Block], ctx: TaskContext + ) -> Iterator[Block]: blocks = up_transform_fn(blocks, ctx) - # TODO(Clark): Add zero-copy batching between transform functions. + # TODO(Scott): Add zero-copy batching between transform functions. return down_transform_fn(blocks, ctx) # We take the downstream op's compute in case we're fusing upstream tasks with a @@ -163,7 +230,7 @@ def transform_fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: # Fused physical map operator. op = MapOperator.create( - transform_fn, + fused_map_transform_fn, input_op, name=name, compute_strategy=compute, @@ -172,7 +239,7 @@ def transform_fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: ) # Build a map logical operator to be used as a reference for further fusion. - # TODO(Clark): This is hacky, remove this once we push fusion to be purely based + # TODO(Scott): This is hacky, remove this once we push fusion to be purely based # on a lower-level operator spec. 
if isinstance(up_logical_op, AbstractUDFMap): input_op = up_logical_op.input_dependencies[0] @@ -205,6 +272,63 @@ def transform_fn(blocks: Iterator[Block], ctx: TaskContext) -> Iterator[Block]: # Return the fused physical operator. return op + def _get_fused_all_to_all_operator( + self, down_op: AllToAllOperator, up_op: MapOperator + ) -> AllToAllOperator: + assert self._can_fuse(down_op, up_op), ( + "Current rule supports fusing MapOperator -> AllToAllOperator" + f", but received: {type(up_op).__name__} -> {type(down_op).__name__}" + ) + + # Fuse operator names. + name = up_op.name + "->" + down_op.name + + down_logical_op: AbstractAllToAll = self._op_map.pop(down_op) + up_logical_op: AbstractUDFMap = self._op_map.pop(up_op) + + # Fuse transformation functions. + down_transform_fn = down_op.get_transformation_fn() + up_transform_fn = up_op.get_transformation_fn() + + def fused_all_to_all_transform_fn( + blocks: List[RefBundle], ctx: TaskContext + ) -> Tuple[List[RefBundle], StatsDict]: + """To fuse MapOperator->AllToAllOperator, we store the map function + in the TaskContext so that it may be used by the downstream + AllToAllOperator's transform function.""" + ctx.upstream_map_transform_fn = up_transform_fn + return down_transform_fn(blocks, ctx) + + ray_remote_args = down_logical_op._ray_remote_args + # Make the upstream operator's inputs the new, fused operator's inputs. + input_deps = up_op.input_dependencies + assert len(input_deps) == 1 + input_op = input_deps[0] + + op = AllToAllOperator( + fused_all_to_all_transform_fn, + input_op, + name=name, + ) + # Bottom out at the source logical op (e.g. Read()). 
+ input_op = up_logical_op + + if isinstance(down_logical_op, RandomShuffle): + logical_op = RandomShuffle( + input_op, + name=name, + ray_remote_args=ray_remote_args, + ) + elif isinstance(down_logical_op, Repartition): + logical_op = Repartition( + input_op, + num_outputs=down_logical_op._num_outputs, + shuffle=down_logical_op._shuffle, + ) + self._op_map[op] = logical_op + # Return the fused physical operator. + return op + def _are_remote_args_compatible(up_args, down_args): """Check if Ray remote arguments are compatible for merging.""" diff --git a/python/ray/data/_internal/usage.py b/python/ray/data/_internal/logical/util.py similarity index 70% rename from python/ray/data/_internal/usage.py rename to python/ray/data/_internal/logical/util.py index a3b8af4f756a..cc7e0dc40cdb 100644 --- a/python/ray/data/_internal/usage.py +++ b/python/ray/data/_internal/logical/util.py @@ -1,18 +1,15 @@ -from typing import Dict, TYPE_CHECKING +from typing import Dict import json import threading from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.data._internal.logical.interfaces import LogicalOperator +from ray.data._internal.logical.operators.read_operator import Read +from ray.data._internal.logical.operators.write_operator import Write -if TYPE_CHECKING: - from ray.data._internal.logical.interfaces import LogicalOperator - -# Guards the below dicts. -_recording_lock = threading.Lock() # The dictionary for the operator name and count. _recorded_operators = dict() -# The dictionary for the block format name and count. -_recorded_block_formats = dict() +_recorded_operators_lock = threading.Lock() # The white list of operator names allowed to be recorded. 
_op_name_white_list = [ @@ -62,21 +59,12 @@ ] -def record_block_format_usage(block_format: str): - with _recording_lock: - _recorded_block_formats.setdefault(block_format, 0) - _recorded_block_formats[block_format] += 1 - formats_json_str = json.dumps(_recorded_block_formats) - - record_extra_usage_tag(TagKey.DATA_BLOCK_FORMATS, formats_json_str) - - -def record_operators_usage(op: "LogicalOperator"): +def record_operators_usage(op: LogicalOperator): """Record logical operator usage with Ray telemetry.""" ops_dict = dict() _collect_operators_to_dict(op, ops_dict) ops_json_str = "" - with _recording_lock: + with _recorded_operators_lock: for op, count in ops_dict.items(): _recorded_operators.setdefault(op, 0) _recorded_operators[op] += count @@ -85,11 +73,8 @@ def record_operators_usage(op: "LogicalOperator"): record_extra_usage_tag(TagKey.DATA_LOGICAL_OPS, ops_json_str) -def _collect_operators_to_dict(op: "LogicalOperator", ops_dict: Dict[str, int]): +def _collect_operators_to_dict(op: LogicalOperator, ops_dict: Dict[str, int]): """Collect the logical operator name and count into `ops_dict`.""" - from ray.data._internal.logical.operators.read_operator import Read - from ray.data._internal.logical.operators.write_operator import Write - for child in op.input_dependencies: _collect_operators_to_dict(child, ops_dict) diff --git a/python/ray/data/_internal/memory_tracing.py b/python/ray/data/_internal/memory_tracing.py index 7402218cbca9..f44c648452ad 100644 --- a/python/ray/data/_internal/memory_tracing.py +++ b/python/ray/data/_internal/memory_tracing.py @@ -1,4 +1,4 @@ -"""Utility for debugging object store memory eager deletion in Datastreams. +"""Utility for debugging object store memory eager deletion in Datasets. NOTE: the performance overhead of tracing object allocation is fairly substantial. This is meant to use in unit test for debugging. 
Please do not enable in production, diff --git a/python/ray/data/_internal/null_aggregate.py b/python/ray/data/_internal/null_aggregate.py index dc1f969ea2c5..3a202c1710ed 100644 --- a/python/ray/data/_internal/null_aggregate.py +++ b/python/ray/data/_internal/null_aggregate.py @@ -15,7 +15,7 @@ # aggregation of non-null values. # 2. Mix of values and nulls - ignore_nulls=False: Return None. # 3. All nulls: Return None. -# 4. Empty datastream: Return None. +# 4. Empty dataset: Return None. # # This is accomplished by checking rows for null values and by propagating nulls # if found AND if we're not ignoring them. If not ignoring nulls, in order to delineate @@ -190,9 +190,9 @@ def _accum(a: WrappedAggType, r: T) -> WrappedAggType: def _null_wrap_accumulate_block( ignore_nulls: bool, - accum_block: Callable[[AggType, Block[T]], AggType], + accum_block: Callable[[AggType, Block], AggType], null_merge: Callable[[WrappedAggType, WrappedAggType], WrappedAggType], -) -> Callable[[WrappedAggType, Block[T]], WrappedAggType]: +) -> Callable[[WrappedAggType, Block], WrappedAggType]: """ Wrap vectorized aggregate function with null handling. @@ -212,7 +212,7 @@ def _null_wrap_accumulate_block( A new vectorized aggregate function that handles nulls. 
""" - def _accum_block_null(a: WrappedAggType, block: Block[T]) -> WrappedAggType: + def _accum_block_null(a: WrappedAggType, block: Block) -> WrappedAggType: ret = accum_block(block) if ret is not None: ret = _wrap_acc(ret, has_data=True) diff --git a/python/ray/data/_internal/numpy_support.py b/python/ray/data/_internal/numpy_support.py new file mode 100644 index 000000000000..69cb09aa6be9 --- /dev/null +++ b/python/ray/data/_internal/numpy_support.py @@ -0,0 +1,87 @@ +from typing import Any + +import numpy as np + +import ray +from ray.data._internal.dataset_logger import DatasetLogger +from ray.data._internal.util import _truncated_repr +from ray.air.util.tensor_extensions.utils import create_ragged_ndarray + +logger = DatasetLogger(__name__) + + +def is_array_like(value: Any) -> bool: + """Checks whether objects are array-like, excluding numpy scalars.""" + + return hasattr(value, "__array__") and hasattr(value, "__len__") + + +def is_valid_udf_return(udf_return_col: Any) -> bool: + """Check whether a UDF column is valid. + + Valid columns must either be a list of elements, or an array-like object. + """ + + return isinstance(udf_return_col, list) or is_array_like(udf_return_col) + + +def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: + """Convert UDF columns (output of map_batches) to numpy, if possible. + + This includes lists of scalars, objects supporting the array protocol, and lists + of objects supporting the array protocol, such as `[1, 2, 3]`, `Tensor([1, 2, 3])`, + and `[array(1), array(2), array(3)]`. + + Returns: + The input as an np.ndarray if possible, otherwise the original input. + + Raises: + ValueError if an input was array-like but we failed to convert it to an array. + """ + + if isinstance(udf_return_col, np.ndarray): + # No copy/conversion needed, just keep it verbatim. + return udf_return_col + + ctx = ray.data.DataContext.get_current() + if not ctx.strict_mode: + # Legacy compat. 
+ return np.array(udf_return_col) + + if isinstance(udf_return_col, list): + # Try to convert list values into an numpy array via + # np.array(), so users don't need to manually cast. + # NOTE: we don't cast generic iterables, since types like + # `str` are also Iterable. + try: + # Try to cast the inner scalars to numpy as well, to avoid unnecessarily + # creating an inefficient array of array of object dtype. + if all(is_valid_udf_return(e) for e in udf_return_col): + udf_return_col = [np.array(e) for e in udf_return_col] + shapes = set() + for e in udf_return_col: + if isinstance(e, np.ndarray): + shapes.add((e.dtype, e.shape)) + else: + shapes.add(type(e)) + if len(shapes) > 1: + # This util works around some limitations of np.array(dtype=object). + udf_return_col = create_ragged_ndarray(udf_return_col) + else: + udf_return_col = np.array(udf_return_col) + except Exception as e: + raise ValueError( + "Failed to convert column values to numpy array: " + f"({_truncated_repr(udf_return_col)}): {e}." + ) + elif hasattr(udf_return_col, "__array__"): + # Converts other array-like objects such as torch.Tensor. + try: + udf_return_col = np.array(udf_return_col) + except Exception as e: + raise ValueError( + "Failed to convert column values to numpy array: " + f"({_truncated_repr(udf_return_col)}): {e}." + ) + + return udf_return_col diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index d6c1bc1b807c..1c0efae931bf 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -21,7 +21,6 @@ BlockAccessor, BlockMetadata, BlockExecStats, - KeyFn, KeyType, U, ) @@ -54,7 +53,7 @@ def lazy_import_pandas(): class PandasRow(TableRow): """ - Row of a tabular Datastream backed by a Pandas DataFrame block. + Row of a tabular Dataset backed by a Pandas DataFrame block. 
""" def __getitem__(self, key: str) -> Any: @@ -84,7 +83,7 @@ def __len__(self): return self._row.shape[1] -class PandasBlockBuilder(TableBlockBuilder[T]): +class PandasBlockBuilder(TableBlockBuilder): def __init__(self): pandas = lazy_import_pandas() super().__init__(pandas.DataFrame) @@ -167,7 +166,7 @@ def take(self, indices: List[int]) -> "pandas.DataFrame": table.reset_index(drop=True, inplace=True) return table - def select(self, columns: List[KeyFn]) -> "pandas.DataFrame": + def select(self, columns: List[str]) -> "pandas.DataFrame": if not all(isinstance(col, str) for col in columns): raise ValueError( "Columns must be a list of column name strings when aggregating on " @@ -186,11 +185,11 @@ def schema(self) -> PandasBlockSchema: names=dtypes.index.tolist(), types=dtypes.values.tolist() ) # Column names with non-str types of a pandas DataFrame is not - # supported by Ray Datastream. + # supported by Ray Dataset. if any(not isinstance(name, str) for name in schema.names): raise ValueError( "A Pandas DataFrame with column names of non-str types" - " is not supported by Ray Datastream. Column names of this" + " is not supported by Ray Dataset. Column names of this" f" DataFrame: {schema.names!r}." 
) return schema @@ -264,7 +263,7 @@ def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame": return r @staticmethod - def builder() -> PandasBlockBuilder[T]: + def builder() -> PandasBlockBuilder: return PandasBlockBuilder() @staticmethod @@ -275,7 +274,7 @@ def _sample(self, n_samples: int, key: "SortKeyT") -> "pandas.DataFrame": return self._table[[k[0] for k in key]].sample(n_samples, ignore_index=True) def _apply_agg( - self, agg_fn: Callable[["pandas.Series", bool], U], on: KeyFn + self, agg_fn: Callable[["pandas.Series", bool], U], on: str ) -> Optional[U]: """Helper providing null handling around applying an aggregation to a column.""" pd = lazy_import_pandas() @@ -303,10 +302,10 @@ def _apply_agg( return None return val - def count(self, on: KeyFn) -> Optional[U]: + def count(self, on: str) -> Optional[U]: return self._apply_agg(lambda col: col.count(), on) - def sum(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def sum(self, on: str, ignore_nulls: bool) -> Optional[U]: pd = lazy_import_pandas() if on is not None and not isinstance(on, str): raise ValueError( @@ -328,18 +327,18 @@ def sum(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: return None return val - def min(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def min(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_agg(lambda col: col.min(skipna=ignore_nulls), on) - def max(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def max(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_agg(lambda col: col.max(skipna=ignore_nulls), on) - def mean(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def mean(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_agg(lambda col: col.mean(skipna=ignore_nulls), on) def sum_of_squared_diffs_from_mean( self, - on: KeyFn, + on: str, ignore_nulls: bool, mean: Optional[U] = None, ) -> Optional[U]: @@ -352,7 +351,7 @@ def sum_of_squared_diffs_from_mean( def sort_and_partition( self, 
boundaries: List[T], key: "SortKeyT", descending: bool - ) -> List[Block[T]]: + ) -> List[Block]: if len(key) > 1: raise NotImplementedError( "sorting by multiple columns is not supported yet" @@ -389,7 +388,7 @@ def sort_and_partition( partitions.append(table[last_idx:]) return partitions - def combine(self, key: KeyFn, aggs: Tuple[AggregateFn]) -> "pandas.DataFrame": + def combine(self, key: str, aggs: Tuple[AggregateFn]) -> "pandas.DataFrame": """Combine rows with the same key into an accumulator. This assumes the block is already sorted by key in ascending order. @@ -418,7 +417,7 @@ def iter_groups() -> Iterator[Tuple[KeyType, Block]]: return start = end = 0 - iter = self.iter_rows() + iter = self.iter_rows(public_row_format=False) next_row = None while True: try: @@ -464,7 +463,7 @@ def iter_groups() -> Iterator[Tuple[KeyType, Block]]: @staticmethod def merge_sorted_blocks( - blocks: List[Block[T]], key: "SortKeyT", _descending: bool + blocks: List[Block], key: "SortKeyT", _descending: bool ) -> Tuple["pandas.DataFrame", BlockMetadata]: pd = lazy_import_pandas() stats = BlockExecStats.builder() @@ -481,7 +480,7 @@ def merge_sorted_blocks( @staticmethod def aggregate_combined_blocks( blocks: List["pandas.DataFrame"], - key: KeyFn, + key: str, aggs: Tuple[AggregateFn], finalize: bool, ) -> Tuple["pandas.DataFrame", BlockMetadata]: @@ -509,7 +508,11 @@ def aggregate_combined_blocks( key_fn = (lambda r: r[r._row.columns[0]]) if key is not None else (lambda r: 0) iter = heapq.merge( - *[PandasBlockAccessor(block).iter_rows() for block in blocks], key=key_fn + *[ + PandasBlockAccessor(block).iter_rows(public_row_format=False) + for block in blocks + ], + key=key_fn, ) next_row = None builder = PandasBlockBuilder() diff --git a/python/ray/data/_internal/pipeline_executor.py b/python/ray/data/_internal/pipeline_executor.py index b44b65b28181..fef5c0f4a020 100644 --- a/python/ray/data/_internal/pipeline_executor.py +++ b/python/ray/data/_internal/pipeline_executor.py 
@@ -1,12 +1,11 @@ -from typing import Any, Callable, List, Optional, TYPE_CHECKING +from typing import Callable, List, Optional, TYPE_CHECKING import time import concurrent.futures import logging import ray -from ray.data.block import T from ray.data.context import DataContext -from ray.data.datastream import Datastream +from ray.data.dataset import Dataset from ray.data._internal.progress_bar import ProgressBar from ray.data._internal import progress_bar @@ -16,7 +15,7 @@ from ray.data.dataset_pipeline import DatasetPipeline -def pipeline_stage(fn: Callable[[], Datastream[T]]) -> Datastream[T]: +def pipeline_stage(fn: Callable[[], Dataset]) -> Dataset: # Force eager evaluation of all blocks in the pipeline stage. This # prevents resource deadlocks due to overlapping stage execution (e.g., # task -> actor stage). @@ -24,9 +23,9 @@ def pipeline_stage(fn: Callable[[], Datastream[T]]) -> Datastream[T]: class PipelineExecutor: - def __init__(self, pipeline: "DatasetPipeline[T]"): - self._pipeline: "DatasetPipeline[T]" = pipeline - self._stages: List[concurrent.futures.Future[Datastream[Any]]] = [None] * ( + def __init__(self, pipeline: "DatasetPipeline"): + self._pipeline: "DatasetPipeline" = pipeline + self._stages: List[concurrent.futures.Future[Dataset]] = [None] * ( len(self._pipeline._optimized_stages) + 1 ) self._iter = iter(self._pipeline._base_iterable) @@ -160,9 +159,9 @@ def __next__(self): class PipelineSplitExecutorCoordinator: def __init__( self, - pipeline: "DatasetPipeline[T]", + pipeline: "DatasetPipeline", n: int, - splitter: Callable[[Datastream], List["Datastream[T]"]], + splitter: Callable[[Dataset], List["Dataset"]], context: DataContext, ): DataContext._set_current(context) @@ -172,17 +171,17 @@ def __init__( self.splitter = splitter self.cur_splits = [None] * self.n - def next_datastream_if_ready(self, split_index: int) -> Optional[Datastream[T]]: + def next_dataset_if_ready(self, split_index: int) -> Optional[Dataset]: # TODO(swang): This will 
hang if one of the consumers fails and is # re-executed from the beginning. To make this fault-tolerant, we need - # to make next_datastream_if_ready idempotent. - # Pull the next datastream once all splits are fully consumed. + # to make next_dataset_if_ready idempotent. + # Pull the next dataset once all splits are fully consumed. if all(s is None for s in self.cur_splits): ds = next(self.executor) self.cur_splits = self.splitter(ds) assert len(self.cur_splits) == self.n, (self.cur_splits, self.n) - # Return the datastream at the split index once per split. + # Return the dataset at the split index once per split. ret = self.cur_splits[split_index] self.cur_splits[split_index] = None return ret diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py index d10be2363225..235247636973 100644 --- a/python/ray/data/_internal/plan.py +++ b/python/ray/data/_internal/plan.py @@ -22,18 +22,19 @@ from ray.types import ObjectRef from ray.data._internal.block_list import BlockList from ray.data._internal.compute import ( - UDF, + UserDefinedFunction, ActorPoolStrategy, + TaskPoolStrategy, BlockTransform, CallableClass, ComputeStrategy, get_compute, is_task_compute, ) -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.lazy_block_list import LazyBlockList -from ray.data._internal.stats import DatastreamStats, DatastreamStatsSummary +from ray.data._internal.stats import DatasetStats, DatasetStatsSummary from ray.data.block import Block from ray.data.context import DataContext from ray.util.debug import log_once @@ -47,11 +48,11 @@ INHERITABLE_REMOTE_ARGS = ["scheduling_strategy"] -logger = DatastreamLogger(__name__) +logger = DatasetLogger(__name__) class Stage: - """Represents a Datastream transform stage (e.g., map or shuffle).""" + """Represents a Dataset transform stage (e.g., map or 
shuffle).""" def __init__(self, name: str, num_blocks: Optional[int]): self.name = name @@ -79,7 +80,7 @@ def __str__(self): class ExecutionPlan: - """A lazy execution plan for a Datastream.""" + """A lazy execution plan for a Dataset.""" # Implementation Notes: # @@ -100,8 +101,8 @@ class ExecutionPlan: def __init__( self, in_blocks: BlockList, - stats: DatastreamStats, - datastream_uuid=None, + stats: DatasetStats, + dataset_uuid=None, *, run_by_consumer: bool, ): @@ -110,7 +111,7 @@ def __init__( Args: in_blocks: Base list of blocks. stats: Stats for the base blocks. - datastream_uuid: Datastream's UUID. + dataset_uuid: Dataset's UUID. run_by_consumer: Whether this plan is invoked to run by the consumption APIs (e.g. .iter_batches()). """ @@ -125,16 +126,20 @@ def __init__( # Cache of optimized stages. self._last_optimized_stages = None - self._datastream_uuid = datastream_uuid or uuid.uuid4().hex - if not stats.datastream_uuid: - stats.datastream_uuid = self._datastream_uuid + self._dataset_uuid = dataset_uuid or uuid.uuid4().hex + if not stats.dataset_uuid: + stats.dataset_uuid = self._dataset_uuid self._run_by_consumer = run_by_consumer + # Snapshot the current context, so that the config of Datasets is always + # determined by the config at the time it was created. + self._context = copy.deepcopy(DataContext.get_current()) + def __repr__(self) -> str: return ( f"ExecutionPlan(" - f"datastream_uuid={self._datastream_uuid}, " + f"dataset_uuid={self._dataset_uuid}, " f"run_by_consumer={self._run_by_consumer}, " f"in_blocks={self._in_blocks}, " f"stages_before_snapshot={self._stages_before_snapshot}, " @@ -148,7 +153,7 @@ def get_plan_as_string(self, classname: str) -> str: Returns: The string representation of this execution plan. """ - # NOTE: this is used for Datastream.__repr__ to give a user-facing string + # NOTE: this is used for Dataset.__repr__ to give a user-facing string # representation. 
Ideally ExecutionPlan.__repr__ should be replaced with this # method as well. @@ -156,7 +161,7 @@ def get_plan_as_string(self, classname: str) -> str: # cheap. plan_str = "" num_stages = 0 - datastream_blocks = None + dataset_blocks = None if self._stages_after_snapshot: # Get string representation of each stage in reverse order. for stage in self._stages_after_snapshot[::-1]: @@ -181,17 +186,17 @@ def get_plan_as_string(self, classname: str) -> str: schema = self._get_unified_blocks_schema( self._snapshot_blocks, fetch_if_missing=False ) - datastream_blocks = self._snapshot_blocks + dataset_blocks = self._snapshot_blocks else: assert self._in_blocks is not None schema = self._get_unified_blocks_schema( self._in_blocks, fetch_if_missing=False ) - datastream_blocks = self._in_blocks + dataset_blocks = self._in_blocks else: # Get schema of output blocks. schema = self.schema(fetch_if_missing=False) - datastream_blocks = self._snapshot_blocks + dataset_blocks = self._snapshot_blocks if schema is None: schema_str = "Unknown schema" @@ -205,14 +210,14 @@ def get_plan_as_string(self, classname: str) -> str: schema_str.append(f"{n}: {t}") schema_str = ", ".join(schema_str) schema_str = "{" + schema_str + "}" - count = self._get_num_rows_from_blocks_metadata(datastream_blocks) + count = self._get_num_rows_from_blocks_metadata(dataset_blocks) if count is None: count = "?" - if datastream_blocks is None: + if dataset_blocks is None: num_blocks = "?" 
else: - num_blocks = datastream_blocks.initial_num_blocks() - datastream_str = "{}(num_blocks={}, num_rows={}, schema={})".format( + num_blocks = dataset_blocks.initial_num_blocks() + dataset_str = "{}(num_blocks={}, num_rows={}, schema={})".format( classname, num_blocks, count, schema_str ) @@ -221,9 +226,9 @@ def get_plan_as_string(self, classname: str) -> str: MIN_FIELD_LENGTH = 10 INDENT_STR = " " * 3 trailing_space = " " * (max(num_stages, 0) * 3) - if len(datastream_str) > SCHEMA_LINE_CHAR_LIMIT: + if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT: # If the resulting string representation exceeds the line char limit, - # first try breaking up each `Datastream` parameter into its own line + # first try breaking up each `Dataset` parameter into its own line # and check if each line fits within the line limit. We check the # `schema` param's length, since this is likely the longest string. schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}" @@ -253,7 +258,7 @@ def get_plan_as_string(self, classname: str) -> str: schema_str = ( "{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}" ) - datastream_str = ( + dataset_str = ( f"{classname}(" f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks}," f"\n{trailing_space}{INDENT_STR}num_rows={count}," @@ -262,10 +267,10 @@ def get_plan_as_string(self, classname: str) -> str: ) if num_stages == 0: - plan_str = datastream_str + plan_str = dataset_str else: trailing_space = " " * ((num_stages - 1) * 3) - plan_str += f"{trailing_space}+- {datastream_str}" + plan_str += f"{trailing_space}+- {dataset_str}" return plan_str def with_stage(self, stage: "Stage") -> "ExecutionPlan": @@ -320,16 +325,16 @@ def deep_copy(self, preserve_uuid: bool = False) -> "ExecutionPlan": Returns: A deep copy of this execution plan. 
""" - datastream_uuid = None + dataset_uuid = None if preserve_uuid: - datastream_uuid = self._datastream_uuid + dataset_uuid = self._dataset_uuid in_blocks = self._in_blocks if isinstance(in_blocks, BlockList): in_blocks = in_blocks.copy() plan_copy = ExecutionPlan( in_blocks, copy.copy(self._in_stats), - datastream_uuid=datastream_uuid, + dataset_uuid=dataset_uuid, run_by_consumer=self._run_by_consumer, ) if self._snapshot_blocks: @@ -365,7 +370,7 @@ def schema( fetch_if_missing: Whether to execute the plan to fetch the schema. Returns: - The schema of the output datastream. + The schema of the output dataset. """ from ray.data._internal.stage_impl import RandomizeBlocksStage @@ -393,8 +398,8 @@ def schema( return None elif self._in_blocks is not None and self._snapshot_blocks is None: # If the plan only has input blocks, we execute it, so snapshot has output. - # This applies to newly created datastream. For example, initial datastream - # from read, and output datastreams of Datastream.split(). + # This applies to newly created dataset. For example, initial dataset + # from read, and output datasets of Dataset.split(). self.execute() # Snapshot is now guaranteed to be the output of the final stage or None. blocks = self._snapshot_blocks @@ -440,14 +445,14 @@ def meta_count(self) -> Optional[int]: This method will never trigger any computation. Returns: - The number of records of the result Datastream, or None. + The number of records of the result Dataset, or None. """ if self._stages_after_snapshot: return None elif self._in_blocks is not None and self._snapshot_blocks is None: # If the plan only has input blocks, we execute it, so snapshot has output. - # This applies to newly created datastream. For example, initial datastream - # from read, and output datastreams of Datastream.split(). + # This applies to newly created dataset. For example, initial dataset + # from read, and output datasets of Dataset.split(). 
self.execute() # Snapshot is now guaranteed to be the final block or None. return self._get_num_rows_from_blocks_metadata(self._snapshot_blocks) @@ -465,7 +470,7 @@ def execute_to_iterator( force_read: bool = False, ) -> Tuple[ Iterator[Tuple[ObjectRef[Block], BlockMetadata]], - DatastreamStats, + DatasetStats, Optional["Executor"], ]: """Execute this plan, returning an iterator. @@ -482,7 +487,9 @@ def execute_to_iterator( Tuple of iterator over output blocks and the executor. """ - ctx = DataContext.get_current() + # Always used the saved context for execution. + ctx = self._context + if not ctx.use_streaming_executor or self.has_computed_output(): return ( self.execute( @@ -502,7 +509,7 @@ def execute_to_iterator( executor, self, allow_clear_input_blocks=allow_clear_input_blocks, - datastream_uuid=self._datastream_uuid, + dataset_uuid=self._dataset_uuid, ) # Since the generator doesn't run any code until we try to fetch the first # value, force execution of one bundle before we call get_stats(). @@ -529,14 +536,17 @@ def execute( preserve_order: Whether to preserve order in execution. Returns: - The blocks of the output datastream. + The blocks of the output dataset. """ - context = DataContext.get_current() + + # Always used the saved context for execution. + context = self._context + if not ray.available_resources().get("CPU"): if log_once("cpu_warning"): logger.get_logger().warning( "Warning: The Ray cluster currently does not have " - "any available CPUs. The Datastream job will hang unless more CPUs " + "any available CPUs. The Dataset job will hang unless more CPUs " "are freed up. 
A common reason is that cluster resources are " "used by Actors or Tune trials; see the following link " "for more details: " @@ -562,13 +572,13 @@ def execute( executor, self, allow_clear_input_blocks=allow_clear_input_blocks, - datastream_uuid=self._datastream_uuid, + dataset_uuid=self._dataset_uuid, preserve_order=preserve_order, ) # TODO(ekl) we shouldn't need to set this in the future once we move # to a fully lazy execution model, unless .materialize() is used. Th # reason we need it right now is since the user may iterate over a - # Datastream multiple times after fully executing it once. + # Dataset multiple times after fully executing it once. if not self._run_by_consumer: blocks._owned_by_consumer = False stats = executor.get_stats() @@ -597,7 +607,7 @@ def execute( stats = stats_builder.build_multistage(stage_info) else: stats = stats_builder.build(blocks) - stats.datastream_uuid = self._datastream_uuid + stats.dataset_uuid = self._dataset_uuid stats_summary_string = stats.to_summary().to_string( include_parent=False, ) @@ -608,7 +618,7 @@ def execute( # Set the snapshot to the output of the final stage. self._snapshot_blocks = blocks self._snapshot_stats = stats - self._snapshot_stats.datastream_uuid = self._datastream_uuid + self._snapshot_stats.dataset_uuid = self._dataset_uuid self._stages_before_snapshot += self._stages_after_snapshot self._stages_after_snapshot = [] if _is_lazy(self._snapshot_blocks) and force_read: @@ -633,16 +643,16 @@ def _clear_snapshot(self) -> None: ) self._stages_before_snapshot = [] - def stats(self) -> DatastreamStats: + def stats(self) -> DatasetStats: """Return stats for this plan. If the plan isn't executed, an empty stats object will be returned. 
""" if not self._snapshot_stats: - return DatastreamStats(stages={}, parent=None) + return DatasetStats(stages={}, parent=None) return self._snapshot_stats - def stats_summary(self) -> DatastreamStatsSummary: + def stats_summary(self) -> DatasetStatsSummary: return self.stats().to_summary() def _should_clear_input_blocks( @@ -667,11 +677,11 @@ def _should_clear_input_blocks( # execution plan, so we don't clear these. return False - def _optimize(self) -> Tuple[BlockList, DatastreamStats, List[Stage]]: + def _optimize(self) -> Tuple[BlockList, DatasetStats, List[Stage]]: """Apply stage fusion optimizations, returning an updated source block list and associated stats, and a set of optimized stages. """ - context = DataContext.get_current() + context = self._context blocks, stats, stages = self._get_source_blocks_and_stages() if context.optimize_reorder_stages: stages = _reorder_stages(stages) @@ -680,7 +690,7 @@ def _optimize(self) -> Tuple[BlockList, DatastreamStats, List[Stage]]: # If using a lazy datasource, rewrite read stage into one-to-one stage # so it can be fused into downstream stages. blocks, stats, stages = _rewrite_read_stages( - blocks, stats, stages, self._datastream_uuid + blocks, stats, stages, self._dataset_uuid ) stages = _fuse_one_to_one_stages(stages) self._last_optimized_stages = stages @@ -688,7 +698,7 @@ def _optimize(self) -> Tuple[BlockList, DatastreamStats, List[Stage]]: def _get_source_blocks_and_stages( self, - ) -> Tuple[BlockList, DatastreamStats, List[Stage]]: + ) -> Tuple[BlockList, DatasetStats, List[Stage]]: """Get the source blocks, corresponding stats, and the stages for plan execution. 
@@ -727,7 +737,7 @@ def is_read_stage_equivalent(self) -> bool: """Return whether this plan can be executed as only a read stage.""" from ray.data._internal.stage_impl import RandomizeBlocksStage - context = DataContext.get_current() + context = self._context remaining_stages = self._stages_after_snapshot if ( context.optimize_fuse_stages @@ -763,7 +773,7 @@ def _run_with_new_execution_backend(self) -> bool: # - Read only: handle with legacy backend # - Read->randomize_block_order: handle with new backend # Note that both are considered read equivalent, hence this extra check. - context = DataContext.get_current() + context = self._context trailing_randomize_block_order_stage = ( self._stages_after_snapshot and len(self._stages_after_snapshot) == 1 @@ -874,7 +884,7 @@ def __init__( compute: Union[str, ComputeStrategy], ray_remote_args: dict, target_block_size: Optional[int] = None, - fn: Optional[UDF] = None, + fn: Optional[UserDefinedFunction] = None, fn_args: Optional[Iterable[Any]] = None, fn_kwargs: Optional[Dict[str, Any]] = None, fn_constructor_args: Optional[Iterable[Any]] = None, @@ -882,7 +892,7 @@ def __init__( ): super().__init__(name, None) self.block_fn = block_fn - self.compute = compute or "tasks" + self.compute = compute or TaskPoolStrategy() self.ray_remote_args = ray_remote_args or {} self.target_block_size = target_block_size self.fn = fn @@ -962,7 +972,7 @@ def fuse(self, prev: Stage): def block_fn( blocks: Iterable[Block], ctx: TaskContext, - fn: UDF, + fn: UserDefinedFunction, *fn_args, **fn_kwargs, ) -> Iterable[Block]: @@ -1127,20 +1137,20 @@ def __call__( def _rewrite_read_stages( blocks: BlockList, - stats: DatastreamStats, + stats: DatasetStats, stages: List[Stage], - datastream_uuid: str, -) -> Tuple[BlockList, DatastreamStats, List[Stage]]: + dataset_uuid: str, +) -> Tuple[BlockList, DatasetStats, List[Stage]]: """Rewrites read stages into one-to-one stages, if needed.""" if _is_lazy(blocks) and stages: blocks, stats, stages = 
_rewrite_read_stage(blocks, stages) - stats.datastream_uuid = datastream_uuid + stats.dataset_uuid = dataset_uuid return blocks, stats, stages def _rewrite_read_stage( in_blocks: LazyBlockList, stages: List[Stage] -) -> Tuple[BlockList, DatastreamStats, List[Stage]]: +) -> Tuple[BlockList, DatasetStats, List[Stage]]: """Rewrite the read stage to a OneToOne stage over read tasks as input. For example, suppose the plan was [Read -> MapBatches(Fn)]. These stages cannot @@ -1192,10 +1202,10 @@ def block_fn( stage = OneToOneStage( name, block_fn, - "tasks", + TaskPoolStrategy(), remote_args, ) - stats = DatastreamStats(stages={}, parent=None) + stats = DatasetStats(stages={}, parent=None) stages.insert(0, stage) return block_list, stats, stages diff --git a/python/ray/data/_internal/planner/aggregate.py b/python/ray/data/_internal/planner/aggregate.py index bea062434b90..5b434a18843d 100644 --- a/python/ray/data/_internal/planner/aggregate.py +++ b/python/ray/data/_internal/planner/aggregate.py @@ -17,13 +17,12 @@ from ray.data._internal.planner.exchange.sort_task_spec import SortTaskSpec from ray.data._internal.stats import StatsDict from ray.data.aggregate import AggregateFn -from ray.data.block import KeyFn from ray.data.context import DataContext from ray.data._internal.util import unify_block_metadata_schema def generate_aggregate_fn( - key: Optional[KeyFn], + key: Optional[str], aggs: List[AggregateFn], ) -> AllToAllTransformFn: """Generate function to aggregate blocks by the specified key column or key diff --git a/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py b/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py index 5bf2f5d05099..d4f9506fa0f1 100644 --- a/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/aggregate_task_spec.py @@ -8,7 +8,6 @@ BlockAccessor, BlockExecStats, BlockMetadata, - KeyFn, KeyType, ) @@ -32,7 +31,7 @@ class 
SortAggregateTaskSpec(ExchangeTaskSpec): def __init__( self, boundaries: List[KeyType], - key: Optional[KeyFn], + key: Optional[str], aggs: List[AggregateFn], ): super().__init__( @@ -46,7 +45,7 @@ def map( block: Block, output_num_blocks: int, boundaries: List[KeyType], - key: Optional[KeyFn], + key: Optional[str], aggs: List[AggregateFn], ) -> List[Union[BlockMetadata, Block]]: stats = BlockExecStats.builder() @@ -69,7 +68,7 @@ def map( @staticmethod def reduce( - key: Optional[KeyFn], + key: Optional[str], aggs: List[AggregateFn], *mapper_outputs: List[Block], partial_reduce: bool = False, @@ -81,7 +80,7 @@ def reduce( @staticmethod def _prune_unused_columns( block: Block, - key: KeyFn, + key: str, aggs: Tuple[AggregateFn], ) -> Block: """Prune unused columns from block before aggregate.""" diff --git a/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py b/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py index 474d69b03279..9611041a299e 100644 --- a/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/shuffle_task_spec.py @@ -4,6 +4,7 @@ import numpy as np from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.execution.interfaces import MapTransformFn from ray.data._internal.planner.exchange.interfaces import ExchangeTaskSpec from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata @@ -19,9 +20,10 @@ def __init__( self, random_shuffle: bool = False, random_seed: Optional[int] = None, + upstream_map_fn: Optional[MapTransformFn] = None, ): super().__init__( - map_args=[random_shuffle, random_seed], + map_args=[upstream_map_fn, random_shuffle, random_seed], reduce_args=[random_shuffle, random_seed], ) @@ -30,11 +32,19 @@ def map( idx: int, block: Block, output_num_blocks: int, + upstream_map_fn: Optional[MapTransformFn], random_shuffle: bool, random_seed: Optional[int], ) -> List[Union[BlockMetadata, Block]]: 
# TODO: Support fusion with other upstream operators. stats = BlockExecStats.builder() + if upstream_map_fn: + mapped_blocks = list(upstream_map_fn([block])) + assert len(mapped_blocks) == 1, ( + "Expected upstream_map_fn to return one block, but instead" + f" returned {len(mapped_blocks)} blocks" + ) + block = mapped_blocks[0] block = BlockAccessor.for_block(block) # Randomize the distribution of records to blocks. diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py index 4fa17cec8588..c6c011fe5585 100644 --- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -104,7 +104,7 @@ def sample_boundaries( sample_bar.close() del sample_results samples = [s for s in samples if len(s) > 0] - # The datastream is empty + # The dataset is empty if len(samples) == 0: return [None] * (num_reducers - 1) builder = DelegatingBlockBuilder() @@ -121,5 +121,5 @@ def sample_boundaries( return ret[1:] -def _sample_block(block: Block[T], n_samples: int, key: SortKeyT) -> Block[T]: +def _sample_block(block: Block, n_samples: int, key: SortKeyT) -> Block: return BlockAccessor.for_block(block).sample(n_samples, key) diff --git a/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py b/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py new file mode 100644 index 000000000000..f0a2075e740d --- /dev/null +++ b/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py @@ -0,0 +1,136 @@ +from typing import Any, Dict, List, Optional, Tuple + +import ray +from ray.data._internal.execution.interfaces import RefBundle +from ray.data._internal.planner.exchange.interfaces import ExchangeTaskScheduler +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import 
_split_at_indices +from ray.data._internal.stats import StatsDict +from ray.data.block import Block, BlockAccessor, BlockMetadata +from ray.types import ObjectRef + + +class SplitRepartitionTaskScheduler(ExchangeTaskScheduler): + """ + The split (non-shuffle) repartition scheduler. + + First, we calculate global splits needed to produce `output_num_blocks` blocks. + After the split blocks are generated accordingly, reduce tasks are scheduled + to combine split blocks together. + """ + + def execute( + self, + refs: List[RefBundle], + output_num_blocks: int, + map_ray_remote_args: Optional[Dict[str, Any]] = None, + reduce_ray_remote_args: Optional[Dict[str, Any]] = None, + ) -> Tuple[List[RefBundle], StatsDict]: + input_num_rows = 0 + input_owned_by_consumer = True + for ref_bundle in refs: + block_num_rows = ref_bundle.num_rows() + if block_num_rows is None: + raise ValueError( + "Cannot split partition on blocks with unknown number of rows." + ) + input_num_rows += block_num_rows + if not ref_bundle.owns_blocks: + input_owned_by_consumer = False + + # Compute the (output_num_blocks-1) indices needed for + # an equal split of the input blocks. 
+ indices = [] + cur_idx = 0 + for _ in range(output_num_blocks - 1): + cur_idx += input_num_rows / output_num_blocks + indices.append(int(cur_idx)) + assert len(indices) < output_num_blocks, (indices, output_num_blocks) + + if map_ray_remote_args is None: + map_ray_remote_args = {} + if reduce_ray_remote_args is None: + reduce_ray_remote_args = {} + if "scheduling_strategy" not in reduce_ray_remote_args: + reduce_ray_remote_args = reduce_ray_remote_args.copy() + reduce_ray_remote_args["scheduling_strategy"] = "SPREAD" + + blocks_with_metadata: List[Tuple[ObjectRef[Block], BlockMetadata]] = [] + for ref_bundle in refs: + blocks_with_metadata.extend(ref_bundle.blocks) + if indices: + split_return = _split_at_indices( + blocks_with_metadata, indices, input_owned_by_consumer + ) + split_block_refs, split_metadata = [], [] + for b, m in zip(*split_return): + split_block_refs.append(b) + split_metadata.extend(m) + else: + split_block_refs, split_metadata = [], [] + for b, m in blocks_with_metadata: + split_block_refs.append([b]) + split_metadata.append(m) + + reduce_bar = ProgressBar("Split Repartition", total=output_num_blocks) + reduce_task = cached_remote_fn(self._exchange_spec.reduce) + reduce_return = [ + reduce_task.options(**reduce_ray_remote_args, num_returns=2).remote( + *self._exchange_spec._reduce_args, + *split_block_refs[j], + ) + for j in range(output_num_blocks) + # Only process splits which contain blocks. + if len(split_block_refs[j]) > 0 + ] + + reduce_block_refs, reduce_metadata = zip(*reduce_return) + reduce_metadata = reduce_bar.fetch_until_complete(list(reduce_metadata)) + reduce_block_refs, reduce_metadata = list(reduce_block_refs), list( + reduce_metadata + ) + reduce_bar.close() + + # Handle empty blocks. 
+ if len(reduce_block_refs) < output_num_blocks: + from ray.data._internal.arrow_block import ArrowBlockBuilder + from ray.data._internal.pandas_block import PandasBlockBuilder + from ray.data._internal.simple_block import SimpleBlockBuilder + + import pyarrow as pa + from ray.data._internal.pandas_block import PandasBlockSchema + + num_empty_blocks = output_num_blocks - len(reduce_block_refs) + first_block_schema = reduce_metadata[0].schema + if first_block_schema is None: + raise ValueError( + "Cannot split partition on blocks with unknown block format." + ) + elif isinstance(first_block_schema, type): + builder = SimpleBlockBuilder() + elif isinstance(first_block_schema, pa.Schema): + builder = ArrowBlockBuilder() + elif isinstance(first_block_schema, PandasBlockSchema): + builder = PandasBlockBuilder() + empty_block = builder.build() + empty_meta = BlockAccessor.for_block(empty_block).get_metadata( + input_files=None, exec_stats=None + ) # No stats for empty block. + empty_block_refs, empty_metadata = zip( + *[(ray.put(empty_block), empty_meta) for _ in range(num_empty_blocks)] + ) + reduce_block_refs.extend(empty_block_refs) + reduce_metadata.extend(empty_metadata) + + output = [] + for block, meta in zip(reduce_block_refs, reduce_metadata): + output.append( + RefBundle([(block, meta)], owns_blocks=input_owned_by_consumer) + ) + stats = { + "split": split_metadata, + "reduce": reduce_metadata, + } + + return (output, stats) diff --git a/python/ray/data/_internal/planner/filter.py b/python/ray/data/_internal/planner/filter.py index afbd22f4e907..8374114de7a1 100644 --- a/python/ray/data/_internal/planner/filter.py +++ b/python/ray/data/_internal/planner/filter.py @@ -1,12 +1,12 @@ from typing import Callable, Iterator from ray.data._internal.execution.interfaces import TaskContext -from ray.data.block import Block, BlockAccessor, RowUDF +from ray.data.block import Block, BlockAccessor, UserDefinedFunction from ray.data.context import DataContext def 
generate_filter_fn() -> Callable[ - [Iterator[Block], TaskContext, RowUDF], Iterator[Block] + [Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block] ]: """Generate function to apply the UDF to each record of blocks, and filter out records that do not satisfy the given predicate. @@ -15,13 +15,13 @@ def generate_filter_fn() -> Callable[ context = DataContext.get_current() def fn( - blocks: Iterator[Block], ctx: TaskContext, row_fn: RowUDF + blocks: Iterator[Block], ctx: TaskContext, row_fn: UserDefinedFunction ) -> Iterator[Block]: DataContext._set_current(context) for block in blocks: block = BlockAccessor.for_block(block) builder = block.builder() - for row in block.iter_rows(): + for row in block.iter_rows(public_row_format=True): if row_fn(row): builder.add(row) # NOTE: this yields an empty block if all rows are filtered out. diff --git a/python/ray/data/_internal/planner/flat_map.py b/python/ray/data/_internal/planner/flat_map.py index c641f83f6b07..d2a09035e48a 100644 --- a/python/ray/data/_internal/planner/flat_map.py +++ b/python/ray/data/_internal/planner/flat_map.py @@ -2,12 +2,12 @@ from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.output_buffer import BlockOutputBuffer -from ray.data.block import Block, BlockAccessor, RowUDF +from ray.data.block import Block, BlockAccessor, UserDefinedFunction from ray.data.context import DataContext def generate_flat_map_fn() -> Callable[ - [Iterator[Block], TaskContext, RowUDF], Iterator[Block] + [Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block] ]: """Generate function to apply the UDF to each record of blocks, and then flatten results. 
@@ -16,13 +16,13 @@ def generate_flat_map_fn() -> Callable[ context = DataContext.get_current() def fn( - blocks: Iterator[Block], ctx: TaskContext, row_fn: RowUDF + blocks: Iterator[Block], ctx: TaskContext, row_fn: UserDefinedFunction ) -> Iterator[Block]: DataContext._set_current(context) output_buffer = BlockOutputBuffer(None, context.target_max_block_size) for block in blocks: block = BlockAccessor.for_block(block) - for row in block.iter_rows(): + for row in block.iter_rows(public_row_format=True): for r2 in row_fn(row): output_buffer.add(r2) if output_buffer.has_next(): diff --git a/python/ray/data/_internal/planner/map_batches.py b/python/ray/data/_internal/planner/map_batches.py index 1a03d3b6e45e..d404b5a59a57 100644 --- a/python/ray/data/_internal/planner/map_batches.py +++ b/python/ray/data/_internal/planner/map_batches.py @@ -5,8 +5,9 @@ from ray.data._internal.block_batching import batch_blocks from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.output_buffer import BlockOutputBuffer +from ray.data._internal.numpy_support import is_valid_udf_return from ray.data._internal.util import _truncated_repr -from ray.data.block import BatchUDF, Block, DataBatch +from ray.data.block import UserDefinedFunction, Block, DataBatch, StrictModeError from ray.data.context import DEFAULT_BATCH_SIZE, DataContext @@ -14,7 +15,7 @@ def generate_map_batches_fn( batch_size: Optional[int] = DEFAULT_BATCH_SIZE, batch_format: Optional[str] = "default", zero_copy_batch: bool = False, -) -> Callable[[Iterator[Block], TaskContext, BatchUDF], Iterator[Block]]: +) -> Callable[[Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block]]: """Generate function to apply the batch UDF to blocks.""" import numpy as np import pandas as pd @@ -24,8 +25,8 @@ def generate_map_batches_fn( def fn( blocks: Iterator[Block], - ctx: TaskContext, - batch_fn: BatchUDF, + task_context: TaskContext, + batch_fn: UserDefinedFunction, *fn_args, **fn_kwargs, ) 
-> Iterator[Block]: @@ -50,17 +51,26 @@ def validate_batch(batch: Block) -> None: "`numpy.ndarray`, `list`, or `dict[str, numpy.ndarray]`." ) + if context.strict_mode and isinstance(batch, list): + raise StrictModeError( + f"Error validating {_truncated_repr(batch)}: " + "Returning a list of objects from `map_batches` is not " + "allowed in Ray 2.5. To return Python objects, " + "wrap them in a named dict field, e.g., " + "return `{'results': objects}` instead of just `objects`." + ) + if isinstance(batch, collections.abc.Mapping): - for key, value in batch.items(): - if not isinstance(value, np.ndarray): + for key, value in list(batch.items()): + if not is_valid_udf_return(value): raise ValueError( f"Error validating {_truncated_repr(batch)}: " "The `fn` you passed to `map_batches` returned a " f"`dict`. `map_batches` expects all `dict` values " - f"to be of type `numpy.ndarray`, but the value " + f"to be `list` or `np.ndarray` type, but the value " f"corresponding to key {key!r} is of type " f"{type(value)}. To fix this issue, convert " - f"the {type(value)} to a `numpy.ndarray`." + f"the {type(value)} to a `np.ndarray`." 
) def process_next_batch(batch: DataBatch) -> Iterator[Block]: diff --git a/python/ray/data/_internal/planner/map_rows.py b/python/ray/data/_internal/planner/map_rows.py index 2c38a669e52f..99405ff5ebf1 100644 --- a/python/ray/data/_internal/planner/map_rows.py +++ b/python/ray/data/_internal/planner/map_rows.py @@ -4,25 +4,25 @@ from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.output_buffer import BlockOutputBuffer from ray.data._internal.util import _truncated_repr -from ray.data.block import Block, BlockAccessor, RowUDF, StrictModeError +from ray.data.block import Block, BlockAccessor, UserDefinedFunction, StrictModeError from ray.data.context import DataContext def generate_map_rows_fn() -> Callable[ - [Iterator[Block], TaskContext, RowUDF], Iterator[Block] + [Iterator[Block], TaskContext, UserDefinedFunction], Iterator[Block] ]: """Generate function to apply the UDF to each record of blocks.""" context = DataContext.get_current() def fn( - blocks: Iterator[Block], ctx: TaskContext, row_fn: RowUDF + blocks: Iterator[Block], ctx: TaskContext, row_fn: UserDefinedFunction ) -> Iterator[Block]: DataContext._set_current(context) output_buffer = BlockOutputBuffer(None, context.target_max_block_size) for block in blocks: block = BlockAccessor.for_block(block) - for row in block.iter_rows(): + for row in block.iter_rows(public_row_format=True): item = row_fn(row) if context.strict_mode and not isinstance( item, collections.abc.Mapping @@ -30,7 +30,7 @@ def fn( raise StrictModeError( f"Error validating {_truncated_repr(item)}: " "Standalone Python objects are not " - "allowed in strict mode. To return Python objects from map(), " + "allowed in Ray 2.5. To return Python objects from map(), " "wrap them in a dict, e.g., " "return `{'item': item}` instead of just `item`." 
) diff --git a/python/ray/data/_internal/planner/plan_from_items_op.py b/python/ray/data/_internal/planner/plan_from_items_op.py index a0d3a8e62abd..95507501bc02 100644 --- a/python/ray/data/_internal/planner/plan_from_items_op.py +++ b/python/ray/data/_internal/planner/plan_from_items_op.py @@ -1,3 +1,4 @@ +import collections from typing import List import ray @@ -19,6 +20,7 @@ def _plan_from_items_op(op: FromItems) -> PhysicalOperator: """ def get_input_data() -> List[RefBundle]: + ctx = ray.data.DataContext.get_current() if op._parallelism > 0: block_size, remainder = divmod(len(op._items), op._parallelism) else: @@ -34,7 +36,11 @@ def get_input_data() -> List[RefBundle]: block_start = i * block_size + min(i, remainder) block_end = (i + 1) * block_size + min(i + 1, remainder) for j in range(block_start, block_end): - builder.add(op._items[j]) + item = op._items[j] + if ctx.strict_mode: + if not isinstance(item, collections.abc.Mapping): + item = {"item": item} + builder.add(item) block: Block = builder.build() block_metadata: BlockMetadata = BlockAccessor.for_block(block).get_metadata( diff --git a/python/ray/data/_internal/planner/plan_from_numpy_op.py b/python/ray/data/_internal/planner/plan_from_numpy_op.py index c8317b9f4031..969b4c26478b 100644 --- a/python/ray/data/_internal/planner/plan_from_numpy_op.py +++ b/python/ray/data/_internal/planner/plan_from_numpy_op.py @@ -22,11 +22,8 @@ def get_input_data() -> List[RefBundle]: ndarray_to_block_remote = cached_remote_fn(ndarray_to_block, num_returns=2) - ctx = ray.data.DatasetContext.get_current() - res = [ - ndarray_to_block_remote.remote(arr_ref, ctx.strict_mode) - for arr_ref in op._ndarrays - ] + ctx = ray.data.DataContext.get_current() + res = [ndarray_to_block_remote.remote(arr_ref, ctx) for arr_ref in op._ndarrays] blocks, metadata = map(list, zip(*res)) metadata = ray.get(metadata) ref_bundles: List[RefBundle] = [ diff --git a/python/ray/data/_internal/planner/plan_udf_map_op.py 
b/python/ray/data/_internal/planner/plan_udf_map_op.py index 31f931d251f2..e3427443b571 100644 --- a/python/ray/data/_internal/planner/plan_udf_map_op.py +++ b/python/ray/data/_internal/planner/plan_udf_map_op.py @@ -3,7 +3,6 @@ import ray from ray.data._internal.compute import ( ActorPoolStrategy, - TaskPoolStrategy, get_compute, ) from ray.data._internal.execution.interfaces import PhysicalOperator, TaskContext @@ -20,6 +19,7 @@ from ray.data._internal.planner.flat_map import generate_flat_map_fn from ray.data._internal.planner.map_batches import generate_map_batches_fn from ray.data._internal.planner.map_rows import generate_map_rows_fn +from ray.data._internal.util import validate_compute from ray.data.block import Block, CallableClass @@ -47,14 +47,9 @@ def _plan_udf_map_op( raise ValueError(f"Found unknown logical operator during planning: {op}") compute = get_compute(op._compute) + validate_compute(op._fn, compute) if isinstance(op._fn, CallableClass): - if isinstance(compute, TaskPoolStrategy): - raise ValueError( - "``compute`` must be specified when using a callable class, " - "and must specify the actor compute strategy. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." - ) assert isinstance(compute, ActorPoolStrategy) fn_constructor_args = op._fn_constructor_args or () diff --git a/python/ray/data/_internal/planner/random_shuffle.py b/python/ray/data/_internal/planner/random_shuffle.py index 8f22741aa93c..5827c34802d4 100644 --- a/python/ray/data/_internal/planner/random_shuffle.py +++ b/python/ray/data/_internal/planner/random_shuffle.py @@ -2,6 +2,7 @@ from ray.data._internal.execution.interfaces import ( AllToAllTransformFn, + MapTransformFn, RefBundle, TaskContext, ) @@ -28,7 +29,20 @@ def fn( ctx: TaskContext, ) -> Tuple[List[RefBundle], StatsDict]: num_input_blocks = sum(len(r.blocks) for r in refs) - shuffle_spec = ShuffleTaskSpec(random_shuffle=True, random_seed=seed) + + # If map_transform_fn is specified (e.g. 
from fusing + # MapOperator->AllToAllOperator), we pass a map function which + # is applied to each block before shuffling. + map_transform_fn: Optional[MapTransformFn] = ctx.upstream_map_transform_fn + upstream_map_fn = None + if map_transform_fn: + upstream_map_fn = lambda block: map_transform_fn(block, ctx) # noqa: E731 + + shuffle_spec = ShuffleTaskSpec( + random_shuffle=True, + random_seed=seed, + upstream_map_fn=upstream_map_fn, + ) if DataContext.get_current().use_push_based_shuffle: if num_outputs is not None: diff --git a/python/ray/data/_internal/planner/repartition.py b/python/ray/data/_internal/planner/repartition.py index 8f79b8bddc5f..01b3a0226375 100644 --- a/python/ray/data/_internal/planner/repartition.py +++ b/python/ray/data/_internal/planner/repartition.py @@ -1,10 +1,14 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple, TYPE_CHECKING from ray.data._internal.execution.interfaces import ( AllToAllTransformFn, RefBundle, TaskContext, ) + +from ray.data._internal.planner.exchange.split_repartition_task_scheduler import ( + SplitRepartitionTaskScheduler, +) from ray.data._internal.planner.exchange.push_based_shuffle_task_scheduler import ( PushBasedShuffleTaskScheduler, ) @@ -15,20 +19,32 @@ from ray.data._internal.stats import StatsDict from ray.data.context import DataContext +if TYPE_CHECKING: + from python.ray.data._internal.execution.interfaces import MapTransformFn + def generate_repartition_fn( num_outputs: int, shuffle: bool, ) -> AllToAllTransformFn: - """Generate function to randomly shuffle each records of blocks.""" - # TODO: support non-shuffle repartition as _internal/fast_repartition.py. - assert shuffle, "Execution optimizer does not support non-shuffle repartition yet." 
+ """Generate function to partition each records of blocks.""" - def fn( + def shuffle_repartition_fn( refs: List[RefBundle], ctx: TaskContext, ) -> Tuple[List[RefBundle], StatsDict]: - shuffle_spec = ShuffleTaskSpec(random_shuffle=False) + # If map_transform_fn is specified (e.g. from fusing + # MapOperator->AllToAllOperator), we pass a map function which + # is applied to each block before shuffling. + map_transform_fn: Optional["MapTransformFn"] = ctx.upstream_map_transform_fn + upstream_map_fn = None + if map_transform_fn: + upstream_map_fn = lambda block: map_transform_fn(block, ctx) # noqa: E731 + + shuffle_spec = ShuffleTaskSpec( + random_shuffle=False, + upstream_map_fn=upstream_map_fn, + ) if DataContext.get_current().use_push_based_shuffle: scheduler = PushBasedShuffleTaskScheduler(shuffle_spec) @@ -37,4 +53,14 @@ def fn( return scheduler.execute(refs, num_outputs) - return fn + def split_repartition_fn( + refs: List[RefBundle], + ctx: TaskContext, + ) -> Tuple[List[RefBundle], StatsDict]: + shuffle_spec = ShuffleTaskSpec(random_shuffle=False) + scheduler = SplitRepartitionTaskScheduler(shuffle_spec) + return scheduler.execute(refs, num_outputs) + + if shuffle: + return shuffle_repartition_fn + return split_repartition_fn diff --git a/python/ray/data/_internal/planner/write.py b/python/ray/data/_internal/planner/write.py index ca3d63c92a8e..e0ae7d82fbf3 100644 --- a/python/ray/data/_internal/planner/write.py +++ b/python/ray/data/_internal/planner/write.py @@ -8,7 +8,7 @@ def generate_write_fn( datasource: Datasource, **write_args ) -> Callable[[Iterator[Block], TaskContext], Iterator[Block]]: - # If the write op succeeds, the resulting Datastream is a list of + # If the write op succeeds, the resulting Dataset is a list of # WriteResult (one element per write task). Otherwise, an error will # be raised. The Datasource can handle execution outcomes with the # on_write_complete() and on_write_failed(). 
diff --git a/python/ray/data/_internal/progress_bar.py b/python/ray/data/_internal/progress_bar.py index 5cc9c6562544..dd03ed0960f7 100644 --- a/python/ray/data/_internal/progress_bar.py +++ b/python/ray/data/_internal/progress_bar.py @@ -1,8 +1,7 @@ import threading -from typing import Any, List +from typing import Any, List, Optional import ray -from ray._private.ray_constants import env_integer from ray.experimental import tqdm_ray from ray.types import ObjectRef from ray.util.annotations import PublicAPI @@ -15,9 +14,6 @@ tqdm = None needs_warning = True -# Whether progress bars are enabled in this thread. -_enabled = not bool(env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0)) - # Used a signal to cancel execution. _canceled_threads = set() _canceled_threads_lock = threading.Lock() @@ -35,9 +31,11 @@ def set_progress_bars(enabled: bool) -> bool: Returns: Whether progress bars were previously enabled. """ - global _enabled - old_value = _enabled - _enabled = enabled + from ray.data import DataContext + + ctx = DataContext.get_current() + old_value = ctx.enable_progress_bars + ctx.enable_progress_bars = enabled return old_value @@ -45,9 +43,13 @@ class ProgressBar: """Thin wrapper around tqdm to handle soft imports.""" def __init__( - self, name: str, total: int, position: int = 0, enabled: bool = _enabled + self, name: str, total: int, position: int = 0, enabled: Optional[bool] = None ): self._desc = name + if enabled is None: + from ray.data import DataContext + + enabled = DataContext.get_current().enable_progress_bars if not enabled: self._bar = None elif tqdm: @@ -60,9 +62,7 @@ def __init__( else: global needs_warning if needs_warning: - print( - "[datastream]: Run `pip install tqdm` to enable progress reporting." 
- ) + print("[dataset]: Run `pip install tqdm` to enable progress reporting.") needs_warning = False self._bar = None @@ -80,8 +80,15 @@ def fetch_until_complete(self, refs: List[ObjectRef]) -> List[Any]: ref_to_result = {} remaining = refs t = threading.current_thread() + # Triggering fetch_local redundantly for the same object is slower. + # We only need to trigger the fetch_local once for each object, + # raylet will persist these fetch requests even after ray.wait returns. + # See https://github.com/ray-project/ray/issues/30375. + fetch_local = True while remaining: - done, remaining = ray.wait(remaining, fetch_local=True, timeout=0.1) + done, remaining = ray.wait(remaining, fetch_local=fetch_local, timeout=0.1) + if fetch_local: + fetch_local = False for ref, result in zip(done, ray.get(done)): ref_to_result[ref] = result self.update(len(done)) diff --git a/python/ray/data/_internal/remote_fn.py b/python/ray/data/_internal/remote_fn.py index 4a6d93fb0938..077008e0c5aa 100644 --- a/python/ray/data/_internal/remote_fn.py +++ b/python/ray/data/_internal/remote_fn.py @@ -10,7 +10,7 @@ def cached_remote_fn(fn: Any, **ray_remote_args) -> Any: """Lazily defines a ray.remote function. - This is used in Datastreams to avoid circular import issues with ray.remote. + This is used in Datasets to avoid circular import issues with ray.remote. (ray imports ray.data in order to allow ``ray.data.read_foo()`` to work, which means ray.remote cannot be used top-level in ray.data). 
diff --git a/python/ray/data/_internal/simple_block.py b/python/ray/data/_internal/simple_block.py index 92ab9310e60b..57a8923ac629 100644 --- a/python/ray/data/_internal/simple_block.py +++ b/python/ray/data/_internal/simple_block.py @@ -22,13 +22,12 @@ KeyType, AggType, BlockExecStats, - KeyFn, ) from ray.data._internal.block_builder import BlockBuilder from ray.data._internal.size_estimator import SizeEstimator -class SimpleBlockBuilder(BlockBuilder[T]): +class SimpleBlockBuilder(BlockBuilder): def __init__(self): self._items = [] self._size_estimator = SizeEstimator() @@ -68,7 +67,7 @@ def __init__(self, items: List[T]): def num_rows(self) -> int: return len(self._items) - def iter_rows(self) -> Iterator[T]: + def iter_rows(self, public_row_format: bool) -> Iterator[T]: return iter(self._items) def slice(self, start: int, end: int, copy: bool = False) -> List[T]: @@ -80,14 +79,14 @@ def slice(self, start: int, end: int, copy: bool = False) -> List[T]: def take(self, indices: List[int]) -> List[T]: return [self._items[i] for i in indices] - def select(self, columns: List[KeyFn]) -> List[T]: + def select(self, columns: List[str]) -> List[T]: if len(columns) != 1 or not callable(columns[0]): raise ValueError( "Column must be a single callable when selecting on Simple blocks, " f"but got: {columns}." 
) callable_col = columns[0] - return [callable_col(row) for row in self.iter_rows()] + return [callable_col(row) for row in self.iter_rows(True)] def random_shuffle(self, random_seed: Optional[int]) -> List[T]: random = np.random.RandomState(random_seed) @@ -100,9 +99,7 @@ def to_pandas(self) -> "pandas.DataFrame": return pandas.DataFrame({"value": self._items}) - def to_numpy( - self, columns: Optional[Union[KeyFn, List[KeyFn]]] = None - ) -> np.ndarray: + def to_numpy(self, columns: Optional[Union[str, List[str]]] = None) -> np.ndarray: if columns is not None: if not isinstance(columns, list): columns = [columns] @@ -128,7 +125,7 @@ def schema(self) -> Any: else: return None - def zip(self, other: "Block[T]") -> "Block[T]": + def zip(self, other: "Block") -> "Block": if not isinstance(other, list): raise ValueError( "Cannot zip {} with block of type {}".format(type(self), type(other)) @@ -142,7 +139,7 @@ def zip(self, other: "Block[T]") -> "Block[T]": return list(zip(self._items, other)) @staticmethod - def builder() -> SimpleBlockBuilder[T]: + def builder() -> SimpleBlockBuilder: return SimpleBlockBuilder() def sample(self, n_samples: int = 1, key: "SortKeyT" = None) -> List[T]: @@ -157,7 +154,7 @@ def sample(self, n_samples: int = 1, key: "SortKeyT" = None) -> List[T]: return ret return [key(x) for x in ret] - def count(self, on: KeyFn) -> Optional[U]: + def count(self, on: str) -> Optional[U]: if on is not None and not callable(on): raise ValueError( "on must be a callable or None when aggregating on Simple blocks, but " @@ -168,7 +165,7 @@ def count(self, on: KeyFn) -> Optional[U]: return None count = 0 - for r in self.iter_rows(): + for r in self.iter_rows(True): if on is not None: r = on(r) if r is not None: @@ -179,7 +176,7 @@ def _apply_accum( self, init: AggType, accum: Callable[[AggType, T], AggType], - on: KeyFn, + on: str, ignore_nulls: bool, ) -> Optional[U]: """Helper providing null handling around applying an aggregation.""" @@ -194,7 +191,7 @@ def 
_apply_accum( has_data = False a = init - for r in self.iter_rows(): + for r in self.iter_rows(True): if on is not None: r = on(r) if r is None: @@ -207,16 +204,16 @@ def _apply_accum( a = accum(a, r) return a if has_data else None - def sum(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def sum(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_accum(0, lambda a, r: a + r, on, ignore_nulls) - def min(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def min(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_accum(float("inf"), min, on, ignore_nulls) - def max(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def max(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_accum(float("-inf"), max, on, ignore_nulls) - def mean(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def mean(self, on: str, ignore_nulls: bool) -> Optional[U]: return self._apply_accum( [0, 0], lambda a, r: [a[0] + r, a[1] + 1], @@ -224,7 +221,7 @@ def mean(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: ignore_nulls, ) - def std(self, on: KeyFn, ignore_nulls: bool) -> Optional[U]: + def std(self, on: str, ignore_nulls: bool) -> Optional[U]: def accum(a: List[float], r: float) -> List[float]: # Accumulates the current count, the current mean, and the sum of # squared differences from the current mean (M2). 
@@ -240,7 +237,7 @@ def accum(a: List[float], r: float) -> List[float]: def sum_of_squared_diffs_from_mean( self, - on: KeyFn, + on: str, ignore_nulls: bool, mean: Optional[U] = None, ) -> Optional[U]: @@ -256,7 +253,7 @@ def sum_of_squared_diffs_from_mean( def sort_and_partition( self, boundaries: List[T], key: "SortKeyT", descending: bool - ) -> List["Block[T]"]: + ) -> List["Block"]: items = sorted(self._items, key=key, reverse=descending) if len(boundaries) == 0: return [items] @@ -291,9 +288,7 @@ def sort_and_partition( ret.append(items[prev_i:]) return ret - def combine( - self, key: KeyFn, aggs: Tuple[AggregateFn] - ) -> Block[Tuple[KeyType, AggType]]: + def combine(self, key: str, aggs: Tuple[AggregateFn]) -> Block: """Combine rows with the same key into an accumulator. This assumes the block is already sorted by key in ascending order. @@ -323,7 +318,7 @@ def iter_groups() -> Iterator[Tuple[KeyType, Block]]: return start = end = 0 - iter = self.iter_rows() + iter = self.iter_rows(True) next_row = None # Use a bool to indicate if next_row is valid # instead of checking if next_row is None @@ -364,8 +359,8 @@ def iter_groups() -> Iterator[Tuple[KeyType, Block]]: @staticmethod def merge_sorted_blocks( - blocks: List[Block[T]], key: "SortKeyT", descending: bool - ) -> Tuple[Block[T], BlockMetadata]: + blocks: List[Block], key: "SortKeyT", descending: bool + ) -> Tuple[Block, BlockMetadata]: stats = BlockExecStats.builder() ret = [x for block in blocks for x in block] ret.sort(key=key, reverse=descending) @@ -375,11 +370,11 @@ def merge_sorted_blocks( @staticmethod def aggregate_combined_blocks( - blocks: List[Block[Tuple[KeyType, AggType]]], - key: KeyFn, + blocks: List[Block], + key: str, aggs: Tuple[AggregateFn], finalize: bool, - ) -> Tuple[Block[Tuple[KeyType, Union[U, AggType]]], BlockMetadata]: + ) -> Tuple[Block, BlockMetadata]: """Aggregate sorted, partially combined blocks with the same key range. 
This assumes blocks are already sorted by key in ascending order, @@ -405,7 +400,8 @@ def aggregate_combined_blocks( key_fn = (lambda r: r[0]) if key else (lambda r: 0) iter = heapq.merge( - *[SimpleBlockAccessor(block).iter_rows() for block in blocks], key=key_fn + *[SimpleBlockAccessor(block).iter_rows(True) for block in blocks], + key=key_fn, ) next_row = None ret = [] diff --git a/python/ray/data/_internal/sort.py b/python/ray/data/_internal/sort.py index f31ea9a6bb7c..4699ecd0d0a2 100644 --- a/python/ray/data/_internal/sort.py +++ b/python/ray/data/_internal/sort.py @@ -14,7 +14,7 @@ Merging: a merge task would receive a block from every worker that consists of items in a certain range. It then merges the sorted blocks into one sorted -block and becomes part of the new, sorted datastream. +block and becomes part of the new, sorted dataset. """ from typing import Any, Callable, List, Optional, Tuple, TypeVar, Union @@ -113,7 +113,7 @@ def sample_boundaries( sample_bar.close() del sample_results samples = [s for s in samples if len(s) > 0] - # The datastream is empty + # The dataset is empty if len(samples) == 0: return [None] * (num_reducers - 1) builder = DelegatingBlockBuilder() @@ -173,5 +173,5 @@ def sort_impl( ) -def _sample_block(block: Block[T], n_samples: int, key: SortKeyT) -> Block[T]: +def _sample_block(block: Block, n_samples: int, key: SortKeyT) -> Block: return BlockAccessor.for_block(block).sample(n_samples, key) diff --git a/python/ray/data/_internal/stage_impl.py b/python/ray/data/_internal/stage_impl.py index 4a89454846c8..73f6ee454e6f 100644 --- a/python/ray/data/_internal/stage_impl.py +++ b/python/ray/data/_internal/stage_impl.py @@ -8,7 +8,10 @@ PushBasedShufflePartitionOp, SimpleShufflePartitionOp, ) -from ray.data._internal.split import _split_at_indices +from ray.data._internal.split import ( + _split_at_index, + _split_at_indices, +) from ray.data._internal.block_list import BlockList from ray.data._internal.delegating_block_builder 
import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext @@ -19,18 +22,17 @@ _validate_key_fn, Block, BlockPartition, - KeyFn, BlockMetadata, BlockAccessor, BlockExecStats, ) if TYPE_CHECKING: - from ray.data import Datastream + from ray.data import Dataset class RepartitionStage(AllToAllStage): - """Implementation of `Datastream.repartition()`.""" + """Implementation of `Dataset.repartition()`.""" def __init__(self, num_blocks: int, shuffle: bool): if shuffle: @@ -94,7 +96,7 @@ def do_fast_repartition( class RandomizeBlocksStage(AllToAllStage): - """Implementation of `Datastream.randomize_blocks()`.""" + """Implementation of `Dataset.randomize_blocks()`.""" def __init__(self, seed: Optional[int]): self._seed = seed @@ -110,7 +112,7 @@ def do_randomize(self, block_list, *_): class RandomShuffleStage(AllToAllStage): - """Implementation of `Datastream.random_shuffle()`.""" + """Implementation of `Dataset.random_shuffle()`.""" def __init__( self, @@ -165,11 +167,11 @@ def do_shuffle( class ZipStage(AllToAllStage): - """Implementation of `Datastream.zip()`.""" + """Implementation of `Dataset.zip()`.""" - def __init__(self, other: "Datastream"): + def __init__(self, other: "Dataset"): def do_zip_all(block_list: BlockList, clear_input_blocks: bool, *_): - # Repartition other to align with the base datastream, and then zip together + # Repartition other to align with the base dataset, and then zip together # the blocks in parallel. # TODO(Clark): Port this to a streaming zip, e.g. push block pairs through # an actor that buffers and zips. @@ -188,7 +190,7 @@ def do_zip_all(block_list: BlockList, clear_input_blocks: bool, *_): ) inverted = False if sum(other_block_bytes) > sum(base_block_bytes): - # Make sure that other is the smaller datastream, so we minimize + # Make sure that other is the smaller dataset, so we minimize # splitting work when aligning other with base. 
# TODO(Clark): Improve this heuristic for minimizing splitting work, # e.g. by generating the splitting plans for each route (via @@ -205,14 +207,14 @@ def do_zip_all(block_list: BlockList, clear_input_blocks: bool, *_): indices = list(itertools.accumulate(base_block_rows)) indices.pop(-1) - # Check that each datastream has the same number of rows. + # Check that each dataset has the same number of rows. # TODO(Clark): Support different number of rows via user-directed # dropping/padding. total_base_rows = sum(base_block_rows) total_other_rows = sum(other_block_rows) if total_base_rows != total_other_rows: raise ValueError( - "Cannot zip datastreams of different number of rows: " + "Cannot zip datasets of different number of rows: " f"{total_base_rows}, {total_other_rows}" ) @@ -311,16 +313,16 @@ def _do_zip( class SortStage(AllToAllStage): - """Implementation of `Datastream.sort()`.""" + """Implementation of `Dataset.sort()`.""" - def __init__(self, ds: "Datastream", key: Optional[KeyFn], descending: bool): + def __init__(self, ds: "Dataset", key: Optional[str], descending: bool): def do_sort( block_list, ctx: TaskContext, clear_input_blocks: bool, *_, ): - # Handle empty datastream. + # Handle empty dataset. 
if block_list.initial_num_blocks() == 0: return block_list, {} if clear_input_blocks: @@ -344,3 +346,41 @@ def do_sort( do_sort, sub_stage_names=["SortSample", "ShuffleMap", "ShuffleReduce"], ) + + +class LimitStage(AllToAllStage): + """Implementation of `Dataset.limit()`.""" + + def __init__(self, limit: int): + self._limit = limit + super().__init__( + "Limit", + None, + self._do_limit, + ) + + @property + def limit(self) -> int: + return self._limit + + def _do_limit( + self, + input_block_list: BlockList, + clear_input_blocks: bool, + *_, + ): + if clear_input_blocks: + block_list = input_block_list.copy() + input_block_list.clear() + else: + block_list = input_block_list + block_list = block_list.truncate_by_rows(self._limit) + blocks, metadata, _, _ = _split_at_index(block_list, self._limit) + return ( + BlockList( + blocks, + metadata, + owned_by_consumer=block_list._owned_by_consumer, + ), + {}, + ) diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py index c0621814a8a6..94bcfee10f4d 100644 --- a/python/ray/data/_internal/stats.py +++ b/python/ray/data/_internal/stats.py @@ -14,8 +14,8 @@ from ray.util.annotations import DeveloperAPI from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -STATS_ACTOR_NAME = "datastreams_stats_actor" -STATS_ACTOR_NAMESPACE = "_datastream_stats_actor" +STATS_ACTOR_NAME = "datasets_stats_actor" +STATS_ACTOR_NAMESPACE = "_dataset_stats_actor" StatsDict = Dict[str, List[BlockMetadata]] @@ -76,24 +76,24 @@ def avg(self) -> float: return self._value / self._total_count if self._total_count else float("inf") -class _DatastreamStatsBuilder: - """Helper class for building datastream stats. +class _DatasetStatsBuilder: + """Helper class for building dataset stats. When this class is created, we record the start time. 
When build() is - called with the final blocks of the new datastream, the time delta is + called with the final blocks of the new dataset, the time delta is saved as part of the stats.""" def __init__( self, stage_name: str, - parent: "DatastreamStats", + parent: "DatasetStats", override_start_time: Optional[float], ): self.stage_name = stage_name self.parent = parent self.start_time = override_start_time or time.perf_counter() - def build_multistage(self, stages: StatsDict) -> "DatastreamStats": + def build_multistage(self, stages: StatsDict) -> "DatasetStats": stage_infos = {} for i, (k, v) in enumerate(stages.items()): capped_k = capfirst(k) @@ -104,7 +104,7 @@ def build_multistage(self, stages: StatsDict) -> "DatastreamStats": stage_infos[self.stage_name.split("->")[-1] + capped_k] = v else: stage_infos[self.stage_name] = v - stats = DatastreamStats( + stats = DatasetStats( stages=stage_infos, parent=self.parent, base_name=self.stage_name, @@ -112,8 +112,8 @@ def build_multistage(self, stages: StatsDict) -> "DatastreamStats": stats.time_total_s = time.perf_counter() - self.start_time return stats - def build(self, final_blocks: BlockList) -> "DatastreamStats": - stats = DatastreamStats( + def build(self, final_blocks: BlockList) -> "DatasetStats": + stats = DatasetStats( stages={self.stage_name: final_blocks.get_metadata()}, parent=self.parent, ) @@ -125,7 +125,7 @@ def build(self, final_blocks: BlockList) -> "DatastreamStats": class _StatsActor: """Actor holding stats for blocks created by LazyBlockList. - This actor is shared across all datastreams created in the same cluster. + This actor is shared across all datasets created in the same cluster. In order to cap memory usage, we set a max number of stats to keep in the actor. When this limit is exceeded, the stats will be garbage collected in FIFO order. @@ -196,31 +196,31 @@ def _get_or_create_stats_actor(): ).remote() -class DatastreamStats: - """Holds the execution times for a given Datastream. 
+class DatasetStats: + """Holds the execution times for a given Dataset. - This object contains a reference to the parent Datastream's stats as well, - but not the Datastream object itself, to allow its blocks to be dropped from + This object contains a reference to the parent Dataset's stats as well, + but not the Dataset object itself, to allow its blocks to be dropped from memory.""" def __init__( self, *, stages: StatsDict, - parent: Union[Optional["DatastreamStats"], List["DatastreamStats"]], + parent: Union[Optional["DatasetStats"], List["DatasetStats"]], needs_stats_actor: bool = False, stats_uuid: str = None, base_name: str = None, ): - """Create datastream stats. + """Create dataset stats. Args: - stages: Dict of stages used to create this Datastream from the + stages: Dict of stages used to create this Dataset from the previous one. Typically one entry, e.g., {"map": [...]}. - parent: Reference to parent Datastream's stats, or a list of parents + parent: Reference to parent Dataset's stats, or a list of parents if there are multiple. - needs_stats_actor: Whether this Datastream's stats needs a stats actor for - stats collection. This is currently only used for Datastreams using a + needs_stats_actor: Whether this Dataset's stats needs a stats actor for + stats collection. This is currently only used for Datasets using a lazy datasource (i.e. a LazyBlockList). stats_uuid: The uuid for the stats, used to fetch the right stats from the stats actor. 
@@ -230,20 +230,20 @@ def __init__( self.stages: StatsDict = stages if parent is not None and not isinstance(parent, list): parent = [parent] - self.parents: List["DatastreamStats"] = parent or [] + self.parents: List["DatasetStats"] = parent or [] self.number: int = ( 0 if not self.parents else max(p.number for p in self.parents) + 1 ) self.base_name = base_name - # TODO(ekl) deprecate and remove the notion of datastream UUID once we move + # TODO(ekl) deprecate and remove the notion of dataset UUID once we move # fully to streaming execution. - self.datastream_uuid: str = "unknown_uuid" + self.dataset_uuid: str = "unknown_uuid" self.time_total_s: float = 0 self.needs_stats_actor = needs_stats_actor self.stats_uuid = stats_uuid self._legacy_iter_batches = False - # Iteration stats, filled out if the user iterates over the datastream. + # Iteration stats, filled out if the user iterates over the dataset. self.iter_wait_s: Timer = Timer() self.iter_get_s: Timer = Timer() self.iter_next_batch_s: Timer = Timer() @@ -270,21 +270,21 @@ def stats_actor(self): def child_builder( self, name: str, override_start_time: Optional[float] = None - ) -> _DatastreamStatsBuilder: + ) -> _DatasetStatsBuilder: """Start recording stats for an op of the given name (e.g., map).""" - return _DatastreamStatsBuilder(name, self, override_start_time) + return _DatasetStatsBuilder(name, self, override_start_time) - def child_TODO(self, name: str) -> "DatastreamStats": + def child_TODO(self, name: str) -> "DatasetStats": """Placeholder for child ops not yet instrumented.""" - return DatastreamStats(stages={name + "_TODO": []}, parent=self) + return DatasetStats(stages={name + "_TODO": []}, parent=self) @staticmethod def TODO(): """Placeholder for ops not yet instrumented.""" - return DatastreamStats(stages={"TODO": []}, parent=None) + return DatasetStats(stages={"TODO": []}, parent=None) - def to_summary(self) -> "DatastreamStatsSummary": - """Generate a `DatastreamStatsSummary` object from 
the given `DatastreamStats` + def to_summary(self) -> "DatasetStatsSummary": + """Generate a `DatasetStatsSummary` object from the given `DatasetStats` object, which can be used to generate a summary string.""" if self.needs_stats_actor: ac = self.stats_actor @@ -330,12 +330,12 @@ def to_summary(self) -> "DatastreamStatsSummary": stats_summary_parents = [] if self.parents is not None: stats_summary_parents = [p.to_summary() for p in self.parents] - return DatastreamStatsSummary( + return DatasetStatsSummary( stages_stats, iter_stats, stats_summary_parents, self.number, - self.datastream_uuid, + self.dataset_uuid, self.time_total_s, self.base_name, self.extra_metrics, @@ -344,12 +344,12 @@ def to_summary(self) -> "DatastreamStatsSummary": @DeveloperAPI @dataclass -class DatastreamStatsSummary: +class DatasetStatsSummary: stages_stats: List["StageStatsSummary"] iter_stats: "IterStatsSummary" - parents: List["DatastreamStatsSummary"] + parents: List["DatasetStatsSummary"] number: int - datastream_uuid: str + dataset_uuid: str time_total_s: float base_name: str extra_metrics: Dict[str, Any] @@ -357,7 +357,7 @@ class DatastreamStatsSummary: def to_string( self, already_printed: Optional[Set[str]] = None, include_parent: bool = True ) -> str: - """Return a human-readable summary of this Datastream's stats. + """Return a human-readable summary of this Dataset's stats. Args: already_printed: Set of stage IDs that have already had its stats printed @@ -365,7 +365,7 @@ def to_string( include_parent: If true, also include parent stats summary; otherwise, only log stats of the latest stage. Returns: - String with summary statistics for executing the Datastream. + String with summary statistics for executing the Dataset. 
""" if already_printed is None: already_printed = set() @@ -380,7 +380,7 @@ def to_string( if len(self.stages_stats) == 1: stage_stats_summary = self.stages_stats[0] stage_name = stage_stats_summary.stage_name - stage_uuid = self.datastream_uuid + stage_name + stage_uuid = self.dataset_uuid + stage_name out += "Stage {} {}: ".format(self.number, stage_name) if stage_uuid in already_printed: out += "[execution cached]\n" @@ -397,7 +397,7 @@ def to_string( ) for n, stage_stats_summary in enumerate(self.stages_stats): stage_name = stage_stats_summary.stage_name - stage_uuid = self.datastream_uuid + stage_name + stage_uuid = self.dataset_uuid + stage_name out += "\n" out += "\tSubstage {} {}: ".format(n, stage_name) if stage_uuid in already_printed: @@ -426,8 +426,8 @@ def __repr__(self, level=0) -> str: parent_stats = f"\n{parent_stats},\n{indent} " if parent_stats else "" extra_metrics = f"\n{extra_metrics}\n{indent} " if extra_metrics else "" return ( - f"{indent}DatastreamStatsSummary(\n" - f"{indent} datastream_uuid={self.datastream_uuid},\n" + f"{indent}DatasetStatsSummary(\n" + f"{indent} dataset_uuid={self.dataset_uuid},\n" f"{indent} base_name={self.base_name},\n" f"{indent} number={self.number},\n" f"{indent} extra_metrics={{{extra_metrics}}},\n" @@ -463,7 +463,7 @@ class StageStatsSummary: # Whether the stage associated with this StageStatsSummary object is a substage is_substage: bool # This is the total walltime of the entire stage, typically obtained from - # `DatastreamStats.time_total_s`. An important distinction is that this is the + # `DatasetStats.time_total_s`. An important distinction is that this is the # overall runtime of the stage, pulled from the stats actor, whereas the # computed walltimes in `self.wall_time` are calculated on a substage level. 
time_total_s: float @@ -730,7 +730,7 @@ class IterStatsSummary: block_time: Timer # Time spent in user code, in seconds user_time: Timer - # Total time taken by Datastream iterator, in seconds + # Total time taken by Dataset iterator, in seconds total_time: Timer # Num of blocks that are in local object store iter_blocks_local: int @@ -755,7 +755,7 @@ def to_string(self) -> str: or self.format_time.get() or self.collate_time.get() ): - out += "\nDatastream iterator time breakdown:\n" + out += "\nDataset iterator time breakdown:\n" if self.block_time.get(): out += "* Total time user code is blocked: {}\n".format( fmt(self.block_time.get()) @@ -822,7 +822,7 @@ def to_string_legacy(self) -> str: or self.format_time.get() or self.get_time.get() ): - out += "\nDatastream iterator time breakdown:\n" + out += "\nDataset iterator time breakdown:\n" out += "* In ray.wait(): {}\n".format(fmt(self.wait_time.get())) out += "* In ray.get(): {}\n".format(fmt(self.get_time.get())) out += "* Num blocks local: {}\n".format(self.iter_blocks_local) @@ -854,16 +854,16 @@ def __repr__(self, level=0) -> str: class DatasetPipelineStats: - """Holds the execution times for a pipeline of Datastreams.""" + """Holds the execution times for a pipeline of Datasets.""" def __init__(self, *, max_history: int = 3): - """Create a datastream pipeline stats object. + """Create a dataset pipeline stats object. Args: - max_history: The max number of datastream window stats to track. + max_history: The max number of dataset window stats to track. 
""" self.max_history: int = max_history - self.history_buffer: List[Tuple[int, DatastreamStats]] = [] + self.history_buffer: List[Tuple[int, DatasetStats]] = [] self.count = 0 self.wait_time_s = [] @@ -887,7 +887,7 @@ def __getattr__(self, name): return self._iter_stats[name] raise AttributeError - def add(self, stats: DatastreamStats) -> None: + def add(self, stats: DatasetStats) -> None: """Called to add stats for a newly computed window.""" self.history_buffer.append((self.count, stats)) if len(self.history_buffer) > self.max_history: @@ -900,8 +900,8 @@ def add_pipeline_stats(self, other_stats: "DatasetPipelineStats"): `other_stats` should cover a disjoint set of windows than the current stats. """ - for _, datastream_stats in other_stats.history_buffer: - self.add(datastream_stats) + for _, dataset_stats in other_stats.history_buffer: + self.add(dataset_stats) self.wait_time_s.extend(other_stats.wait_time_s) @@ -918,7 +918,7 @@ def _summarize_iter(self) -> str: or self.iter_get_s.get() ): out += "\nDatasetPipeline iterator time breakdown:\n" - out += "* Waiting for next datastream: {}\n".format( + out += "* Waiting for next dataset: {}\n".format( fmt(self.iter_ds_wait_s.get()) ) out += "* In ray.wait(): {}\n".format(fmt(self.iter_wait_s.get())) @@ -947,7 +947,7 @@ def summary_string(self, exclude_first_window: bool = True) -> str: wait_time_s = self.wait_time_s[1 if exclude_first_window else 0 :] if wait_time_s: out += ( - "* Time stalled waiting for next datastream: " + "* Time stalled waiting for next dataset: " "{} min, {} max, {} mean, {} total\n".format( fmt(min(wait_time_s)), fmt(max(wait_time_s)), diff --git a/python/ray/data/_internal/table_block.py b/python/ray/data/_internal/table_block.py index b003746861a4..63efe612e49d 100644 --- a/python/ray/data/_internal/table_block.py +++ b/python/ray/data/_internal/table_block.py @@ -1,5 +1,5 @@ import collections -from typing import Dict, Iterator, List, Union, Any, TypeVar, TYPE_CHECKING +from typing import 
Dict, Iterator, List, Union, Any, TypeVar, Mapping, TYPE_CHECKING import numpy as np @@ -8,6 +8,7 @@ from ray.data.block import Block, BlockAccessor from ray.data.row import TableRow from ray.data._internal.block_builder import BlockBuilder +from ray.data._internal.numpy_support import is_array_like from ray.data._internal.size_estimator import SizeEstimator from ray.data._internal.util import _is_tensor_schema @@ -22,7 +23,7 @@ MAX_UNCOMPACTED_SIZE_BYTES = 50 * 1024 * 1024 -class TableBlockBuilder(BlockBuilder[T]): +class TableBlockBuilder(BlockBuilder): def __init__(self, block_type): # The set of uncompacted Python values buffered. self._columns = collections.defaultdict(list) @@ -46,6 +47,7 @@ def __init__(self, block_type): self._block_type = block_type def add(self, item: Union[dict, TableRow, np.ndarray]) -> None: + ctx = ray.data.DataContext.get_current() if isinstance(item, TableRow): item = item.as_pydict() elif isinstance(item, np.ndarray): @@ -70,6 +72,12 @@ def add(self, item: Union[dict, TableRow, np.ndarray]) -> None: self._column_names = item_column_names for key, value in item.items(): + if ( + ctx.strict_mode + and is_array_like(value) + and not isinstance(value, np.ndarray) + ): + value = np.array(value) self._columns[key].append(value) self._num_rows += 1 self._compact_if_needed() @@ -175,12 +183,15 @@ def to_block(self) -> Block: return self._table def is_tensor_wrapper(self) -> bool: - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: return False return _is_tensor_schema(self.column_names()) - def iter_rows(self) -> Iterator[Union[TableRow, np.ndarray]]: + def iter_rows( + self, public_row_format: bool + ) -> Iterator[Union[Mapping, np.ndarray]]: + ctx = ray.data.DataContext.get_current() outer = self class Iter: @@ -193,15 +204,23 @@ def __iter__(self): def __next__(self): self._cur += 1 if self._cur < outer.num_rows(): - return outer._get_row(self._cur) + row = 
outer._get_row(self._cur) + if ( + public_row_format + and ctx.strict_mode + and isinstance(row, TableRow) + ): + return row.as_pydict() + else: + return row raise StopIteration return Iter() - def _zip(self, acc: BlockAccessor) -> "Block[T]": + def _zip(self, acc: BlockAccessor) -> "Block": raise NotImplementedError - def zip(self, other: "Block[T]") -> "Block[T]": + def zip(self, other: "Block") -> "Block": acc = BlockAccessor.for_block(other) if not isinstance(acc, type(self)): raise ValueError( diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index ac3a44d60380..d6a7efce4ff6 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -18,8 +18,8 @@ from ray.util.placement_group import PlacementGroup import pyarrow import pandas - from ray.data._internal.arrow_block import ArrowRow - from ray.data.block import Block, BlockMetadata + from ray.data._internal.compute import ComputeStrategy + from ray.data.block import Block, BlockMetadata, UserDefinedFunction logger = logging.getLogger(__name__) @@ -64,7 +64,7 @@ def _check_pyarrow_version(): if parse_version(version) < parse_version(MIN_PYARROW_VERSION): raise ImportError( - f"Datastream requires pyarrow >= {MIN_PYARROW_VERSION}, but " + f"Dataset requires pyarrow >= {MIN_PYARROW_VERSION}, but " f"{version} is installed. Reinstall with " f'`pip install -U "pyarrow"`. ' "If you want to disable this pyarrow version check, set the " @@ -75,7 +75,7 @@ def _check_pyarrow_version(): "You are using the 'pyarrow' module, but the exact version is unknown " "(possibly carried as an internal component by another module). Please " f"make sure you are using pyarrow >= {MIN_PYARROW_VERSION} to ensure " - "compatibility with Ray Datastream. " + "compatibility with Ray Dataset. " "If you want to disable this pyarrow version check, set the " f"environment variable {RAY_DISABLE_PYARROW_VERSION_CHECK}=1." 
) @@ -104,7 +104,7 @@ def _autodetect_parallelism( Args: parallelism: The user-requested parallelism, or -1 for auto-detection. cur_pg: The current placement group, to be used for avail cpu calculation. - ctx: The current Datastream context to use for configs. + ctx: The current Dataset context to use for configs. reader: The datasource reader, to be used for data size estimation. avail_cpus: Override avail cpus detection (for testing only). @@ -142,7 +142,7 @@ def _autodetect_parallelism( def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int: - """Estimates the available CPU parallelism for this Datastream in the cluster. + """Estimates the available CPU parallelism for this Dataset in the cluster. If we aren't in a placement group, this is trivially the number of CPUs in the cluster. Otherwise, we try to calculate how large the placement group is relative @@ -156,7 +156,7 @@ def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int: # If we're in a placement group, we shouldn't assume the entire cluster's # resources are available for us to use. Estimate an upper bound on what's - # reasonable to assume is available for datastreams to use. + # reasonable to assume is available for datasets to use. if cur_pg: pg_cpus = 0 for bundle in cur_pg.bundle_specs: @@ -176,7 +176,7 @@ def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int: def _estimate_available_parallelism() -> int: - """Estimates the available CPU parallelism for this Datastream in the cluster. + """Estimates the available CPU parallelism for this Dataset in the cluster. If we are currently in a placement group, take that into account.""" cur_pg = ray.util.get_current_placement_group() return _estimate_avail_cpus(cur_pg) @@ -352,18 +352,18 @@ def _consumption_api( insert_after=False, ): """Annotate the function with an indication that it's a consumption API, and that it - will trigger Datastream execution. + will trigger Dataset execution. 
""" base = ( " will trigger execution of the lazy transformations performed on " - "this datastream." + "this dataset." ) if delegate: message = delegate + base elif not if_more_than_read: message = "This operation" + base else: - condition = "If this datastream consists of more than a read, " + condition = "If this dataset consists of more than a read, " if datasource_metadata is not None: condition += ( f"or if the {datasource_metadata} can't be determined from the " @@ -388,7 +388,7 @@ def wrap(obj): def ConsumptionAPI(*args, **kwargs): """Annotate the function with an indication that it's a consumption API, and that it - will trigger Datastream execution. + will trigger Dataset execution. """ if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): return _consumption_api()(args[0]) @@ -412,6 +412,23 @@ def _split_list(arr: List[Any], num_splits: int) -> List[List[Any]]: return splits +def validate_compute( + fn: "UserDefinedFunction", compute: Optional[Union[str, "ComputeStrategy"]] +) -> None: + # Lazily import these objects to avoid circular imports. + from ray.data._internal.compute import TaskPoolStrategy + from ray.data.block import CallableClass + + if isinstance(fn, CallableClass) and ( + compute is None or compute == "tasks" or isinstance(compute, TaskPoolStrategy) + ): + raise ValueError( + "``compute`` must be specified when using a CallableClass, and must " + f"specify the actor compute strategy, but got: {compute}. " + "For example, use ``compute=ray.data.ActorPoolStrategy(size=n)``." 
+ ) + + def capfirst(s: str): """Capitalize the first letter of a string @@ -436,7 +453,7 @@ def capitalize(s: str): return "".join(capfirst(x) for x in s.split("_")) -def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block[ArrowRow]": +def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block": from ray.data.block import BlockAccessor, BlockExecStats stats = BlockExecStats.builder() @@ -451,11 +468,13 @@ def pandas_df_to_arrow_block(df: "pandas.DataFrame") -> "Block[ArrowRow]": ) -def ndarray_to_block(ndarray: np.ndarray, strict_mode: bool) -> "Block[np.ndarray]": +def ndarray_to_block(ndarray: np.ndarray, ctx: DataContext) -> "Block": from ray.data.block import BlockAccessor, BlockExecStats + DataContext._set_current(ctx) + stats = BlockExecStats.builder() - if strict_mode: + if ctx.strict_mode: block = BlockAccessor.batch_to_block({"data": ndarray}) else: block = BlockAccessor.batch_to_block(ndarray) diff --git a/python/ray/data/aggregate.py b/python/ray/data/aggregate.py index c9da21735c96..8d50f09f7e85 100644 --- a/python/ray/data/aggregate.py +++ b/python/ray/data/aggregate.py @@ -9,7 +9,6 @@ BlockAccessor, KeyType, AggType, - KeyFn, _validate_key_fn, ) from ray.data._internal.null_aggregate import ( @@ -31,7 +30,7 @@ def __init__( init: Callable[[KeyType], AggType], merge: Callable[[AggType, AggType], AggType], accumulate_row: Callable[[AggType, T], AggType] = None, - accumulate_block: Callable[[AggType, Block[T]], AggType] = None, + accumulate_block: Callable[[AggType, Block], AggType] = None, finalize: Callable[[AggType], U] = lambda a: a, name: Optional[str] = None, ): @@ -59,7 +58,7 @@ def __init__( finalize: This is called once to compute the final aggregation result from the fully merged accumulator. name: The name of the aggregation. This will be used as the output - column name in the case of Arrow datastream. + column name in the case of Arrow dataset. 
""" if (accumulate_row is None and accumulate_block is None) or ( accumulate_row is not None and accumulate_block is not None @@ -69,9 +68,9 @@ def __init__( ) if accumulate_block is None: - def accumulate_block(a: AggType, block: Block[T]) -> AggType: + def accumulate_block(a: AggType, block: Block) -> AggType: block_acc = BlockAccessor.for_block(block) - for r in block_acc.iter_rows(): + for r in block_acc.iter_rows(public_row_format=False): a = accumulate_row(a, r) return a @@ -87,7 +86,7 @@ def _validate(self, schema: Optional[Union[type, "pa.lib.Schema"]]) -> None: class _AggregateOnKeyBase(AggregateFn): - def _set_key_fn(self, on: KeyFn): + def _set_key_fn(self, on: str): self._key_fn = on def _validate(self, schema: Optional[Union[type, "pa.lib.Schema"]]) -> None: @@ -113,8 +112,17 @@ def __init__(self): class Sum(_AggregateOnKeyBase): """Defines sum aggregation.""" - def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"sum({str(on)})" null_merge = _null_wrap_merge(ignore_nulls, lambda a1, a2: a1 + a2) @@ -127,7 +135,7 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): null_merge, ), finalize=_null_wrap_finalize(lambda a: a), - name=(f"sum({str(on)})"), + name=(self._rs_name), ) @@ -135,8 +143,17 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): class Min(_AggregateOnKeyBase): """Defines min aggregation.""" - def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"min({str(on)})" null_merge = _null_wrap_merge(ignore_nulls, min) @@ -149,7 
+166,7 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): null_merge, ), finalize=_null_wrap_finalize(lambda a: a), - name=(f"min({str(on)})"), + name=(self._rs_name), ) @@ -157,8 +174,17 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): class Max(_AggregateOnKeyBase): """Defines max aggregation.""" - def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"max({str(on)})" null_merge = _null_wrap_merge(ignore_nulls, max) @@ -171,7 +197,7 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): null_merge, ), finalize=_null_wrap_finalize(lambda a: a), - name=(f"max({str(on)})"), + name=(self._rs_name), ) @@ -179,14 +205,23 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): class Mean(_AggregateOnKeyBase): """Defines mean aggregation.""" - def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"mean({str(on)})" null_merge = _null_wrap_merge( ignore_nulls, lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]] ) - def vectorized_mean(block: Block[T]) -> AggType: + def vectorized_mean(block: Block) -> AggType: block_acc = BlockAccessor.for_block(block) count = block_acc.count(on) if count == 0 or count is None: @@ -207,7 +242,7 @@ def vectorized_mean(block: Block[T]) -> AggType: null_merge, ), finalize=_null_wrap_finalize(lambda a: a[0] / a[1]), - name=(f"mean({str(on)})"), + name=(self._rs_name), ) @@ -226,11 +261,16 @@ class Std(_AggregateOnKeyBase): def __init__( self, - on: Optional[KeyFn] = None, + on: 
Optional[str] = None, ddof: int = 1, ignore_nulls: bool = True, + alias_name: Optional[str] = None, ): self._set_key_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"std({str(on)})" def merge(a: List[float], b: List[float]): # Merges two accumulations into one. @@ -251,7 +291,7 @@ def merge(a: List[float], b: List[float]): null_merge = _null_wrap_merge(ignore_nulls, merge) - def vectorized_std(block: Block[T]) -> AggType: + def vectorized_std(block: Block) -> AggType: block_acc = BlockAccessor.for_block(block) count = block_acc.count(on) if count == 0 or count is None: @@ -282,7 +322,7 @@ def finalize(a: List[float]): null_merge, ), finalize=_null_wrap_finalize(finalize), - name=(f"std({str(on)})"), + name=(self._rs_name), ) @@ -290,9 +330,18 @@ def finalize(a: List[float]): class AbsMax(_AggregateOnKeyBase): """Defines absolute max aggregation.""" - def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): + def __init__( + self, + on: Optional[str] = None, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): self._set_key_fn(on) on_fn = _to_on_fn(on) + if alias_name: + self._rs_name = alias_name + else: + self._rs_name = f"abs_max({str(on)})" super().__init__( init=_null_wrap_init(lambda k: 0), @@ -301,14 +350,89 @@ def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True): ignore_nulls, on_fn, lambda a, r: max(a, abs(r)) ), finalize=_null_wrap_finalize(lambda a: a), - name=(f"abs_max({str(on)})"), + name=(self._rs_name), ) -def _to_on_fn(on: Optional[KeyFn]): +def _to_on_fn(on: Optional[str]): if on is None: return lambda r: r elif isinstance(on, str): return lambda r: r[on] else: return on + + +@PublicAPI +class Quantile(_AggregateOnKeyBase): + """Defines Quantile aggregation.""" + + def __init__( + self, + on: Optional[str] = None, + q: float = 0.5, + ignore_nulls: bool = True, + alias_name: Optional[str] = None, + ): + self._set_key_fn(on) + self._q = q + if alias_name: + 
self._rs_name = alias_name + else: + self._rs_name = f"quantile({str(on)})" + + def merge(a: List[int], b: List[int]): + if isinstance(a, List) and isinstance(b, List): + a.extend(b) + return a + if isinstance(a, List) and (not isinstance(b, List)): + if b is not None and b != "": + a.append(b) + return a + if isinstance(b, List) and (not isinstance(a, List)): + if a is not None and a != "": + b.append(a) + return b + + ls = [] + if a is not None and a != "": + ls.append(a) + if b is not None and b != "": + ls.append(b) + return ls + + null_merge = _null_wrap_merge(ignore_nulls, merge) + + def block_row_ls(block: Block) -> AggType: + block_acc = BlockAccessor.for_block(block) + ls = [] + for row in block_acc.iter_rows(public_row_format=False): + ls.append(row.get(on)) + return ls + + import math + + def percentile(input_values, key=lambda x: x): + if not input_values: + return None + input_values = sorted(input_values) + k = (len(input_values) - 1) * self._q + f = math.floor(k) + c = math.ceil(k) + if f == c: + return key(input_values[int(k)]) + d0 = key(input_values[int(f)]) * (c - k) + d1 = key(input_values[int(c)]) * (k - f) + return round(d0 + d1, 5) + + super().__init__( + init=_null_wrap_init(lambda k: [0]), + merge=null_merge, + accumulate_block=_null_wrap_accumulate_block( + ignore_nulls, + block_row_ls, + null_merge, + ), + finalize=_null_wrap_finalize(percentile), + name=(self._rs_name), + ) diff --git a/python/ray/data/block.py b/python/ray/data/block.py index d8fec85f4a2f..7037c8a4bf43 100644 --- a/python/ray/data/block.py +++ b/python/ray/data/block.py @@ -8,7 +8,6 @@ Any, Callable, Dict, - Generic, Iterator, List, Optional, @@ -18,11 +17,11 @@ ) import numpy as np +import colorama import ray from ray import ObjectRefGenerator from ray.data._internal.util import _check_pyarrow_version, _truncated_repr -from ray.data._internal.usage import record_block_format_usage from ray.types import ObjectRef from ray.util.annotations import DeveloperAPI, PublicAPI 
@@ -34,9 +33,9 @@ resource = None if sys.version_info >= (3, 8): - from typing import Protocol + from typing import Literal, Protocol else: - from typing_extensions import Protocol + from typing_extensions import Literal, Protocol if TYPE_CHECKING: import pandas @@ -48,35 +47,42 @@ T = TypeVar("T", contravariant=True) U = TypeVar("U", covariant=True) + KeyType = TypeVar("KeyType") AggType = TypeVar("AggType") -# A function that extracts a concrete value from a record in a Datastream, used -# in ``sort(value_fns...)``, ``groupby(value_fn).agg(Agg(value_fn), ...)``. -# It can either be None (intepreted as the identity function), the name -# of a Datastream column, or a lambda function that extracts the desired value -# from the object. -KeyFn = Union[None, str, Callable[[T], Any]] +STRICT_MODE_EXPLANATION = ( + colorama.Fore.YELLOW + + "Important: Ray Data requires schemas for all datasets in Ray 2.5. This means " + "that standalone Python objects are no longer supported. In addition, the default " + "batch format is fixed to NumPy. To revert to legacy behavior temporarily, " + "set the " + "environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n\n" + "Learn more here: https://docs.ray.io/en/master/data/faq.html#" + "migrating-to-strict-mode" + colorama.Style.RESET_ALL +) @PublicAPI class StrictModeError(ValueError): - pass + def __init__(self, message: str): + super().__init__(message + "\n\n" + STRICT_MODE_EXPLANATION) def _validate_key_fn( schema: Optional[Union[type, "pyarrow.lib.Schema"]], - key: KeyFn, + key: Optional[str], ) -> None: """Check the key function is valid on the given schema.""" if schema is None: - # Datastream is empty/cleared, validation not possible. + # Dataset is empty/cleared, validation not possible. 
return + ctx = ray.data.DataContext.get_current() is_simple_format = isinstance(schema, type) if isinstance(key, str): if is_simple_format: raise ValueError( - "String key '{}' requires datastream format to be " + "String key '{}' requires dataset format to be " "'arrow' or 'pandas', was 'simple'.".format(key) ) if len(schema.names) > 0 and key not in schema.names: @@ -84,16 +90,18 @@ def _validate_key_fn( "The column '{}' does not exist in the " "schema '{}'.".format(key, schema) ) + elif ctx.strict_mode: + raise StrictModeError(f"In Ray 2.5, the key must be a string, was: {key}") elif key is None: if not is_simple_format: raise ValueError( - "The `None` key '{}' requires datastream format to be " + "The `None` key '{}' requires dataset format to be " "'simple'.".format(key) ) elif callable(key): if not is_simple_format: raise ValueError( - "Callable key '{}' requires datastream format to be " + "Callable key '{}' requires dataset format to be " "'simple'".format(key) ) else: @@ -104,11 +112,12 @@ def _validate_key_fn( # # Block data can be accessed in a uniform way via ``BlockAccessors`` such as # ``SimpleBlockAccessor`` and ``ArrowBlockAccessor``. -Block = Union[List[T], "pyarrow.Table", "pandas.DataFrame", bytes] +Block = Union[list, "pyarrow.Table", "pandas.DataFrame", bytes] # User-facing data batch type. This is the data type for data that is supplied to and # returned from batch UDFs. -DataBatch = Union[Block, np.ndarray, Dict[str, np.ndarray]] +DataBatch = Union["pyarrow.Table", "pandas.DataFrame", Dict[str, np.ndarray]] + # A class type that implements __call__. CallableClass = type @@ -119,29 +128,11 @@ def __call__(self, __arg: T) -> Union[U, Iterator[U]]: ... -# A UDF on data batches. -BatchUDF = Union[ - # TODO(Clark): Once Ray only supports Python 3.8+, use protocol to constraint batch - # UDF type. 
- # Callable[[DataBatch, ...], DataBatch] - Callable[[DataBatch], DataBatch], - Callable[[DataBatch], Iterator[DataBatch]], - "_CallableClassProtocol", -] - -# A UDF on data rows. -RowUDF = Union[ - # TODO(Clark): Once Ray only supports Python 3.8+, use protocol to constraint batch - # UDF type. - # Callable[[T, ...], U] +# A user defined function passed to map, map_batches, etc. +UserDefinedFunction = Union[ Callable[[T], U], - "_CallableClassProtocol[T, U]", -] - - -FlatMapUDF = Union[ - RowUDF, Callable[[T], Iterator[U]], + "_CallableClassProtocol", ] # A list of block references pending computation by a single task. For example, @@ -162,18 +153,43 @@ def __call__(self, __arg: T) -> Union[U, Iterator[U]]: def _apply_strict_mode_batch_format(given_batch_format: Optional[str]) -> str: - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: if given_batch_format == "default": given_batch_format = "numpy" if given_batch_format not in VALID_BATCH_FORMATS_STRICT_MODE: raise StrictModeError( f"The given batch format {given_batch_format} is not allowed " - f"in strict mode (must be one of {VALID_BATCH_FORMATS_STRICT_MODE})." + f"in Ray 2.5 (must be one of {VALID_BATCH_FORMATS_STRICT_MODE})." ) return given_batch_format +def _apply_strict_mode_batch_size( + given_batch_size: Optional[Union[int, Literal["default"]]], use_gpu: bool +) -> Optional[int]: + ctx = ray.data.DatasetContext.get_current() + if ctx.strict_mode: + if use_gpu and (not given_batch_size or given_batch_size == "default"): + raise StrictModeError( + "`batch_size` must be provided to `map_batches` when requesting GPUs. " + "The optimal batch size depends on the model, data, and GPU used. " + "It is recommended to use the largest batch size that doesn't result " + "in your GPU device running out of memory. You can view the GPU memory " + "usage via the Ray dashboard."
+ ) + elif given_batch_size == "default": + return ray.data.context.STRICT_MODE_DEFAULT_BATCH_SIZE + else: + return given_batch_size + + else: + if given_batch_size == "default": + return ray.data.context.DEFAULT_BATCH_SIZE + else: + return given_batch_size + + @DeveloperAPI class BlockExecStats: """Execution stats for this block. @@ -257,7 +273,7 @@ def __post_init__(self): @DeveloperAPI -class BlockAccessor(Generic[T]): +class BlockAccessor: """Provides accessor methods for a specific block. Ideally, we wouldn't need a separate accessor classes for blocks. However, @@ -274,8 +290,13 @@ def num_rows(self) -> int: """Return the number of rows contained in this block.""" raise NotImplementedError - def iter_rows(self) -> Iterator[T]: - """Iterate over the rows of this block.""" + def iter_rows(self, public_row_format: bool) -> Iterator[T]: + """Iterate over the rows of this block. + + Args: + public_row_format: Whether to cast rows into the public Dict row + format (this incurs extra copy conversions). 
+ """ raise NotImplementedError def slice(self, start: int, end: int, copy: bool) -> Block: @@ -302,7 +323,7 @@ def take(self, indices: List[int]) -> Block: """ raise NotImplementedError - def select(self, columns: List[KeyFn]) -> Block: + def select(self, columns: List[Optional[str]]) -> Block: """Return a new block containing the provided columns.""" raise NotImplementedError @@ -381,12 +402,12 @@ def get_metadata( exec_stats=exec_stats, ) - def zip(self, other: "Block[T]") -> "Block[T]": + def zip(self, other: "Block") -> "Block": """Zip this block with another block of the same type and size.""" raise NotImplementedError @staticmethod - def builder() -> "BlockBuilder[T]": + def builder() -> "BlockBuilder": """Create a builder for this block type.""" raise NotImplementedError @@ -397,12 +418,12 @@ def batch_to_block(batch: DataBatch) -> Block: if isinstance(batch, np.ndarray): from ray.data._internal.arrow_block import ArrowBlockAccessor - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: raise StrictModeError( f"Error validating {_truncated_repr(batch)}: " "Standalone numpy arrays are not " - "allowed in strict mode. Return a dict of field -> array, " + "allowed in Ray 2.5. Return a dict of field -> array, " "e.g., `{'data': array}` instead of `array`." 
) @@ -412,10 +433,8 @@ def batch_to_block(batch: DataBatch) -> Block: import pyarrow as pa try: - return ArrowBlockAccessor.numpy_to_block( - batch, passthrough_arrow_not_implemented_errors=True - ) - except pa.ArrowNotImplementedError: + return ArrowBlockAccessor.numpy_to_block(batch) + except (pa.ArrowNotImplementedError, pa.ArrowInvalid, pa.ArrowTypeError): import pandas as pd # TODO(ekl) once we support Python objects within Arrow blocks, we @@ -433,59 +452,55 @@ def for_block(block: Block) -> "BlockAccessor[T]": if isinstance(block, pyarrow.Table): from ray.data._internal.arrow_block import ArrowBlockAccessor - record_block_format_usage("arrow") return ArrowBlockAccessor(block) elif isinstance(block, pandas.DataFrame): from ray.data._internal.pandas_block import PandasBlockAccessor - record_block_format_usage("pandas") return PandasBlockAccessor(block) elif isinstance(block, bytes): from ray.data._internal.arrow_block import ArrowBlockAccessor - record_block_format_usage("arrow") return ArrowBlockAccessor.from_bytes(block) elif isinstance(block, list): from ray.data._internal.simple_block import SimpleBlockAccessor - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: raise StrictModeError( f"Error validating {_truncated_repr(block)}: " "Standalone Python objects are not " - "allowed in strict mode. To use Python objects in a datastream, " + "allowed in Ray 2.5. To use Python objects in a dataset, " "wrap them in a dict of numpy arrays, e.g., " - "return `{'item': np.array(batch)}` instead of just `batch`." + "return `{'item': batch}` instead of just `batch`." 
) - record_block_format_usage("simple") return SimpleBlockAccessor(block) else: raise TypeError("Not a block type: {} ({})".format(block, type(block))) - def sample(self, n_samples: int, key: Any) -> "Block[T]": + def sample(self, n_samples: int, key: Any) -> "Block": """Return a random sample of items from this block.""" raise NotImplementedError def sort_and_partition( self, boundaries: List[T], key: Any, descending: bool - ) -> List["Block[T]"]: + ) -> List["Block"]: """Return a list of sorted partitions of this block.""" raise NotImplementedError - def combine(self, key: KeyFn, agg: "AggregateFn") -> Block[U]: + def combine(self, key: Optional[str], agg: "AggregateFn") -> Block: """Combine rows with the same key into an accumulator.""" raise NotImplementedError @staticmethod def merge_sorted_blocks( - blocks: List["Block[T]"], key: Any, descending: bool - ) -> Tuple[Block[T], BlockMetadata]: + blocks: List["Block"], key: Any, descending: bool + ) -> Tuple[Block, BlockMetadata]: """Return a sorted block by merging a list of sorted blocks.""" raise NotImplementedError @staticmethod def aggregate_combined_blocks( - blocks: List[Block], key: KeyFn, agg: "AggregateFn" - ) -> Tuple[Block[U], BlockMetadata]: + blocks: List[Block], key: Optional[str], agg: "AggregateFn" + ) -> Tuple[Block, BlockMetadata]: """Aggregate partially combined and sorted blocks.""" raise NotImplementedError diff --git a/python/ray/data/context.py b/python/ray/data/context.py index 651a10de7c3c..519a1c3148e2 100644 --- a/python/ray/data/context.py +++ b/python/ray/data/context.py @@ -2,6 +2,7 @@ import threading from typing import Optional, TYPE_CHECKING +from ray._private.ray_constants import env_integer from ray.util.annotations import DeveloperAPI from ray.util.scheduling_strategies import SchedulingStrategyT @@ -12,7 +13,7 @@ _default_context: "Optional[DataContext]" = None _context_lock = threading.Lock() -# An estimate of what fraction of the object store a Datastream can use without too 
high +# An estimate of what fraction of the object store a Dataset can use without too high # a risk of triggering spilling. This is used to generate user warnings only. ESTIMATED_SAFE_MEMORY_FRACTION = 0.25 @@ -20,7 +21,7 @@ # We choose 512MiB as 8x less than the typical memory:core ratio of 4:1. DEFAULT_TARGET_MAX_BLOCK_SIZE = 512 * 1024 * 1024 -# Datastream will avoid creating blocks smaller than this size in bytes on read. +# Dataset will avoid creating blocks smaller than this size in bytes on read. # This takes precedence over DEFAULT_MIN_PARALLELISM. DEFAULT_TARGET_MIN_BLOCK_SIZE = 1 * 1024 * 1024 @@ -37,10 +38,10 @@ # TODO (kfstorm): Remove this once stable. DEFAULT_ENABLE_PANDAS_BLOCK = True -# Whether to enable stage-fusion optimizations for datastream pipelines. +# Whether to enable stage-fusion optimizations for dataset pipelines. DEFAULT_OPTIMIZE_FUSE_STAGES = True -# Whether to enable stage-reorder optimizations for datastream pipelines. +# Whether to enable stage-reorder optimizations for dataset pipelines. DEFAULT_OPTIMIZE_REORDER_STAGES = True # Whether to furthermore fuse read stages. @@ -49,12 +50,12 @@ # Whether to furthermore fuse prior map tasks with shuffle stages. DEFAULT_OPTIMIZE_FUSE_SHUFFLE_STAGES = True -# Minimum amount of parallelism to auto-detect for a datastream. Note that the min +# Minimum amount of parallelism to auto-detect for a dataset. Note that the min # block size config takes precedence over this. DEFAULT_MIN_PARALLELISM = 200 # Wether to use actor based block prefetcher. -DEFAULT_ACTOR_PREFETCHER_ENABLED = True +DEFAULT_ACTOR_PREFETCHER_ENABLED = False # Whether to use push-based shuffle by default. DEFAULT_USE_PUSH_BASED_SHUFFLE = bool( @@ -64,7 +65,7 @@ # The default global scheduling strategy. DEFAULT_SCHEDULING_STRATEGY = "DEFAULT" -# Whether to use Polars for tabular datastream sorts, groupbys, and aggregations. +# Whether to use Polars for tabular dataset sorts, groupbys, and aggregations. 
DEFAULT_USE_POLARS = False # Whether to use the new executor backend. @@ -92,8 +93,8 @@ # extension columns. DEFAULT_ENABLE_TENSOR_EXTENSION_CASTING = True -# Whether to automatically print Datastream stats after execution. -# If disabled, users can still manually print stats with Datastream.stats(). +# Whether to automatically print Dataset stats after execution. +# If disabled, users can still manually print stats with Dataset.stats(). DEFAULT_AUTO_LOG_STATS = False # Whether to enable optimizer. @@ -106,7 +107,7 @@ # Enable strict schema mode (experimental). In this mode, we only allow structured # schemas, and default to numpy as the batch format. -DEFAULT_STRICT_MODE = bool(int(os.environ.get("RAY_DATA_STRICT_MODE", "0"))) +DEFAULT_STRICT_MODE = bool(int(os.environ.get("RAY_DATA_STRICT_MODE", "1"))) # Set this to True to use the legacy iter_batches codepath prior to 2.4. DEFAULT_USE_LEGACY_ITER_BATCHES = False @@ -120,10 +121,18 @@ # Default batch size for batch transformations. DEFAULT_BATCH_SIZE = 4096 +# Default batch size for batch transformations in strict mode. +STRICT_MODE_DEFAULT_BATCH_SIZE = 1024 + +# Whether to enable progress bars. +DEFAULT_ENABLE_PROGRESS_BARS = not bool( + env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0) +) + @DeveloperAPI class DataContext: - """Singleton for shared Datastream resources and configurations. + """Singleton for shared Dataset resources and configurations. This object is automatically propagated to workers and can be retrieved from the driver and remote workers via DataContext.get_current(). 
@@ -158,6 +167,7 @@ def __init__( use_ray_tqdm: bool, use_legacy_iter_batches: bool, strict_mode: bool, + enable_progress_bars: bool, ): """Private constructor (use get_current() instead).""" self.block_splitting_enabled = block_splitting_enabled @@ -185,11 +195,12 @@ def __init__( self.enable_auto_log_stats = enable_auto_log_stats self.trace_allocations = trace_allocations self.optimizer_enabled = optimizer_enabled - # TODO: expose execution options in Datastream public APIs. + # TODO: expose execution options in Dataset public APIs. self.execution_options = execution_options self.use_ray_tqdm = use_ray_tqdm self.use_legacy_iter_batches = use_legacy_iter_batches self.strict_mode = strict_mode + self.enable_progress_bars = enable_progress_bars @staticmethod def get_current() -> "DataContext": @@ -238,6 +249,7 @@ def get_current() -> "DataContext": use_ray_tqdm=DEFAULT_USE_RAY_TQDM, use_legacy_iter_batches=DEFAULT_USE_LEGACY_ITER_BATCHES, strict_mode=DEFAULT_STRICT_MODE, + enable_progress_bars=DEFAULT_ENABLE_PROGRESS_BARS, ) return _default_context @@ -246,7 +258,7 @@ def get_current() -> "DataContext": def _set_current(context: "DataContext") -> None: """Set the current context in a remote worker. - This is used internally by Datastream to propagate the driver context to + This is used internally by Dataset to propagate the driver context to remote workers used for parallelization. """ global _default_context diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index b922ac2611a9..e1fccf0ac89a 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -1,4 +1,4541 @@ -from ray.data.datastream import Datastream +import collections +import itertools +import logging +import sys +import time +import html +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Type, + Optional, + Tuple, + Union, + Mapping, +) +from uuid import uuid4 -# Backwards compatibility alias. 
-Dataset = Datastream +import numpy as np + +import ray +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray._private.usage import usage_lib +from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray +import ray.cloudpickle as pickle +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.util.data_batch_conversion import BlockFormat +from ray.data._internal.logical.operators.all_to_all_operator import ( + RandomShuffle, + RandomizeBlocks, + Repartition, + Sort, +) +from ray.data._internal.logical.operators.n_ary_operator import Zip +from ray.data._internal.logical.optimizers import LogicalPlan +from ray.data._internal.logical.operators.limit_operator import Limit +from ray.data._internal.logical.operators.map_operator import ( + Filter, + FlatMap, + MapRows, + MapBatches, +) +from ray.data._internal.logical.operators.write_operator import Write +from ray.data._internal.planner.filter import generate_filter_fn +from ray.data._internal.planner.flat_map import generate_flat_map_fn +from ray.data._internal.planner.map_batches import generate_map_batches_fn +from ray.data._internal.planner.map_rows import generate_map_rows_fn +from ray.data._internal.planner.write import generate_write_fn +from ray.data.iterator import DataIterator +from ray.data._internal.block_list import BlockList +from ray.data._internal.iterator.iterator_impl import ( + DataIteratorImpl, +) +from ray.data._internal.iterator.stream_split_iterator import ( + StreamSplitDataIterator, +) +from ray.data._internal.compute import ( + ActorPoolStrategy, + CallableClass, + ComputeStrategy, + TaskPoolStrategy, +) +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.equalize import _equalize +from ray.data._internal.lazy_block_list import LazyBlockList +from ray.data._internal.util import ( + _estimate_available_parallelism, + _is_local_scheme, + validate_compute, + ConsumptionAPI, +) +from 
ray.data._internal.pandas_block import PandasBlockSchema +from ray.data._internal.plan import ( + ExecutionPlan, + OneToOneStage, +) +from ray.data._internal.stage_impl import ( + RandomizeBlocksStage, + RepartitionStage, + RandomShuffleStage, + ZipStage, + SortStage, + LimitStage, +) +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import _split_at_indices, _get_num_rows +from ray.data._internal.stats import DatasetStats, DatasetStatsSummary +from ray.data.aggregate import AggregateFn, Max, Mean, Min, Std, Sum +from ray.data.block import ( + VALID_BATCH_FORMATS, + STRICT_MODE_EXPLANATION, + _apply_strict_mode_batch_format, + _apply_strict_mode_batch_size, + UserDefinedFunction, + Block, + BlockAccessor, + BlockMetadata, + BlockPartition, + DataBatch, + StrictModeError, + T, + U, + _validate_key_fn, +) +from ray.data.context import ( + DataContext, + WARN_PREFIX, + OK_PREFIX, + ESTIMATED_SAFE_MEMORY_FRACTION, +) +from ray.data.datasource import ( + BlockWritePathProvider, + CSVDatasource, + Datasource, + DefaultBlockWritePathProvider, + JSONDatasource, + NumpyDatasource, + ParquetDatasource, + ReadTask, + TFRecordDatasource, + WriteResult, +) +from ray.data.datasource.file_based_datasource import ( + _unwrap_arrow_serialization_workaround, + _wrap_arrow_serialization_workaround, +) +from ray.data.random_access_dataset import RandomAccessDataset +from ray.types import ObjectRef +from ray.util.annotations import DeveloperAPI, PublicAPI, Deprecated +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +from ray.widgets import Template +from ray.widgets.util import ( + ensure_ipywidgets_dep, + repr_fallback_if_colab, +) + +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + +if TYPE_CHECKING: + import dask + import mars + import modin + import pandas + import pyarrow + import pyspark + import 
tensorflow as tf + import torch + import torch.utils.data + + from ray.data.dataset_pipeline import DatasetPipeline + from ray.data.grouped_data import GroupedData + from ray.data._internal.execution.interfaces import Executor, NodeIdStr + from ray.data._internal.torch_iterable_dataset import TorchTensorBatchType + from tensorflow_metadata.proto.v0 import schema_pb2 + + +logger = logging.getLogger(__name__) + +TensorflowFeatureTypeSpec = Union[ + "tf.TypeSpec", List["tf.TypeSpec"], Dict[str, "tf.TypeSpec"] +] + +TensorFlowTensorBatchType = Union["tf.Tensor", Dict[str, "tf.Tensor"]] + + +@PublicAPI +class Dataset: + """A Dataset is a distributed data collection for data loading and processing. + + Datasets are distributed pipelines that produce ``ObjectRef[Block]`` outputs, + where each block holds data in Arrow format, representing a shard of the overall + data collection. The block also determines the unit of parallelism. + + Datasets can be created in multiple ways: from synthetic data via ``range_*()`` + APIs, from existing memory data via ``from_*()`` APIs (this creates a subclass + of Dataset called ``MaterializedDataset``), or from external storage + systems such as local disk, S3, HDFS etc. via the ``read_*()`` APIs. The + (potentially processed) Dataset can be saved back to external storage systems + via the ``write_*()`` APIs. + + Examples: + >>> import ray + >>> # Create dataset from synthetic data. + >>> ds = ray.data.range(1000) + >>> # Create dataset from in-memory data. + >>> ds = ray.data.from_items( + ... [{"col1": i, "col2": i * 2} for i in range(1000)]) + >>> # Create dataset from external storage system. + >>> ds = ray.data.read_parquet("s3://bucket/path") # doctest: +SKIP + >>> # Save dataset back to external storage system. + >>> ds.write_csv("s3://bucket/output") # doctest: +SKIP + + Dataset has two kinds of operations: transformation, which takes in Dataset + and outputs a new Dataset (e.g. 
:py:meth:`.map_batches()`); and consumption, + which produces values (not Dataset) as output (e.g. :py:meth:`.iter_batches()`). + + Dataset transformations are lazy, with execution of the transformations being + triggered by downstream consumption. + + Dataset supports parallel processing at scale: transformations such as + :py:meth:`.map_batches()`, aggregations such as + :py:meth:`.min()`/:py:meth:`.max()`/:py:meth:`.mean()`, grouping via + :py:meth:`.groupby()`, shuffling operations such as :py:meth:`.sort()`, + :py:meth:`.random_shuffle()`, and :py:meth:`.repartition()`. + + Examples: + >>> import ray + >>> ds = ray.data.range(1000) + >>> # Transform batches (Dict[str, np.ndarray]) with map_batches(). + >>> ds.map_batches(lambda batch: {"id": batch["id"] * 2}) + MapBatches() + +- Dataset(num_blocks=17, num_rows=1000, schema={id: int64}) + >>> # Compute the maximum. + >>> ds.max("id") + 999 + >>> # Shuffle this dataset randomly. + >>> ds.random_shuffle() + RandomShuffle + +- Dataset(num_blocks=..., num_rows=1000, schema={id: int64}) + >>> # Sort it back in order. + >>> ds.sort("id") + Sort + +- Dataset(num_blocks=..., num_rows=1000, schema={id: int64}) + + Both unexecuted and materialized Datasets can be passed between Ray tasks and + actors without incurring a copy. Dataset supports conversion to/from several + more featureful dataframe libraries (e.g., Spark, Dask, Modin, MARS), and is also + compatible with distributed TensorFlow / PyTorch. + """ + + def __init__( + self, + plan: ExecutionPlan, + epoch: int, + lazy: bool = True, + logical_plan: Optional[LogicalPlan] = None, + ): + """Construct a Dataset (internal API). + + The constructor is not part of the Dataset API. Use the ``ray.data.*`` + read methods to construct a dataset. + """ + assert isinstance(plan, ExecutionPlan) + usage_lib.record_library_usage("dataset") # Legacy telemetry name.
+ + if ray.util.log_once("strict_mode_explanation"): + logger.warning(STRICT_MODE_EXPLANATION) + + self._plan = plan + self._uuid = uuid4().hex + self._epoch = epoch + self._lazy = lazy + self._logical_plan = logical_plan + if logical_plan is not None: + self._plan.link_logical_plan(logical_plan) + + if not lazy: + self._plan.execute(allow_clear_input_blocks=False) + + # Handle to currently running executor for this dataset. + self._current_executor: Optional["Executor"] = None + + @staticmethod + def copy( + ds: "Dataset", _deep_copy: bool = False, _as: Optional[type] = None + ) -> "Dataset": + if not _as: + _as = Dataset + if _deep_copy: + return _as(ds._plan.deep_copy(), ds._epoch, ds._lazy, ds._logical_plan) + else: + return _as(ds._plan.copy(), ds._epoch, ds._lazy, ds._logical_plan) + + def map( + self, + fn: UserDefinedFunction[Dict[str, Any], Dict[str, Any]], + *, + compute: Optional[ComputeStrategy] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to each record of this dataset. + + Note that mapping individual records can be quite slow. Consider using + `.map_batches()` for performance. + + Examples: + >>> import ray + >>> # Transform python objects. + >>> ds = ray.data.range(1000) + >>> # The function goes from record (Dict[str, Any]) to record. + >>> ds.map(lambda record: {"id": record["id"] * 2}) + Map + +- Dataset(num_blocks=..., num_rows=1000, schema={id: int64}) + >>> # Transform Arrow records. + >>> ds = ray.data.from_items( + ... [{"value": i} for i in range(1000)]) + >>> ds.map(lambda record: {"v2": record["value"] * 2}) + Map + +- Dataset(num_blocks=200, num_rows=1000, schema={value: int64}) + >>> # Define a callable class that persists state across + >>> # function invocations for efficiency. + >>> init_model = ... # doctest: +SKIP + >>> class CachedModel: + ... def __init__(self): + ... self.model = init_model() + ... def __call__(self, batch): + ... 
return self.model(batch) + >>> # Apply the transform in parallel on GPUs. Since + >>> # compute=ActorPoolStrategy(size=8) the transform will be applied on a + >>> # pool of 8 Ray actors, each allocated 1 GPU by Ray. + >>> ds.map(CachedModel, # doctest: +SKIP + ... compute=ray.data.ActorPoolStrategy(size=8), + ... num_gpus=1) + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The function to apply to each record, or a class type + that can be instantiated to create such a callable. Callable classes are + only supported for the actor compute strategy. + compute: The compute strategy, either None (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + + .. seealso:: + + :meth:`~Dataset.flat_map`: + Call this method to create new records from existing ones. Unlike + :meth:`~Dataset.map`, a function passed to + :meth:`~Dataset.flat_map` can return multiple records. + + :meth:`~Dataset.flat_map` isn't recommended because it's slow; call + :meth:`~Dataset.map_batches` instead. + + :meth:`~Dataset.map_batches` + Call this method to transform batches of data. It's faster and more + flexible than :meth:`~Dataset.map` and :meth:`~Dataset.flat_map`. 
+ """ + validate_compute(fn, compute) + self._warn_slow() + + transform_fn = generate_map_rows_fn() + + plan = self._plan.with_stage( + OneToOneStage( + "Map", + transform_fn, + compute, + ray_remote_args, + fn=fn, + ) + ) + + logical_plan = self._logical_plan + if logical_plan is not None: + map_op = MapRows( + logical_plan.dag, + fn, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(map_op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def map_batches( + self, + fn: UserDefinedFunction[DataBatch, DataBatch], + *, + batch_size: Union[int, None, Literal["default"]] = "default", + compute: Optional[ComputeStrategy] = None, + batch_format: Optional[str] = "default", + zero_copy_batch: bool = False, + fn_args: Optional[Iterable[Any]] = None, + fn_kwargs: Optional[Dict[str, Any]] = None, + fn_constructor_args: Optional[Iterable[Any]] = None, + fn_constructor_kwargs: Optional[Dict[str, Any]] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to batches of data. + + This applies the ``fn`` in parallel with map tasks, with each task handling + a batch of data (typically Dict[str, np.ndarray] or pd.DataFrame). + + To learn more about writing functions for :meth:`~Dataset.map_batches`, read + :ref:`writing user-defined functions `. + + .. tip:: + If ``fn`` does not mutate its input, set ``zero_copy_batch=True`` to elide a + batch copy, which can improve performance and decrease memory utilization. + ``fn`` will then receive zero-copy read-only batches. + If ``fn`` mutates its input, you will need to ensure that the batch provided + to ``fn`` is writable by setting ``zero_copy_batch=False`` (default). This + will create an extra, mutable copy of each batch before handing it to + ``fn``. + + .. note:: + The size of the batches provided to ``fn`` may be smaller than the provided + ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent to + a given map task. 
When ``batch_size`` is specified, each map task will be + sent a single block if the block is equal to or larger than ``batch_size``, + and will be sent a bundle of blocks up to (but not exceeding) + ``batch_size`` if blocks are smaller than ``batch_size``. + + Examples: + + >>> import numpy as np + >>> import ray + >>> ds = ray.data.from_items([ + ... {"name": "Luna", "age": 4}, + ... {"name": "Rory", "age": 14}, + ... {"name": "Scout", "age": 9}, + ... ]) + >>> ds # doctest: +SKIP + MaterializedDataset( + num_blocks=3, + num_rows=3, + schema={name: string, age: int64} + ) + + Here ``fn`` returns the same batch type as the input, but your ``fn`` can + also return a different batch type (e.g., pd.DataFrame). Read more about + :ref:`Transforming Data `. + + >>> from typing import Dict + >>> def map_fn(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + ... batch["age_in_dog_years"] = 7 * batch["age"] + ... return batch + >>> ds = ds.map_batches(map_fn) + >>> ds + MapBatches(map_fn) + +- Dataset(num_blocks=3, num_rows=3, schema={name: string, age: int64}) + + :ref:`Actors ` can improve the performance of some workloads. + For example, you can use :ref:`actors ` to load a model once + per worker instead of once per inference. + + To transform batches with :ref:`actors `, pass a callable type + to ``fn`` and specify an :class:`~ray.data.ActorPoolStrategy`. + + In the example below, ``CachedModel`` is called on an autoscaling pool of + two to eight :ref:`actors `, each allocated one GPU by Ray. + + >>> init_large_model = ... # doctest: +SKIP + >>> class CachedModel: + ... def __init__(self): + ... self.model = init_large_model() + ... def __call__(self, item): + ... return self.model(item) + >>> ds.map_batches( # doctest: +SKIP + ... CachedModel, # doctest: +SKIP + ... batch_size=256, # doctest: +SKIP + ... compute=ray.data.ActorPoolStrategy(size=8), # doctest: +SKIP + ... num_gpus=1, + ... 
) # doctest: +SKIP + + ``fn`` can also be a generator, yielding multiple batches in a single + invocation. This is useful when returning large objects. Instead of + returning a very large output batch, ``fn`` can instead yield the + output batch in chunks. + + >>> def map_fn_with_large_output(batch): + ... for i in range(3): + ... yield {"large_output": np.ones((100, 1000))} + >>> ds = ray.data.from_items([1]) + >>> ds = ds.map_batches(map_fn_with_large_output) + >>> ds + MapBatches(map_fn_with_large_output) + +- Dataset(num_blocks=1, num_rows=1, schema={item: int64}) + + + Args: + fn: The function or generator to apply to each record batch, or a class type + that can be instantiated to create such a callable. Callable classes are + only supported for the actor compute strategy. Note ``fn`` must be + pickle-able. + batch_size: The desired number of rows in each batch, or None to use entire + blocks as batches (blocks may contain different number of rows). + The actual size of the batch provided to ``fn`` may be smaller than + ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent + to a given map task. Default batch_size is 4096 with "default". + compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + batch_format: Specify ``"default"`` to use the default block format + (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to + select ``pyarrow.Table``, or ``"numpy"`` to select + ``Dict[str, numpy.ndarray]``, or None to return the underlying block + exactly as is with no additional formatting. + zero_copy_batch: Whether ``fn`` should be provided zero-copy, read-only + batches. 
If this is ``True`` and no copy is required for the + ``batch_format`` conversion, the batch will be a zero-copy, read-only + view on data in Ray's object store, which can decrease memory + utilization and improve performance. If this is ``False``, the batch + will be writable, which will require an extra copy to guarantee. + If ``fn`` mutates its input, this will need to be ``False`` in order to + avoid "assignment destination is read-only" or "buffer source array is + read-only" errors. Default is ``False``. + fn_args: Positional arguments to pass to ``fn`` after the first argument. + These arguments are top-level arguments to the underlying Ray task. + fn_kwargs: Keyword arguments to pass to ``fn``. These arguments are + top-level arguments to the underlying Ray task. + fn_constructor_args: Positional arguments to pass to ``fn``'s constructor. + You can only provide this if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor. + This can only be provided if ``fn`` is a callable class. These arguments + are top-level arguments in the underlying Ray actor construction task. + ray_remote_args: Additional resource requirements to request from + ray (e.g., ``num_gpus=1`` to request GPUs for the map tasks). + + .. seealso:: + + :meth:`~Dataset.iter_batches` + Call this function to iterate over batches of data. + + :meth:`~Dataset.flat_map`: + Call this method to create new records from existing ones. Unlike + :meth:`~Dataset.map`, a function passed to :meth:`~Dataset.flat_map` + can return multiple records. + + :meth:`~Dataset.flat_map` isn't recommended because it's slow; call + :meth:`~Dataset.map_batches` instead. + + :meth:`~Dataset.map` + Call this method to transform one record at time. + + This method isn't recommended because it's slow; call + :meth:`~Dataset.map_batches` instead. 
+ """ # noqa: E501 + + batch_format = _apply_strict_mode_batch_format(batch_format) + if batch_format == "native": + logger.warning("The 'native' batch format has been renamed 'default'.") + + target_block_size = None + if batch_size is not None and batch_size != "default": + if batch_size < 1: + raise ValueError("Batch size cannot be negative or 0") + # Enable blocks bundling when batch_size is specified by caller. + target_block_size = batch_size + + batch_size = _apply_strict_mode_batch_size( + batch_size, use_gpu="num_gpus" in ray_remote_args + ) + + if batch_format not in VALID_BATCH_FORMATS: + raise ValueError( + f"The batch format must be one of {VALID_BATCH_FORMATS}, got: " + f"{batch_format}" + ) + + validate_compute(fn, compute) + + if fn_constructor_args is not None or fn_constructor_kwargs is not None: + if compute is None or ( + compute != "actors" and not isinstance(compute, ActorPoolStrategy) + ): + raise ValueError( + "fn_constructor_args and fn_constructor_kwargs can only be " + "specified if using the actor pool compute strategy, but got: " + f"{compute}" + ) + if not isinstance(fn, CallableClass): + raise ValueError( + "fn_constructor_args and fn_constructor_kwargs can only be " + "specified if providing a CallableClass instance for fn, but got: " + f"{fn}" + ) + + transform_fn = generate_map_batches_fn( + batch_size=batch_size, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + ) + + # TODO(chengsu): pass function name to MapBatches logical operator. + if hasattr(fn, "__self__") and isinstance( + fn.__self__, ray.data.preprocessor.Preprocessor + ): + stage_name = fn.__self__.__class__.__name__ + else: + stage_name = f'MapBatches({getattr(fn, "__name__", type(fn))})' + + stage = OneToOneStage( + stage_name, + transform_fn, + compute, + ray_remote_args, + # TODO(Clark): Add a strict cap here. 
+ target_block_size=target_block_size, + fn=fn, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + ) + plan = self._plan.with_stage(stage) + + logical_plan = self._logical_plan + if logical_plan is not None: + map_batches_op = MapBatches( + logical_plan.dag, + fn, + batch_size=batch_size, + batch_format=batch_format, + zero_copy_batch=zero_copy_batch, + target_block_size=target_block_size, + fn_args=fn_args, + fn_kwargs=fn_kwargs, + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(map_batches_op) + + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def add_column( + self, + col: str, + fn: Callable[["pandas.DataFrame"], "pandas.Series"], + *, + compute: Optional[str] = None, + **ray_remote_args, + ) -> "Dataset": + """Add the given column to the dataset. + + This is only supported for datasets convertible to pandas format. + A function generating the new column values given the batch in pandas + format must be specified. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> # Add a new column equal to value * 2. + >>> ds = ds.add_column("new_col", lambda df: df["id"] * 2) + >>> # Overwrite the existing "value" with zeros. + >>> ds = ds.add_column("id", lambda df: 0) + + Time complexity: O(dataset size / parallelism) + + Args: + col: Name of the column to add. If the name already exists, the + column will be overwritten. + fn: Map function generating the column values given a batch of + records in pandas format. + compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. 
+ ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + """ + + def process_batch(batch: "pandas.DataFrame") -> "pandas.DataFrame": + batch.loc[:, col] = fn(batch) + return batch + + if not callable(fn): + raise ValueError("`fn` must be callable, got {}".format(fn)) + + return self.map_batches( + process_batch, + batch_format="pandas", # TODO(ekl) we should make this configurable. + compute=compute, + zero_copy_batch=False, + **ray_remote_args, + ) + + def drop_columns( + self, + cols: List[str], + *, + compute: Optional[str] = None, + **ray_remote_args, + ) -> "Dataset": + """Drop one or more columns from the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> # Add a new column equal to value * 2. + >>> ds = ds.add_column("new_col", lambda df: df["id"] * 2) + >>> # Drop the existing "value" column. + >>> ds = ds.drop_columns(["id"]) + + + Time complexity: O(dataset size / parallelism) + + Args: + cols: Names of the columns to drop. If any name does not exist, + an exception will be raised. + compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + """ + + return self.map_batches( + lambda batch: batch.drop(columns=cols), + batch_format="pandas", + zero_copy_batch=True, + compute=compute, + **ray_remote_args, + ) + + def select_columns( + self, + cols: List[str], + *, + compute: Union[str, ComputeStrategy] = None, + **ray_remote_args, + ) -> "Dataset": + """Select one or more columns from the dataset. + + All input columns used to select need to be in the schema of the dataset. 
+ + Examples: + >>> import ray + >>> # Create a dataset with 3 columns + >>> ds = ray.data.from_items([{"col1": i, "col2": i+1, "col3": i+2} + ... for i in range(10)]) + >>> # Select only "col1" and "col2" columns. + >>> ds = ds.select_columns(cols=["col1", "col2"]) + >>> ds + MapBatches() + +- Dataset( + num_blocks=10, + num_rows=10, + schema={col1: int64, col2: int64, col3: int64} + ) + + + Time complexity: O(dataset size / parallelism) + + Args: + cols: Names of the columns to select. If any name is not included in the + dataset schema, an exception will be raised. + compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + """ # noqa: E501 + return self.map_batches( + lambda batch: BlockAccessor.for_block(batch).select(columns=cols), + batch_format="pandas", + zero_copy_batch=True, + compute=compute, + **ray_remote_args, + ) + + def flat_map( + self, + fn: UserDefinedFunction[Dict[str, Any], List[Dict[str, Any]]], + *, + compute: Optional[ComputeStrategy] = None, + **ray_remote_args, + ) -> "Dataset": + """Apply the given function to each record and then flatten results. + + Consider using ``.map_batches()`` for better performance (the batch size can be + altered in map_batches). + + Examples: + >>> import ray + >>> ds = ray.data.range(1000) + >>> ds.flat_map(lambda x: [{"id": 1}, {"id": 2}, {"id": 4}]) + FlatMap + +- Dataset(num_blocks=..., num_rows=1000, schema={id: int64}) + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The function or generator to apply to each record, or a class type + that can be instantiated to create such a callable. Callable classes are + only supported for the actor compute strategy. 
+ compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + + .. seealso:: + + :meth:`~Dataset.map_batches` + Call this method to transform batches of data. It's faster and more + flexible than :meth:`~Dataset.map` and :meth:`~Dataset.flat_map`. + + :meth:`~Dataset.map` + Call this method to transform one record at time. + + This method isn't recommended because it's slow; call + :meth:`~Dataset.map_batches` instead. + """ + validate_compute(fn, compute) + self._warn_slow() + + transform_fn = generate_flat_map_fn() + + plan = self._plan.with_stage( + OneToOneStage("FlatMap", transform_fn, compute, ray_remote_args, fn=fn) + ) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = FlatMap( + input_op=logical_plan.dag, + fn=fn, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def filter( + self, + fn: UserDefinedFunction[Dict[str, Any], bool], + *, + compute: Union[str, ComputeStrategy] = None, + **ray_remote_args, + ) -> "Dataset": + """Filter out records that do not satisfy the given predicate. + + Consider using ``.map_batches()`` for better performance (you can implement + filter by dropping records). + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> ds.filter(lambda x: x["id"] % 2 == 0) + Filter + +- Dataset(num_blocks=..., num_rows=100, schema={id: int64}) + + Time complexity: O(dataset size / parallelism) + + Args: + fn: The predicate to apply to each record, or a class type + that can be instantiated to create such a callable. Callable classes are + only supported for the actor compute strategy. 
+ compute: The compute strategy, either "tasks" (default) to use Ray + tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor + pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an + autoscaling actor pool. + ray_remote_args: Additional resource requirements to request from + ray (e.g., num_gpus=1 to request GPUs for the map tasks). + """ + validate_compute(fn, compute) + self._warn_slow() + + transform_fn = generate_filter_fn() + + plan = self._plan.with_stage( + OneToOneStage("Filter", transform_fn, compute, ray_remote_args, fn=fn) + ) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = Filter( + input_op=logical_plan.dag, + fn=fn, + compute=compute, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op) + + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def repartition(self, num_blocks: int, *, shuffle: bool = False) -> "Dataset": + """Repartition the dataset into exactly this number of blocks. + + After repartitioning, all blocks in the returned dataset will have + approximately the same number of rows. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> # Set the number of output partitions to write to disk. + >>> ds.repartition(10).write_parquet("/tmp/test") + + Time complexity: O(dataset size / parallelism) + + Args: + num_blocks: The number of blocks. + shuffle: Whether to perform a distributed shuffle during the + repartition. When shuffle is enabled, each output block + contains a subset of data rows from each input block, which + requires all-to-all data movement. When shuffle is disabled, + output blocks are created from adjacent input blocks, + minimizing data movement. + + Returns: + The repartitioned dataset. 
+ """ + + plan = self._plan.with_stage(RepartitionStage(num_blocks, shuffle)) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = Repartition( + logical_plan.dag, + num_outputs=num_blocks, + shuffle=shuffle, + ) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def random_shuffle( + self, + *, + seed: Optional[int] = None, + num_blocks: Optional[int] = None, + **ray_remote_args, + ) -> "Dataset": + """Randomly shuffle the elements of this dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) + >>> # Shuffle this dataset randomly. + >>> ds.random_shuffle() + RandomShuffle + +- Dataset(num_blocks=..., num_rows=100, schema={id: int64}) + >>> # Shuffle this dataset with a fixed random seed. + >>> ds.random_shuffle(seed=12345) + RandomShuffle + +- Dataset(num_blocks=..., num_rows=100, schema={id: int64}) + + Time complexity: O(dataset size / parallelism) + + Args: + seed: Fix the random seed to use, otherwise one will be chosen + based on system randomness. + num_blocks: The number of output blocks after the shuffle, or None + to retain the number of blocks. + + Returns: + The shuffled dataset. + """ + + plan = self._plan.with_stage( + RandomShuffleStage(seed, num_blocks, ray_remote_args) + ) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = RandomShuffle( + logical_plan.dag, + seed=seed, + num_outputs=num_blocks, + ray_remote_args=ray_remote_args, + ) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def randomize_block_order( + self, + *, + seed: Optional[int] = None, + ) -> "Dataset": + """Randomly shuffle the blocks of this dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> # Randomize the block order. + >>> ds.randomize_block_order() # doctest: +SKIP + >>> # Randomize the block order with a fixed random seed. 
+ >>> ds.randomize_block_order(seed=12345) # doctest: +SKIP + + Args: + seed: Fix the random seed to use, otherwise one will be chosen + based on system randomness. + + Returns: + The block-shuffled dataset. + """ + + plan = self._plan.with_stage(RandomizeBlocksStage(seed)) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = RandomizeBlocks( + logical_plan.dag, + seed=seed, + ) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def random_sample( + self, fraction: float, *, seed: Optional[int] = None + ) -> "Dataset": + """Randomly samples a fraction of the elements of this dataset. + + Note that the exact number of elements returned is not guaranteed, + and that the number of elements being returned is roughly fraction * total_rows. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> ds.random_sample(0.1) # doctest: +SKIP + >>> ds.random_sample(0.2, seed=12345) # doctest: +SKIP + + Args: + fraction: The fraction of elements to sample. + seed: Seeds the python random pRNG generator. + + Returns: + Returns a Dataset containing the sampled elements. 
+ """ + import random + + import pandas as pd + import pyarrow as pa + + if self.num_blocks() == 0: + raise ValueError("Cannot sample from an empty Dataset.") + + if fraction < 0 or fraction > 1: + raise ValueError("Fraction must be between 0 and 1.") + + if seed is not None: + random.seed(seed) + + def process_batch(batch): + if isinstance(batch, list): + return [row for row in batch if random.random() <= fraction] + if isinstance(batch, pa.Table): + # Lets the item pass if weight generated for that item <= fraction + return batch.filter( + pa.array(random.random() <= fraction for _ in range(len(batch))) + ) + if isinstance(batch, pd.DataFrame): + return batch.sample(frac=fraction) + if isinstance(batch, np.ndarray): + return _create_possibly_ragged_ndarray( + [row for row in batch if random.random() <= fraction] + ) + raise ValueError(f"Unsupported batch type: {type(batch)}") + + return self.map_batches(process_batch, batch_format=None) + + @ConsumptionAPI + def streaming_split( + self, + n: int, + *, + equal: bool = False, + locality_hints: Optional[List["NodeIdStr"]] = None, + ) -> List[DataIterator]: + """Returns ``n`` :class:`DataIterators ` that can + be used to read disjoint subsets of the dataset in parallel. + + This method is the recommended way to consume Datasets from multiple + processes (e.g., for distributed training), and requires streaming execution + mode. + + Streaming split works by delegating the execution of this Dataset to a + coordinator actor. The coordinator pulls block references from the executed + stream, and divides those blocks among `n` output iterators. Iterators pull + blocks from the coordinator actor to return to their caller on `next`. + + The returned iterators are also repeatable; each iteration will trigger a + new execution of the Dataset. There is an implicit barrier at the start of + each iteration, which means that `next` must be called on all iterators before + the iteration starts. 
+ + Warning: because iterators are pulling blocks from the same Dataset + execution, if one iterator falls behind other iterators may be stalled. + + Examples: + >>> import ray + >>> ds = ray.data.range(1000000) + >>> it1, it2 = ds.streaming_split(2, equal=True) + + >>> # Can consume from both iterators in parallel. + >>> @ray.remote + ... def consume(it): + ... for batch in it.iter_batches(): + ... print(batch) + >>> ray.get([consume.remote(it1), consume.remote(it2)]) # doctest: +SKIP + + >>> # Can loop over the iterators multiple times (multiple epochs). + >>> @ray.remote + ... def train(it): + ... NUM_EPOCHS = 100 + ... for _ in range(NUM_EPOCHS): + ... for batch in it.iter_batches(): + ... print(batch) + >>> ray.get([train.remote(it1), train.remote(it2)]) # doctest: +SKIP + + >>> # ERROR: this will block waiting for a read on `it2` to start. + >>> ray.get(train.remote(it1)) # doctest: +SKIP + + Args: + n: Number of output iterators to return. + equal: If True, each output iterator will see an exactly equal number + of rows, dropping data if necessary. If False, some iterators may see + slightly more or less rows than other, but no data will be dropped. + locality_hints: Specify the node ids corresponding to each iterator + location. Dataset will try to minimize data movement based on the + iterator output locations. This list must have length ``n``. You can + get the current node id of a task or actor by calling + ``ray.get_runtime_context().get_node_id()``. + + Returns: + The output iterator splits. These iterators are Ray-serializable and can + be freely passed to any Ray task or actor. + """ + return StreamSplitDataIterator.create(self, n, equal, locality_hints) + + @ConsumptionAPI + def split( + self, n: int, *, equal: bool = False, locality_hints: Optional[List[Any]] = None + ) -> List["MaterializedDataset"]: + """Materialize and split the dataset into ``n`` disjoint pieces. 
+ + This returns a list of MaterializedDatasets that can be passed to Ray tasks + and actors and used to read the dataset records in parallel. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> workers = ... # doctest: +SKIP + >>> # Split up a dataset to process over `n` worker actors. + >>> shards = ds.split(len(workers), locality_hints=workers) # doctest: +SKIP + >>> for shard, worker in zip(shards, workers): # doctest: +SKIP + ... worker.consume.remote(shard) # doctest: +SKIP + + Time complexity: O(1) + + See also: ``Dataset.split_at_indices``, ``Dataset.split_proportionately``, + and ``Dataset.streaming_split``. + + Args: + n: Number of child datasets to return. + equal: Whether to guarantee each split has an equal + number of records. This may drop records if they cannot be + divided equally among the splits. + locality_hints: [Experimental] A list of Ray actor handles of size ``n``. + The system will try to co-locate the blocks of the i-th dataset + with the i-th actor to maximize data locality. + + Returns: + A list of ``n`` disjoint dataset splits. + """ + if n <= 0: + raise ValueError(f"The number of splits {n} is not positive.") + + # fallback to split_at_indices for equal split without locality hints. + # simple benchmarks show split_at_indices yields more stable performance. + # https://github.com/ray-project/ray/pull/26641 for more context. + if equal and locality_hints is None: + count = self.count() + split_index = count // n + # we are creating n split_indices which will generate + # n + 1 splits; the last split will at most contain (n - 1) + # rows, which could be safely dropped. + split_indices = [split_index * i for i in range(1, n + 1)] + shards = self.split_at_indices(split_indices) + return shards[:n] + + if locality_hints and len(locality_hints) != n: + raise ValueError( + f"The length of locality_hints {len(locality_hints)} " + f"doesn't equal the number of splits {n}." 
+ ) + # Reject duplicate actor handles among the locality hints. + if len(set(locality_hints)) != len(locality_hints): + raise ValueError( + "locality_hints must not contain duplicate actor handles" + ) + + blocks = self._plan.execute() + owned_by_consumer = blocks._owned_by_consumer + stats = self._plan.stats() + block_refs, metadata = zip(*blocks.get_blocks_with_metadata()) + + if locality_hints is None: + blocks = np.array_split(block_refs, n) + meta = np.array_split(metadata, n) + return [ + MaterializedDataset( + ExecutionPlan( + BlockList( + b.tolist(), m.tolist(), owned_by_consumer=owned_by_consumer + ), + stats, + run_by_consumer=owned_by_consumer, + ), + self._epoch, + self._lazy, + ) + for b, m in zip(blocks, meta) + ] + + metadata_mapping = {b: m for b, m in zip(block_refs, metadata)} + + # If the locality_hints is set, we use a two-round greedy algorithm + # to co-locate the blocks with the actors based on block + # and actor's location (node_id). + # + # The split algorithm tries to allocate equally-sized blocks regardless + # of locality. Thus we first calculate the expected number of blocks + # for each split. + # + # In the first round, for each actor, we look for all blocks that + # match the actor's node_id, then allocate those matched blocks to + # this actor until we reach the limit(expected number). + # + # In the second round: fill each actor's allocation with + # remaining unallocated blocks until we reach the limit. + + def build_allocation_size_map( + num_blocks: int, actors: List[Any] + ) -> Dict[Any, int]: + """Given the total number of blocks and a list of actors, calculate + the expected number of blocks to allocate for each actor. 
+ """ + num_actors = len(actors) + num_blocks_per_actor = num_blocks // num_actors + num_blocks_left = num_blocks - num_blocks_per_actor * n + num_blocks_by_actor = {} + for i, actor in enumerate(actors): + num_blocks_by_actor[actor] = num_blocks_per_actor + if i < num_blocks_left: + num_blocks_by_actor[actor] += 1 + return num_blocks_by_actor + + def build_block_refs_by_node_id( + blocks: List[ObjectRef[Block]], + ) -> Dict[str, List[ObjectRef[Block]]]: + """Build the reverse index from node_id to block_refs. For + simplicity, if the block is stored on multiple nodes we + only pick the first one. + """ + block_ref_locations = ray.experimental.get_object_locations(blocks) + block_refs_by_node_id = collections.defaultdict(list) + for block_ref in blocks: + node_ids = block_ref_locations.get(block_ref, {}).get("node_ids", []) + node_id = node_ids[0] if node_ids else None + block_refs_by_node_id[node_id].append(block_ref) + return block_refs_by_node_id + + def build_node_id_by_actor(actors: List[Any]) -> Dict[Any, str]: + """Build a map from a actor to its node_id.""" + actors_state = ray._private.state.actors() + return { + actor: actors_state.get(actor._actor_id.hex(), {}) + .get("Address", {}) + .get("NodeID") + for actor in actors + } + + # expected number of blocks to be allocated for each actor + expected_block_count_by_actor = build_allocation_size_map( + len(block_refs), locality_hints + ) + # the reverse index from node_id to block_refs + block_refs_by_node_id = build_block_refs_by_node_id(block_refs) + # the map from actor to its node_id + node_id_by_actor = build_node_id_by_actor(locality_hints) + + allocation_per_actor = collections.defaultdict(list) + + # In the first round, for each actor, we look for all blocks that + # match the actor's node_id, then allocate those matched blocks to + # this actor until we reach the limit(expected number) + for actor in locality_hints: + node_id = node_id_by_actor[actor] + matching_blocks = 
block_refs_by_node_id[node_id] + expected_block_count = expected_block_count_by_actor[actor] + allocation = [] + while matching_blocks and len(allocation) < expected_block_count: + allocation.append(matching_blocks.pop()) + allocation_per_actor[actor] = allocation + + # In the second round: fill each actor's allocation with + # remaining unallocated blocks until we reach the limit + remaining_block_refs = list( + itertools.chain.from_iterable(block_refs_by_node_id.values()) + ) + for actor in locality_hints: + while ( + len(allocation_per_actor[actor]) < expected_block_count_by_actor[actor] + ): + allocation_per_actor[actor].append(remaining_block_refs.pop()) + + assert len(remaining_block_refs) == 0, len(remaining_block_refs) + + per_split_block_lists = [ + BlockList( + allocation_per_actor[actor], + [metadata_mapping[b] for b in allocation_per_actor[actor]], + owned_by_consumer=owned_by_consumer, + ) + for actor in locality_hints + ] + + if equal: + # equalize the splits + per_split_block_lists = _equalize(per_split_block_lists, owned_by_consumer) + + return [ + MaterializedDataset( + ExecutionPlan( + block_split, + stats, + run_by_consumer=owned_by_consumer, + ), + self._epoch, + self._lazy, + ) + for block_split in per_split_block_lists + ] + + @ConsumptionAPI + def split_at_indices(self, indices: List[int]) -> List["MaterializedDataset"]: + """Materialize and split the dataset at the given indices (like np.split). + + Examples: + >>> import ray + >>> ds = ray.data.range(10) + >>> d1, d2, d3 = ds.split_at_indices([2, 5]) + >>> d1.take_batch() + {'id': array([0, 1])} + >>> d2.take_batch() + {'id': array([2, 3, 4])} + >>> d3.take_batch() + {'id': array([5, 6, 7, 8, 9])} + + Time complexity: O(num splits) + + See also: ``Dataset.split_at_indices``, ``Dataset.split_proportionately``, + and ``Dataset.streaming_split``. + + Args: + indices: List of sorted integers which indicate where the dataset + will be split. 
If an index exceeds the length of the dataset, + an empty dataset will be returned. + + Returns: + The dataset splits. + """ + + if len(indices) < 1: + raise ValueError("indices must be at least of length 1") + if sorted(indices) != indices: + raise ValueError("indices must be sorted") + if indices[0] < 0: + raise ValueError("indices must be positive") + start_time = time.perf_counter() + block_list = self._plan.execute() + blocks, metadata = _split_at_indices( + block_list.get_blocks_with_metadata(), + indices, + block_list._owned_by_consumer, + ) + split_duration = time.perf_counter() - start_time + parent_stats = self._plan.stats() + splits = [] + for bs, ms in zip(blocks, metadata): + stats = DatasetStats(stages={"Split": ms}, parent=parent_stats) + stats.time_total_s = split_duration + splits.append( + MaterializedDataset( + ExecutionPlan( + BlockList( + bs, ms, owned_by_consumer=block_list._owned_by_consumer + ), + stats, + run_by_consumer=block_list._owned_by_consumer, + ), + self._epoch, + self._lazy, + ) + ) + return splits + + @ConsumptionAPI + def split_proportionately( + self, proportions: List[float] + ) -> List["MaterializedDataset"]: + """Materialize and split the dataset using proportions. + + A common use case for this would be splitting the dataset into train + and test sets (equivalent to e.g. scikit-learn's ``train_test_split``). + See also ``Dataset.train_test_split`` for a higher level abstraction. + + The indices to split at will be calculated in such a way so that all splits + always contain at least one element. If that is not possible, + an exception will be raised. + + This is equivalent to calculating the indices manually and calling + ``Dataset.split_at_indices``. 
+ + Examples: + >>> import ray + >>> ds = ray.data.range(10) + >>> d1, d2, d3 = ds.split_proportionately([0.2, 0.5]) + >>> d1.take_batch() + {'id': array([0, 1])} + >>> d2.take_batch() + {'id': array([2, 3, 4, 5, 6])} + >>> d3.take_batch() + {'id': array([7, 8, 9])} + + Time complexity: O(num splits) + + See also: ``Dataset.split``, ``Dataset.split_at_indices``, + ``Dataset.train_test_split`` + + Args: + proportions: List of proportions to split the dataset according to. + Must sum up to less than 1, and each proportion has to be bigger + than 0. + + Returns: + The dataset splits. + """ + + if len(proportions) < 1: + raise ValueError("proportions must be at least of length 1") + if sum(proportions) >= 1: + raise ValueError("proportions must sum to less than 1") + if any(p <= 0 for p in proportions): + raise ValueError("proportions must be bigger than 0") + + dataset_length = self.count() + cumulative_proportions = np.cumsum(proportions) + split_indices = [ + int(dataset_length * proportion) for proportion in cumulative_proportions + ] + + # Ensure each split has at least one element + subtract = 0 + for i in range(len(split_indices) - 2, -1, -1): + split_indices[i] -= subtract + if split_indices[i] == split_indices[i + 1]: + subtract += 1 + split_indices[i] -= 1 + if any(i <= 0 for i in split_indices): + raise ValueError( + "Couldn't create non-empty splits with the given proportions." + ) + + return self.split_at_indices(split_indices) + + @ConsumptionAPI + def train_test_split( + self, + test_size: Union[int, float], + *, + shuffle: bool = False, + seed: Optional[int] = None, + ) -> Tuple["MaterializedDataset", "MaterializedDataset"]: + """Materialize and split the dataset into train and test subsets. 
@ConsumptionAPI(pattern="Args:")
def union(self, *other: List["Dataset"]) -> "Dataset":
    """Materialize and combine this dataset with others of the same type.

    The order of the blocks in the datasets is preserved, as is the
    relative ordering between the datasets passed in the argument list.

    .. note::
        Unioned datasets are not lineage-serializable, i.e. they can not be
        used as a tunable hyperparameter in Ray Tune.

    Args:
        other: List of datasets to combine with this one. The datasets
            must have the same schema as this dataset, otherwise the
            behavior is undefined.

    Returns:
        A new dataset holding the union of their data.
    """

    start_time = time.perf_counter()

    owned_by_consumer = self._plan.execute()._owned_by_consumer
    datasets = [self] + list(other)
    bls = []
    has_nonlazy = False
    for ds in datasets:
        bl = ds._plan.execute()
        if not isinstance(bl, LazyBlockList):
            has_nonlazy = True
        bls.append(bl)
    if has_nonlazy:
        # At least one input is already materialized: fall back to an eager
        # BlockList holding every block from every input.
        blocks = []
        metadata = []
        for bl in bls:
            if isinstance(bl, LazyBlockList):
                bs, ms = bl._get_blocks_with_metadata()
            else:
                bs, ms = bl._blocks, bl._metadata
            blocks.extend(bs)
            metadata.extend(ms)
        blocklist = BlockList(blocks, metadata, owned_by_consumer=owned_by_consumer)
    else:
        # All inputs are lazy: concatenate their read tasks so the union
        # stays lazy as well.
        tasks: List[ReadTask] = []
        block_partition_refs: List[ObjectRef[BlockPartition]] = []
        block_partition_meta_refs: List[ObjectRef[BlockMetadata]] = []

        # Gather read task names from input blocks of unioned Datasets,
        # and concat them before passing to resulting LazyBlockList
        read_task_names = []
        self_read_name = self._plan._in_blocks._read_stage_name or "Read"
        read_task_names.append(self_read_name)
        other_read_names = [
            o._plan._in_blocks._read_stage_name or "Read" for o in other
        ]
        read_task_names.extend(other_read_names)

        for bl in bls:
            tasks.extend(bl._tasks)
            block_partition_refs.extend(bl._block_partition_refs)
            block_partition_meta_refs.extend(bl._block_partition_meta_refs)
        blocklist = LazyBlockList(
            tasks,
            f"Union({','.join(read_task_names)})",
            block_partition_refs,
            block_partition_meta_refs,
            owned_by_consumer=owned_by_consumer,
        )

    epochs = [ds._get_epoch() for ds in datasets]
    # Use max(epochs), not max(*epochs): when `other` is empty, `epochs`
    # has a single int element and max(*epochs) would raise TypeError.
    max_epoch = max(epochs)
    if len(set(epochs)) > 1:
        if ray.util.log_once("dataset_epoch_warned"):
            logger.warning(
                "Dataset contains data from multiple epochs: {}, "
                "likely due to a `rewindow()` call. The higher epoch "
                "number {} will be used. This warning will not "
                "be shown again.".format(set(epochs), max_epoch)
            )
    stats = DatasetStats(
        stages={"Union": []},
        parent=[d._plan.stats() for d in datasets],
    )
    stats.time_total_s = time.perf_counter() - start_time
    return Dataset(
        ExecutionPlan(blocklist, stats, run_by_consumer=owned_by_consumer),
        max_epoch,
        self._lazy,
    )
+ If the input dataset is an Arrow dataset then the output is + an dict where each column is the corresponding aggregation result. + If the dataset is empty, return ``None``. + """ + ret = self.groupby(None).aggregate(*aggs).take(1) + return ret[0] if len(ret) > 0 else None + + @ConsumptionAPI + def sum( + self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True + ) -> Union[Any, Dict[str, Any]]: + """Compute sum over entire dataset. + + Examples: + >>> import ray + >>> ray.data.range(100).sum("id") + 4950 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... for i in range(100)]).sum(["A", "B"]) + {'sum(A)': 4950, 'sum(B)': 328350} + + Args: + on: a column name or a list of column names to aggregate. + ignore_nulls: Whether to ignore null values. If ``True``, null + values will be ignored when computing the sum; if ``False``, + if a null value is encountered, the output will be None. + We consider np.nan, None, and pd.NaT to be null values. + Default is ``True``. + + Returns: + The sum result. + + For different values of ``on``, the return varies: + + - ``on=None``: a dict containing the column-wise sum of all + columns, + - ``on="col"``: a scalar representing the sum of all items in + column ``"col"``, + - ``on=["col_1", ..., "col_n"]``: an n-column ``dict`` + containing the column-wise sum of the provided columns. + + If the dataset is empty, all values are null, or any value is null + AND ``ignore_nulls`` is ``False``, then the output will be None. + """ + ret = self._aggregate_on(Sum, on, ignore_nulls) + return self._aggregate_result(ret) + + @ConsumptionAPI + def min( + self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True + ) -> Union[Any, Dict[str, Any]]: + """Compute minimum over entire dataset. + + Examples: + >>> import ray + >>> ray.data.range(100).min("id") + 0 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... 
@ConsumptionAPI
def max(
    self, on: Optional[Union[str, List[str]]] = None, ignore_nulls: bool = True
) -> Union[Any, Dict[str, Any]]:
    """Compute maximum over entire dataset.

    Examples:
        >>> import ray
        >>> ray.data.range(100).max("id")
        99
        >>> ray.data.from_items([
        ...     {"A": i, "B": i**2}
        ...     for i in range(100)]).max(["A", "B"])
        {'max(A)': 99, 'max(B)': 9801}

    Args:
        on: a column name or a list of column names to aggregate.
        ignore_nulls: Whether to ignore null values. If ``True``, null
            values will be ignored when computing the max; if ``False``,
            if a null value is encountered, the output will be None.
            We consider np.nan, None, and pd.NaT to be null values.
            Default is ``True``.

    Returns:
        The max result.

        For different values of ``on``, the return varies:

        - ``on=None``: a dict containing the column-wise max of
          all columns,
        - ``on="col"``: a scalar representing the max of all items in
          column ``"col"``,
        - ``on=["col_1", ..., "col_n"]``: an n-column dict
          containing the column-wise max of the provided columns.

        If the dataset is empty, all values are null, or any value is null
        AND ``ignore_nulls`` is ``False``, then the output will be None.
    """
    # Run the generic Max aggregation and unwrap its result in one step.
    return self._aggregate_result(self._aggregate_on(Max, on, ignore_nulls))
+ """ + ret = self._aggregate_on(Mean, on, ignore_nulls) + return self._aggregate_result(ret) + + @ConsumptionAPI + def std( + self, + on: Optional[Union[str, List[str]]] = None, + ddof: int = 1, + ignore_nulls: bool = True, + ) -> Union[Any, Dict[str, Any]]: + """Compute standard deviation over entire dataset. + + Examples: + >>> import ray + >>> round(ray.data.range(100).std("id", ddof=0), 5) + 28.86607 + >>> ray.data.from_items([ + ... {"A": i, "B": i**2} + ... for i in range(100)]).std(["A", "B"]) + {'std(A)': 29.011491975882016, 'std(B)': 2968.1748039269296} + + .. note:: This uses Welford's online method for an accumulator-style computation + of the standard deviation. This method was chosen due to it's numerical + stability, and it being computable in a single pass. This may give different + (but more accurate) results than NumPy, Pandas, and sklearn, which use a + less numerically stable two-pass algorithm. + See + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + + Args: + on: a column name or a list of column names to aggregate. + ddof: Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + ignore_nulls: Whether to ignore null values. If ``True``, null + values will be ignored when computing the std; if ``False``, + if a null value is encountered, the output will be None. + We consider np.nan, None, and pd.NaT to be null values. + Default is ``True``. + + Returns: + The standard deviation result. + + For different values of ``on``, the return varies: + + - ``on=None``: an dict containing the column-wise std of + all columns, + - ``on="col"``: a scalar representing the std of all items in + column ``"col"``, + - ``on=["col_1", ..., "col_n"]``: an n-column dict + containing the column-wise std of the provided columns. 
+ + If the dataset is empty, all values are null, or any value is null + AND ``ignore_nulls`` is ``False``, then the output will be None. + """ + ret = self._aggregate_on(Std, on, ignore_nulls, ddof=ddof) + return self._aggregate_result(ret) + + def sort(self, key: Optional[str] = None, descending: bool = False) -> "Dataset": + """Sort the dataset by the specified key column or key function. + + Examples: + >>> import ray + >>> # Sort by a single column in descending order. + >>> ds = ray.data.from_items( + ... [{"value": i} for i in range(1000)]) + >>> ds.sort("value", descending=True) + Sort + +- Dataset(num_blocks=200, num_rows=1000, schema={value: int64}) + + Time complexity: O(dataset size * log(dataset size / parallelism)) + + Args: + key: The column to sort by. To sort by multiple columns, use a map function + to generate the sort column beforehand. + descending: Whether to sort in descending order. + + Returns: + A new, sorted dataset. + """ + + plan = self._plan.with_stage(SortStage(self, key, descending)) + + logical_plan = self._logical_plan + if logical_plan is not None: + op = Sort( + logical_plan.dag, + key=key, + descending=descending, + ) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + def zip(self, other: "Dataset") -> "Dataset": + """Materialize and zip this dataset with the elements of another. + + The datasets must have the same number of rows. Their column sets will be + merged, and any duplicate column names disambiguated with _1, _2, etc. suffixes. + + .. note:: + The smaller of the two datasets will be repartitioned to align the number + of rows per block with the larger dataset. + + .. note:: + Zipped datasets are not lineage-serializable, i.e. they can not be used + as a tunable hyperparameter in Ray Tune. 
+ + Examples: + >>> import ray + >>> ds1 = ray.data.range(5) + >>> ds2 = ray.data.range(5) + >>> ds1.zip(ds2).take_batch() + {'id': array([0, 1, 2, 3, 4]), 'id_1': array([0, 1, 2, 3, 4])} + + Time complexity: O(dataset size / parallelism) + + Args: + other: The dataset to zip with on the right hand side. + + Returns: + A ``Dataset`` containing the columns of the second dataset + concatenated horizontally with the columns of the first dataset, + with duplicate column names disambiguated with _1, _2, etc. suffixes. + """ + + plan = self._plan.with_stage(ZipStage(other)) + + logical_plan = self._logical_plan + other_logical_plan = other._logical_plan + if logical_plan is not None and other_logical_plan is not None: + op = Zip(logical_plan.dag, other_logical_plan.dag) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + @ConsumptionAPI + def limit(self, limit: int) -> "Dataset": + """Materialize and truncate the dataset to the first ``limit`` records. + + Contrary to :meth`.take`, this will not move any data to the caller's + machine. Instead, it will return a new ``Dataset`` pointing to the truncated + distributed data. + + Examples: + >>> import ray + >>> ds = ray.data.range(1000) + >>> ds.limit(5).take_batch() + {'id': array([0, 1, 2, 3, 4])} + + Time complexity: O(limit specified) + + Args: + limit: The size of the dataset to truncate to. + + Returns: + The truncated dataset. + """ + plan = self._plan.with_stage(LimitStage(limit)) + logical_plan = self._logical_plan + if logical_plan is not None: + op = Limit(logical_plan.dag, limit=limit) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) + + @ConsumptionAPI(pattern="Time complexity:") + def take_batch( + self, batch_size: int = 20, *, batch_format: Optional[str] = "default" + ) -> DataBatch: + """Return up to ``batch_size`` records from the dataset in a batch. 
@ConsumptionAPI(pattern="Time complexity:")
def take(self, limit: int = 20) -> List[Dict[str, Any]]:
    """Return up to ``limit`` records from the dataset.

    This will move up to ``limit`` records to the caller's machine; if
    ``limit`` is very large, this can result in an OutOfMemory crash on
    the caller.

    Time complexity: O(limit specified)

    Args:
        limit: The max number of records to return.

    Returns:
        A list of up to ``limit`` records from the dataset.
    """
    # One-time tip pointing users at the batch-oriented alternative.
    if ray.util.log_once("dataset_take"):
        logger.info(
            "Tip: Use `take_batch()` instead of `take() / show()` to return "
            "records in pandas or numpy batch format."
        )
    rows = []
    for record in self.iter_rows():
        rows.append(record)
        # Stop iterating as soon as we have collected enough rows.
        if len(rows) >= limit:
            break
    self._synchronize_progress_bar()
    return rows
+ meta_count = self._meta_count() + if meta_count is not None: + return meta_count + + get_num_rows = cached_remote_fn(_get_num_rows) + + return sum( + ray.get( + [get_num_rows.remote(block) for block in self.get_internal_block_refs()] + ) + ) + + @ConsumptionAPI( + if_more_than_read=True, + datasource_metadata="schema", + extra_condition="or if ``fetch_if_missing=True`` (the default)", + pattern="Time complexity:", + ) + def schema(self, fetch_if_missing: bool = True) -> Optional["Schema"]: + """Return the schema of the dataset. + + Time complexity: O(1) + + Args: + fetch_if_missing: If True, synchronously fetch the schema if it's + not known. If False, None is returned if the schema is not known. + Default is True. + + Returns: + The ``ray.data.Schema`` class of the records, or None if the + schema is not known and fetch_if_missing is False. + """ + ctx = DataContext.get_current() + base_schema = self._plan.schema(fetch_if_missing=fetch_if_missing) + if ctx.strict_mode: + if base_schema: + return Schema(base_schema) + else: + return None + else: + return base_schema + + @ConsumptionAPI( + if_more_than_read=True, + datasource_metadata="schema", + extra_condition="or if ``fetch_if_missing=True`` (the default)", + pattern="Time complexity:", + ) + def columns(self, fetch_if_missing: bool = True) -> Optional[List[str]]: + """Returns the columns of this Dataset. + + Time complexity: O(1) + + Example: + >>> import ray + >>> # Create dataset from synthetic data. + >>> ds = ray.data.range(1000) + >>> ds.columns() + ['id'] + + Args: + fetch_if_missing: If True, synchronously fetch the column names from the + schema if it's not known. If False, None is returned if the schema is + not known. Default is True. + + Returns: + A list of the column names for this Dataset or None if schema is not known + and `fetch_if_missing` is False. 
+ + """ + schema = self.schema(fetch_if_missing=fetch_if_missing) + if schema is not None: + return schema.names + return None + + def num_blocks(self) -> int: + """Return the number of blocks of this dataset. + + Note that during read and transform operations, the number of blocks + may be dynamically adjusted to respect memory limits, increasing the + number of blocks at runtime. + + Time complexity: O(1) + + Returns: + The number of blocks of this dataset. + """ + return self._plan.initial_num_blocks() + + @ConsumptionAPI(if_more_than_read=True, pattern="Time complexity:") + def size_bytes(self) -> int: + """Return the in-memory size of the dataset. + + Time complexity: O(1) + + Returns: + The in-memory size of the dataset in bytes, or None if the + in-memory size is not known. + """ + metadata = self._plan.execute().get_metadata() + if not metadata or metadata[0].size_bytes is None: + return None + return sum(m.size_bytes for m in metadata) + + @ConsumptionAPI(if_more_than_read=True, pattern="Time complexity:") + def input_files(self) -> List[str]: + """Return the list of input files for the dataset. + + Time complexity: O(num input files) + + Returns: + The list of input files used to create the dataset, or an empty + list if the input files is not known. + """ + metadata = self._plan.execute().get_metadata() + files = set() + for m in metadata: + for f in m.input_files: + files.add(f) + return list(files) + + @ConsumptionAPI + def write_parquet( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, + ray_remote_args: Dict[str, Any] = None, + **arrow_parquet_args, + ) -> None: + """Write the dataset to parquet. + + This is only supported for datasets convertible to Arrow records. 
+ To control the number of files, use ``.repartition()``. + + Unless a custom block path provider is given, the format of the output + files will be {uuid}_{block_idx}.parquet, where ``uuid`` is an unique + id for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> ds.write_parquet("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where Parquet + files will be written to. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + arrow_parquet_args_fn: Callable that returns a dictionary of write + arguments to use when writing each block to a file. Overrides + any duplicate keys from arrow_parquet_args. This should be used + instead of arrow_parquet_args if any of your write arguments + cannot be pickled, or if you'd like to lazily resolve the write + arguments for each dataset block. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + arrow_parquet_args: Options to pass to + pyarrow.parquet.write_table(), which is used to write out each + block to a file. 
+ """ + self.write_datasource( + ParquetDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + write_args_fn=arrow_parquet_args_fn, + **arrow_parquet_args, + ) + + @ConsumptionAPI + def write_json( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + pandas_json_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, + ray_remote_args: Dict[str, Any] = None, + **pandas_json_args, + ) -> None: + """Write the dataset to json. + + This is only supported for datasets convertible to Arrow records. + To control the number of files, use ``.repartition()``. + + Unless a custom block path provider is given, the format of the output + files will be {self._uuid}_{block_idx}.json, where ``uuid`` is an + unique id for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> ds.write_json("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where json + files will be written to. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + pandas_json_args_fn: Callable that returns a dictionary of write + arguments to use when writing each block to a file. Overrides + any duplicate keys from pandas_json_args. 
This should be used + instead of pandas_json_args if any of your write arguments + cannot be pickled, or if you'd like to lazily resolve the write + arguments for each dataset block. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + pandas_json_args: These args will be passed to + pandas.DataFrame.to_json(), which we use under the hood to + write out each Dataset block. These + are dict(orient="records", lines=True) by default. + """ + self.write_datasource( + JSONDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + write_args_fn=pandas_json_args_fn, + **pandas_json_args, + ) + + @ConsumptionAPI + def write_csv( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + arrow_csv_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, + ray_remote_args: Dict[str, Any] = None, + **arrow_csv_args, + ) -> None: + """Write the dataset to csv. + + This is only supported for datasets convertible to Arrow records. + To control the number of files, use ``.repartition()``. + + Unless a custom block path provider is given, the format of the output + files will be {uuid}_{block_idx}.csv, where ``uuid`` is an unique id + for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> ds.write_csv("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where csv + files will be written to. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. 
Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + arrow_csv_args_fn: Callable that returns a dictionary of write + arguments to use when writing each block to a file. Overrides + any duplicate keys from arrow_csv_args. This should be used + instead of arrow_csv_args if any of your write arguments + cannot be pickled, or if you'd like to lazily resolve the write + arguments for each dataset block. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + arrow_csv_args: Other CSV write options to pass to pyarrow. + """ + self.write_datasource( + CSVDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + write_args_fn=arrow_csv_args_fn, + **arrow_csv_args, + ) + + @ConsumptionAPI + def write_tfrecords( + self, + path: str, + *, + tf_schema: Optional["schema_pb2.Schema"] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + ray_remote_args: Dict[str, Any] = None, + ) -> None: + """Write the dataset to TFRecord files. + + The `TFRecord `_ + files will contain + `tf.train.Example `_ # noqa: E501 + records, with one Example record for each row in the dataset. + + .. warning:: + tf.train.Feature only natively stores ints, floats, and bytes, + so this function only supports datasets with these data types, + and will error if the dataset contains unsupported types. + + This is only supported for datasets convertible to Arrow records. + To control the number of files, use ``.repartition()``. 
+ + Unless a custom block path provider is given, the format of the output + files will be {uuid}_{block_idx}.tfrecords, where ``uuid`` is an unique id + for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.from_items([ + ... { "name": "foo", "score": 42 }, + ... { "name": "bar", "score": 43 }, + ... ]) + >>> ds.write_tfrecords("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where tfrecords + files will be written to. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + + """ + + self.write_datasource( + TFRecordDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + tf_schema=tf_schema, + ) + + @PublicAPI(stability="alpha") + @ConsumptionAPI + def write_webdataset( + self, + path: str, + *, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + ray_remote_args: Dict[str, Any] = None, + encoder: Optional[Union[bool, str, callable, list]] = True, + ) -> None: + """Write the dataset to WebDataset files. + + The `WebDataset `_ + files will be tar archives, containing + one sample for each row in the dataset. + + .. 
warning:: + This is an experimental API, and the output format + may change in future releases. + + This is only supported for datasets convertible to Arrow records. + To control the number of files, use ``.repartition()``. + + Unless a custom block path provider is given, the format of the output + files will be {uuid}_{block_idx}.tar, where ``uuid`` is an unique id + for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.from_items([ + ... { "name": "foo", "score": 42 }, + ... { "name": "bar", "score": 43 }, + ... ]) + >>> ds.write_webdataset("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where tar + files will be written to. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. 
+ + """ + + from ray.data.datasource.webdataset_datasource import WebDatasetDatasource + + self.write_datasource( + WebDatasetDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + encoder=encoder, + ) + + @ConsumptionAPI + def write_numpy( + self, + path: str, + *, + column: Optional[str] = None, + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + try_create_dir: bool = True, + arrow_open_stream_args: Optional[Dict[str, Any]] = None, + block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), + ray_remote_args: Dict[str, Any] = None, + ) -> None: + """Write a tensor column of the dataset to npy files. + + This is only supported for datasets convertible to Arrow records that + contain a TensorArray column. To control the number of files, use + ``.repartition()``. + + Unless a custom block path provider is given, the format of the output + files will be {self._uuid}_{block_idx}.npy, where ``uuid`` is an unique + id for the dataset. + + Examples: + >>> import ray + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> ds.write_numpy("s3://bucket/path") # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + path: The path to the destination root directory, where npy + files will be written to. + column: The name of the table column that contains the tensor to + be written. + filesystem: The filesystem implementation to write to. + try_create_dir: Try to create all directories in destination path + if True. Does nothing if all directories already exist. + arrow_open_stream_args: kwargs passed to + pyarrow.fs.FileSystem.open_output_stream + block_path_provider: BlockWritePathProvider implementation to + write each dataset block to a custom output path. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. 
+ """ + context = DataContext.get_current() + if context.strict_mode and not column: + raise StrictModeError( + "In Ray 2.5, the column must be specified " + "(e.g., `write_numpy(column='data')`)." + ) + column = column or TENSOR_COLUMN_NAME + + self.write_datasource( + NumpyDatasource(), + ray_remote_args=ray_remote_args, + path=path, + dataset_uuid=self._uuid, + column=column, + filesystem=filesystem, + try_create_dir=try_create_dir, + open_stream_args=arrow_open_stream_args, + block_path_provider=block_path_provider, + ) + + @ConsumptionAPI + def write_mongo( + self, + uri: str, + database: str, + collection: str, + ray_remote_args: Dict[str, Any] = None, + ) -> None: + """Write the dataset to a MongoDB datasource. + + This is only supported for datasets convertible to Arrow records. + To control the number of parallel write tasks, use ``.repartition()`` + before calling this method. + + .. note:: + Currently, this supports only a subset of the pyarrow's types, due to the + limitation of pymongoarrow which is used underneath. Writing unsupported + types will fail on type checking. See all the supported types at: + https://mongo-arrow.readthedocs.io/en/latest/data_types.html. + + .. note:: + The records will be inserted into MongoDB as new documents. If a record has + the _id field, this _id must be non-existent in MongoDB, otherwise the write + will be rejected and fail (hence preexisting documents are protected from + being mutated). It's fine to not have _id field in record and MongoDB will + auto generate one at insertion. 
+ + Examples: + >>> import ray + >>> import pandas as pd + >>> docs = [{"title": "MongoDB Datasource test"} for key in range(4)] + >>> ds = ray.data.from_pandas(pd.DataFrame(docs)) + >>> ds.write_mongo( # doctest: +SKIP + >>> MongoDatasource(), # doctest: +SKIP + >>> uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", # noqa: E501 # doctest: +SKIP + >>> database="my_db", # doctest: +SKIP + >>> collection="my_collection", # doctest: +SKIP + >>> ) # doctest: +SKIP + + Args: + uri: The URI to the destination MongoDB where the dataset will be + written to. For the URI format, see details in + https://www.mongodb.com/docs/manual/reference/connection-string/. + database: The name of the database. This database must exist otherwise + ValueError will be raised. + collection: The name of the collection in the database. This collection + must exist otherwise ValueError will be raised. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + """ + from ray.data.datasource import MongoDatasource + + self.write_datasource( + MongoDatasource(), + ray_remote_args=ray_remote_args, + uri=uri, + database=database, + collection=collection, + ) + + @ConsumptionAPI + def write_datasource( + self, + datasource: Datasource, + *, + ray_remote_args: Dict[str, Any] = None, + **write_args, + ) -> None: + """Write the dataset to a custom datasource. + + Examples: + >>> import ray + >>> from ray.data.datasource import Datasource + >>> ds = ray.data.range(100) # doctest: +SKIP + >>> class CustomDatasource(Datasource): # doctest: +SKIP + ... # define custom data source + ... pass # doctest: +SKIP + >>> ds.write_datasource(CustomDatasource(...)) # doctest: +SKIP + + Time complexity: O(dataset size / parallelism) + + Args: + datasource: The datasource to write to. + ray_remote_args: Kwargs passed to ray.remote in the write tasks. + write_args: Additional write args to pass to the datasource. 
+ """ + if ray_remote_args is None: + ray_remote_args = {} + path = write_args.get("path", None) + if path and _is_local_scheme(path): + if ray.util.client.ray.is_connected(): + raise ValueError( + f"The local scheme paths {path} are not supported in Ray Client." + ) + ray_remote_args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( + ray.get_runtime_context().get_node_id(), + soft=False, + ) + + if type(datasource).write != Datasource.write: + write_fn = generate_write_fn(datasource, **write_args) + + def write_fn_wrapper(blocks: Iterator[Block], ctx, fn) -> Iterator[Block]: + return write_fn(blocks, ctx) + + plan = self._plan.with_stage( + OneToOneStage( + "Write", + write_fn_wrapper, + TaskPoolStrategy(), + ray_remote_args, + fn=lambda x: x, + ) + ) + + logical_plan = self._logical_plan + if logical_plan is not None: + write_op = Write( + logical_plan.dag, + datasource, + ray_remote_args=ray_remote_args, + **write_args, + ) + logical_plan = LogicalPlan(write_op) + + try: + import pandas as pd + + self._write_ds = Dataset( + plan, self._epoch, self._lazy, logical_plan + ).materialize() + blocks = ray.get(self._write_ds._plan.execute().get_blocks()) + assert all( + isinstance(block, pd.DataFrame) and len(block) == 1 + for block in blocks + ) + write_results = [block["write_result"][0] for block in blocks] + datasource.on_write_complete(write_results) + except Exception as e: + datasource.on_write_failed([], e) + raise + else: + logger.warning( + "The Datasource.do_write() is deprecated in " + "Ray 2.4 and will be removed in future release. Use " + "Datasource.write() instead." + ) + + ctx = DataContext.get_current() + blocks, metadata = zip(*self._plan.execute().get_blocks_with_metadata()) + # Prepare write in a remote task so that in Ray client mode, we + # don't do metadata resolution from the client machine. 
+ do_write = cached_remote_fn(_do_write, retry_exceptions=False, num_cpus=0) + write_results: List[ObjectRef[WriteResult]] = ray.get( + do_write.remote( + datasource, + ctx, + blocks, + metadata, + ray_remote_args, + _wrap_arrow_serialization_workaround(write_args), + ) + ) + + progress = ProgressBar("Write Progress", len(write_results)) + try: + progress.block_until_complete(write_results) + datasource.on_write_complete(ray.get(write_results)) + except Exception as e: + datasource.on_write_failed(write_results, e) + raise + finally: + progress.close() + + @ConsumptionAPI( + delegate=( + "Calling any of the consumption methods on the returned ``DataIterator``" + ) + ) + def iterator(self) -> DataIterator: + """Return a :class:`~ray.data.DataIterator` that + can be used to repeatedly iterate over the dataset. + + Examples: + >>> import ray + >>> for batch in ray.data.range( + ... 1000000 + ... ).iterator().iter_batches(): # doctest: +SKIP + ... print(batch) # doctest: +SKIP + + .. note:: + It is recommended to use ``DataIterator`` methods over directly + calling methods such as ``iter_batches()``. + """ + return DataIteratorImpl(self) + + @ConsumptionAPI + def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Dict[str, Any]]: + """Return a local row iterator over the dataset. + + Examples: + >>> import ray + >>> for i in ray.data.range(1000000).iter_rows(): # doctest: +SKIP + ... print(i) # doctest: +SKIP + + Time complexity: O(1) + + Args: + prefetch_blocks: The number of blocks to prefetch ahead of the + current block during the scan. + + Returns: + A local iterator over the entire dataset. 
+ """ + + return self.iterator().iter_rows(prefetch_blocks=prefetch_blocks) + + @ConsumptionAPI + def iter_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + batch_format: Optional[str] = "default", + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + _collate_fn: Optional[Callable[[DataBatch], Any]] = None, + # Deprecated. + prefetch_blocks: int = 0, + ) -> Iterator[DataBatch]: + """Return a local batched iterator over the dataset. + + Examples: + >>> import ray + >>> for batch in ray.data.range(1000000).iter_batches(): # doctest: +SKIP + ... print(batch) # doctest: +SKIP + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool will be used + to fetch the objects to the local node, format the batches, and apply + the collate_fn. Defaults to 1. You can revert back to the old + prefetching behavior that uses `prefetch_blocks` by setting + `use_legacy_iter_batches` to True in the datasetContext. + batch_size: The number of rows in each batch, or None to use entire blocks + as batches (blocks may contain different number of rows). + The final batch may include fewer than ``batch_size`` rows if + ``drop_last`` is ``False``. Defaults to 256. + batch_format: Specify ``"default"`` to use the default block format + (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to + select ``pyarrow.Table``, or ``"numpy"`` to select + ``Dict[str, numpy.ndarray]``, or None to return the underlying block + exactly as is with no additional formatting. + drop_last: Whether to drop the last batch if it's incomplete. 
+ local_shuffle_buffer_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer will be drained. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + An iterator over record batches. + """ + batch_format = _apply_strict_mode_batch_format(batch_format) + if batch_format == "native": + logger.warning("The 'native' batch format has been renamed 'default'.") + return self.iterator().iter_batches( + prefetch_batches=prefetch_batches, + prefetch_blocks=prefetch_blocks, + batch_size=batch_size, + batch_format=batch_format, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + _collate_fn=_collate_fn, + ) + + @ConsumptionAPI + def iter_torch_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + dtypes: Optional[Union["torch.dtype", Dict[str, "torch.dtype"]]] = None, + device: Optional[str] = None, + collate_fn: Optional[ + Callable[[Union[np.ndarray, Dict[str, np.ndarray]]], Any] + ] = None, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + # Deprecated + prefetch_blocks: int = 0, + ) -> Iterator["TorchTensorBatchType"]: + """Return a local batched iterator of Torch Tensors over the dataset. + + This iterator will yield single-tensor batches if the underlying dataset + consists of a single column; otherwise, it will yield a dictionary of + column-tensors. If looking for more flexibility in the tensor conversion (e.g. + casting dtypes) or the batch format, try use `.iter_batches` directly, which is + a lower-level API. + + Examples: + >>> import ray + >>> for batch in ray.data.range( # doctest: +SKIP + ... 12, + ... 
).iter_torch_batches(batch_size=4): + ... print(batch.shape) # doctest: +SKIP + torch.Size([4, 1]) + torch.Size([4, 1]) + torch.Size([4, 1]) + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool will be used + to fetch the objects to the local node, format the batches, and apply + the collate_fn. Defaults to 1. You can revert back to the old + prefetching behavior that uses `prefetch_blocks` by setting + `use_legacy_iter_batches` to True in the datasetContext. + batch_size: The number of rows in each batch, or None to use entire blocks + as batches (blocks may contain different number of rows). + The final batch may include fewer than ``batch_size`` rows if + ``drop_last`` is ``False``. Defaults to 256. + dtypes: The Torch dtype(s) for the created tensor(s); if None, the dtype + will be inferred from the tensor data. + device: The device on which the tensor should be placed; if None, the Torch + tensor will be constructed on the CPU. + collate_fn: A function to convert a Numpy batch to a PyTorch tensor batch. + Potential use cases include collating along a dimension other than the + first, padding sequences of various lengths, or generally handling + batches of different length tensors. If not provided, the default + collate function is used which simply converts the batch of numpy + arrays to a batch of PyTorch tensors. This API is still experimental + and is subject to change. + drop_last: Whether to drop the last batch if it's incomplete. + local_shuffle_buffer_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer will be drained. 
This + buffer size must be greater than or equal to ``batch_size``, and + therefore ``batch_size`` must also be specified when using local + shuffling. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + An iterator over Torch Tensor batches. + """ + return self.iterator().iter_torch_batches( + prefetch_batches=prefetch_batches, + prefetch_blocks=prefetch_blocks, + batch_size=batch_size, + dtypes=dtypes, + device=device, + collate_fn=collate_fn, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + ) + + @ConsumptionAPI + def iter_tf_batches( + self, + *, + prefetch_batches: int = 1, + batch_size: Optional[int] = 256, + dtypes: Optional[Union["tf.dtypes.DType", Dict[str, "tf.dtypes.DType"]]] = None, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + # Deprecated + prefetch_blocks: int = 0, + ) -> Iterator[TensorFlowTensorBatchType]: + """Return a local batched iterator of TensorFlow Tensors over the dataset. + + This iterator will yield single-tensor batches if the underlying dataset + consists of a single column; otherwise, it will yield a dictionary of + column-tensors. + + .. tip:: + If you don't need the additional flexibility provided by this method, + consider using :meth:`~ray.data.Dataset.to_tf` instead. It's easier + to use. + + Examples: + >>> import ray + >>> for batch in ray.data.range( # doctest: +SKIP + ... 12, + ... ).iter_tf_batches(batch_size=4): + ... print(batch.shape) # doctest: +SKIP + (4, 1) + (4, 1) + (4, 1) + + Time complexity: O(1) + + Args: + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool will be used + to fetch the objects to the local node, format the batches, and apply + the collate_fn. Defaults to 1. 
You can revert back to the old + prefetching behavior that uses `prefetch_blocks` by setting + `use_legacy_iter_batches` to True in the datasetContext. + batch_size: The number of rows in each batch, or None to use entire blocks + as batches (blocks may contain different number of rows). + The final batch may include fewer than ``batch_size`` rows if + ``drop_last`` is ``False``. Defaults to 256. + dtypes: The TensorFlow dtype(s) for the created tensor(s); if None, the + dtype will be inferred from the tensor data. + drop_last: Whether to drop the last batch if it's incomplete. + local_shuffle_buffer_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer will be drained. This + buffer size must be greater than or equal to ``batch_size``, and + therefore ``batch_size`` must also be specified when using local + shuffling. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + An iterator over TensorFlow Tensor batches. 
+ """ + return self.iterator().iter_tf_batches( + prefetch_batches=prefetch_batches, + prefetch_blocks=prefetch_blocks, + batch_size=batch_size, + dtypes=dtypes, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + ) + + @ConsumptionAPI(pattern="Time complexity:") + def to_torch( + self, + *, + label_column: Optional[str] = None, + feature_columns: Optional[ + Union[List[str], List[List[str]], Dict[str, List[str]]] + ] = None, + label_column_dtype: Optional["torch.dtype"] = None, + feature_column_dtypes: Optional[ + Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]] + ] = None, + batch_size: int = 1, + prefetch_batches: int = 1, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + unsqueeze_label_tensor: bool = True, + unsqueeze_feature_tensors: bool = True, + # Deprecated + prefetch_blocks: int = 0, + ) -> "torch.utils.data.IterableDataset": + """Return a Torch IterableDataset over this dataset. + + This is only supported for datasets convertible to Arrow records. + + It is recommended to use the returned ``IterableDataset`` directly + instead of passing it into a torch ``DataLoader``. + + Each element in IterableDataset will be a tuple consisting of 2 + elements. The first item contains the feature tensor(s), and the + second item is the label tensor. Those can take on different + forms, depending on the specified arguments. 
+ + For the features tensor (N is the ``batch_size`` and n, m, k + are the number of features per tensor): + + * If ``feature_columns`` is a ``List[str]``, the features will be + a tensor of shape (N, n), with columns corresponding to + ``feature_columns`` + + * If ``feature_columns`` is a ``List[List[str]]``, the features will be + a list of tensors of shape [(N, m),...,(N, k)], with columns of each + tensor corresponding to the elements of ``feature_columns`` + + * If ``feature_columns`` is a ``Dict[str, List[str]]``, the features + will be a dict of key-tensor pairs of shape + {key1: (N, m),..., keyN: (N, k)}, with columns of each + tensor corresponding to the value of ``feature_columns`` under the + key. + + If ``unsqueeze_label_tensor=True`` (default), the label tensor will be + of shape (N, 1). Otherwise, it will be of shape (N,). + If ``label_column`` is specified as ``None``, then no column from the + ``Dataset`` will be treated as the label, and the output label tensor + will be ``None``. + + Note that you probably want to call ``.split()`` on this dataset if + there are to be multiple Torch workers consuming the data. + + Time complexity: O(1) + + Args: + label_column: The name of the column used as the + label (second element of the output list). Can be None for + prediction, in which case the second element of returned + tuple will also be None. + feature_columns: The names of the columns + to use as the features. Can be a list of lists or + a dict of string-list pairs for multi-tensor output. + If None, then use all columns except the label column as + the features. + label_column_dtype: The torch dtype to + use for the label column. If None, then automatically infer + the dtype. + feature_column_dtypes: The dtypes to use for the feature + tensors. This should match the format of ``feature_columns``, + or be a single dtype, in which case it will be applied to + all tensors. If None, then automatically infer the dtype. 
+ batch_size: How many samples per batch to yield at a time. + Defaults to 1. + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool will be used + to fetch the objects to the local node, format the batches, and apply + the collate_fn. Defaults to 1. You can revert back to the old + prefetching behavior that uses `prefetch_blocks` by setting + `use_legacy_iter_batches` to True in the datasetContext. + drop_last: Set to True to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If + False and the size of the stream is not divisible by the batch + size, then the last batch will be smaller. Defaults to False. + local_shuffle_buffer_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer will be drained. This + buffer size must be greater than or equal to ``batch_size``, and + therefore ``batch_size`` must also be specified when using local + shuffling. + local_shuffle_seed: The seed to use for the local random shuffle. + unsqueeze_label_tensor: If set to True, the label tensor + will be unsqueezed (reshaped to (N, 1)). Otherwise, it will + be left as is, that is (N, ). In general, regression loss + functions expect an unsqueezed tensor, while classification + loss functions expect a squeezed one. Defaults to True. + unsqueeze_feature_tensors: If set to True, the features tensors + will be unsqueezed (reshaped to (N, 1)) before being concatenated into + the final features tensor. Otherwise, they will be left as is, that is + (N, ). Defaults to True. + + Returns: + A torch IterableDataset. 
+ """ + + return self.iterator().to_torch( + label_column=label_column, + feature_columns=feature_columns, + label_column_dtype=label_column_dtype, + feature_column_dtypes=feature_column_dtypes, + batch_size=batch_size, + prefetch_blocks=prefetch_blocks, + prefetch_batches=prefetch_batches, + drop_last=drop_last, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + unsqueeze_label_tensor=unsqueeze_label_tensor, + unsqueeze_feature_tensors=unsqueeze_feature_tensors, + ) + + @ConsumptionAPI + def to_tf( + self, + feature_columns: Union[str, List[str]], + label_columns: Union[str, List[str]], + *, + prefetch_batches: int = 1, + batch_size: int = 1, + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + # Deprecated + prefetch_blocks: int = 0, + ) -> "tf.data.Dataset": + """Return a TF Dataset over this dataset. + + .. warning:: + If your dataset contains ragged tensors, this method errors. To prevent + errors, :ref:`resize your tensors `. + + Examples: + >>> import ray + >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") + >>> ds + Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + + If your model accepts a single tensor as input, specify a single feature column. + + >>> ds.to_tf(feature_columns="sepal length (cm)", label_columns="target") # doctest: +SKIP + <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + If your model accepts a dictionary as input, specify a list of feature columns. 
+ + >>> ds.to_tf(["sepal length (cm)", "sepal width (cm)"], "target") # doctest: +SKIP + <_OptionsDataset element_spec=({'sepal length (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), 'sepal width (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal width (cm)')}, TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + If your dataset contains multiple features but your model accepts a single + tensor as input, combine features with + :class:`~ray.data.preprocessors.Concatenator`. + + >>> from ray.data.preprocessors import Concatenator + >>> preprocessor = Concatenator(output_column_name="features", exclude="target") + >>> ds = preprocessor.transform(ds) + >>> ds + Concatenator + +- Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) + >>> ds.to_tf("features", "target") # doctest: +SKIP + <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> + + Args: + feature_columns: Columns that correspond to model inputs. If this is a + string, the input data is a tensor. If this is a list, the input data + is a ``dict`` that maps column names to their tensor representation. + label_columns: Columns that correspond to model targets. If this is a + string, the target data is a tensor. If this is a list, the target data + is a ``dict`` that maps column names to their tensor representation. + prefetch_batches: The number of batches to fetch ahead of the current batch + to fetch. If set to greater than 0, a separate threadpool will be used + to fetch the objects to the local node, format the batches, and apply + the collate_fn. Defaults to 1. You can revert back to the old + prefetching behavior that uses `prefetch_blocks` by setting + `use_legacy_iter_batches` to True in the datasetContext. 
+ batch_size: Record batch size. Defaults to 1. + drop_last: Set to True to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If + False and the size of the stream is not divisible by the batch + size, then the last batch will be smaller. Defaults to False. + local_shuffle_buffer_size: If non-None, the data will be randomly shuffled + using a local in-memory shuffle buffer, and this value will serve as the + minimum number of rows that must be in the local in-memory shuffle + buffer in order to yield a batch. When there are no more rows to add to + the buffer, the remaining rows in the buffer will be drained. This + buffer size must be greater than or equal to ``batch_size``, and + therefore ``batch_size`` must also be specified when using local + shuffling. + local_shuffle_seed: The seed to use for the local random shuffle. + + Returns: + A ``tf.data.Dataset`` that yields inputs and targets. + + .. seealso:: + + :meth:`~ray.data.Dataset.iter_tf_batches` + Call this method if you need more flexibility. + + """ # noqa: E501 + + return self.iterator().to_tf( + feature_columns=feature_columns, + label_columns=label_columns, + prefetch_batches=prefetch_batches, + prefetch_blocks=prefetch_blocks, + drop_last=drop_last, + batch_size=batch_size, + local_shuffle_buffer_size=local_shuffle_buffer_size, + local_shuffle_seed=local_shuffle_seed, + ) + + @ConsumptionAPI(pattern="Time complexity:") + def to_dask( + self, + meta: Union[ + "pandas.DataFrame", + "pandas.Series", + Dict[str, Any], + Iterable[Any], + Tuple[Any], + None, + ] = None, + ) -> "dask.DataFrame": + """Convert this dataset into a Dask DataFrame. + + This is only supported for datasets convertible to Arrow records. + + Note that this function will set the Dask scheduler to Dask-on-Ray + globally, via the config. 
+ + Time complexity: O(dataset size / parallelism) + + Args: + meta: An empty pandas DataFrame or Series that matches the dtypes and column + names of the stream. This metadata is necessary for many algorithms in + dask dataframe to work. For ease of use, some alternative inputs are + also available. Instead of a DataFrame, a dict of ``{name: dtype}`` or + iterable of ``(name, dtype)`` can be provided (note that the order of + the names should match the order of the columns). Instead of a series, a + tuple of ``(name, dtype)`` can be used. + By default, this will be inferred from the underlying Dataset schema, + with this argument supplying an optional override. + + Returns: + A Dask DataFrame created from this dataset. + """ + import dask + import dask.dataframe as dd + import pandas as pd + + try: + import pyarrow as pa + except Exception: + pa = None + + from ray.data._internal.pandas_block import PandasBlockSchema + from ray.util.client.common import ClientObjectRef + from ray.util.dask import ray_dask_get + + dask.config.set(scheduler=ray_dask_get) + + @dask.delayed + def block_to_df(block: Block): + if isinstance(block, (ray.ObjectRef, ClientObjectRef)): + raise ValueError( + "Dataset.to_dask() must be used with Dask-on-Ray, please " + "set the Dask scheduler to ray_dask_get (located in " + "ray.util.dask)." + ) + return _block_to_df(block) + + if meta is None: + from ray.data.extensions import TensorDtype + + # Infer Dask metadata from Dataset schema. 
+ schema = self.schema(fetch_if_missing=True) + if isinstance(schema, PandasBlockSchema): + meta = pd.DataFrame( + { + col: pd.Series( + dtype=( + dtype + if not isinstance(dtype, TensorDtype) + else np.object_ + ) + ) + for col, dtype in zip(schema.names, schema.types) + } + ) + elif pa is not None and isinstance(schema, pa.Schema): + from ray.data.extensions import ArrowTensorType + + if any(isinstance(type_, ArrowTensorType) for type_ in schema.types): + meta = pd.DataFrame( + { + col: pd.Series( + dtype=( + dtype.to_pandas_dtype() + if not isinstance(dtype, ArrowTensorType) + else np.object_ + ) + ) + for col, dtype in zip(schema.names, schema.types) + } + ) + else: + meta = schema.empty_table().to_pandas() + + ddf = dd.from_delayed( + [block_to_df(block) for block in self.get_internal_block_refs()], + meta=meta, + ) + return ddf + + @ConsumptionAPI(pattern="Time complexity:") + def to_mars(self) -> "mars.DataFrame": + """Convert this dataset into a MARS dataframe. + + Time complexity: O(dataset size / parallelism) + + Returns: + A MARS dataframe created from this dataset. + """ + import pandas as pd + import pyarrow as pa + from mars.dataframe.datasource.read_raydataset import DataFrameReadRayDataset + from mars.dataframe.utils import parse_index + + from ray.data._internal.pandas_block import PandasBlockSchema + + refs = self.to_pandas_refs() + # remove this when https://github.com/mars-project/mars/issues/2945 got fixed + schema = self.schema() + if isinstance(schema, Schema): + schema = schema.base_schema # Backwards compat with non strict mode. 
+ if isinstance(schema, PandasBlockSchema): + dtypes = pd.Series(schema.types, index=schema.names) + elif isinstance(schema, pa.Schema): + dtypes = schema.empty_table().to_pandas().dtypes + else: + raise NotImplementedError(f"Unsupported format of schema {schema}") + index_value = parse_index(pd.RangeIndex(-1)) + columns_value = parse_index(dtypes.index, store_data=True) + op = DataFrameReadRayDataset(refs=refs) + return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) + + @ConsumptionAPI(pattern="Time complexity:") + def to_modin(self) -> "modin.DataFrame": + """Convert this dataset into a Modin dataframe. + + This works by first converting this dataset into a distributed set of + Pandas dataframes (using ``.to_pandas_refs()``). Please see caveats + there. Then the individual dataframes are used to create the modin + DataFrame using + ``modin.distributed.dataframe.pandas.partitions.from_partitions()``. + + This is only supported for datasets convertible to Arrow records. + This function induces a copy of the data. For zero-copy access to the + underlying data, consider using ``.to_arrow()`` or + ``.get_internal_block_refs()``. + + Time complexity: O(dataset size / parallelism) + + Returns: + A Modin dataframe created from this dataset. + """ + + from modin.distributed.dataframe.pandas.partitions import from_partitions + + pd_objs = self.to_pandas_refs() + return from_partitions(pd_objs, axis=0) + + @ConsumptionAPI(pattern="Time complexity:") + def to_spark(self, spark: "pyspark.sql.SparkSession") -> "pyspark.sql.DataFrame": + """Convert this dataset into a Spark dataframe. + + Time complexity: O(dataset size / parallelism) + + Returns: + A Spark dataframe created from this dataset. + """ + import raydp + + schema = self.schema() + if isinstance(schema, Schema): + schema = schema.base_schema # Backwards compat with non strict mode. 
+ return raydp.spark.ray_dataset_to_spark_dataframe( + spark, schema, self.get_internal_block_refs() + ) + + @ConsumptionAPI(pattern="Time complexity:") + def to_pandas(self, limit: int = 100000) -> "pandas.DataFrame": + """Convert this dataset into a single Pandas DataFrame. + + This is only supported for datasets convertible to Arrow or Pandas + records. An error is raised if the number of records exceeds the + provided limit. Note that you can use ``.limit()`` on the dataset + beforehand to truncate the dataset manually. + + Time complexity: O(dataset size) + + Args: + limit: The maximum number of records to return. An error will be + raised if the limit is exceeded. + + Returns: + A Pandas DataFrame created from this dataset, containing a limited + number of records. + """ + count = self.count() + if count > limit: + raise ValueError( + f"the dataset has more than the given limit of {limit} " + f"records: {count}. If you are sure that a DataFrame with " + f"{count} rows will fit in local memory, use " + f"ds.to_pandas(limit={count})." + ) + blocks = self.get_internal_block_refs() + output = DelegatingBlockBuilder() + for block in blocks: + output.add_block(ray.get(block)) + block = output.build() + return _block_to_df(block) + + @ConsumptionAPI(pattern="Time complexity:") + @DeveloperAPI + def to_pandas_refs(self) -> List[ObjectRef["pandas.DataFrame"]]: + """Convert this dataset into a distributed set of Pandas dataframes. + + This is only supported for datasets convertible to Arrow records. + This function induces a copy of the data. For zero-copy access to the + underlying data, consider using ``.to_arrow()`` or + ``.get_internal_block_refs()``. + + Time complexity: O(dataset size / parallelism) + + Returns: + A list of remote Pandas dataframes created from this dataset. 
+ """ + + block_to_df = cached_remote_fn(_block_to_df) + return [block_to_df.remote(block) for block in self.get_internal_block_refs()] + + @DeveloperAPI + def to_numpy_refs( + self, *, column: Optional[str] = None + ) -> List[ObjectRef[np.ndarray]]: + """Convert this dataset into a distributed set of NumPy ndarrays. + + This is only supported for datasets convertible to NumPy ndarrays. + This function induces a copy of the data. For zero-copy access to the + underlying data, consider using ``.to_arrow()`` or + ``.get_internal_block_refs()``. + + Time complexity: O(dataset size / parallelism) + + Args: + column: The name of the column to convert to numpy, or None to specify the + entire row. If not specified for Arrow or Pandas blocks, each returned + future will represent a dict of column ndarrays. + + Returns: + A list of remote NumPy ndarrays created from this dataset. + """ + block_to_ndarray = cached_remote_fn(_block_to_ndarray) + return [ + block_to_ndarray.remote(block, column=column) + for block in self.get_internal_block_refs() + ] + + @ConsumptionAPI(pattern="Time complexity:") + @DeveloperAPI + def to_arrow_refs(self) -> List[ObjectRef["pyarrow.Table"]]: + """Convert this dataset into a distributed set of Arrow tables. + + This is only supported for datasets convertible to Arrow records. + This function is zero-copy if the existing data is already in Arrow + format. Otherwise, the data will be converted to Arrow format. + + Time complexity: O(1) unless conversion is required. + + Returns: + A list of remote Arrow tables created from this dataset. + """ + import pyarrow as pa + + blocks: List[ObjectRef["pyarrow.Table"]] = self.get_internal_block_refs() + # Schema is safe to call since we have already triggered execution with + # get_internal_block_refs. + schema = self.schema(fetch_if_missing=True) + if isinstance(schema, Schema): + schema = schema.base_schema # Backwards compat with non strict mode. + if isinstance(schema, pa.Schema): + # Zero-copy path. 
+ return blocks + + block_to_arrow = cached_remote_fn(_block_to_arrow) + return [block_to_arrow.remote(block) for block in blocks] + + @ConsumptionAPI(pattern="Args:") + def to_random_access_dataset( + self, + key: str, + num_workers: Optional[int] = None, + ) -> RandomAccessDataset: + """Convert this dataset into a distributed RandomAccessDataset (EXPERIMENTAL). + + RandomAccessDataset partitions the dataset across the cluster by the given + sort key, providing efficient random access to records via binary search. A + number of worker actors are created, each of which has zero-copy access to the + underlying sorted data blocks of the dataset. + + Note that the key must be unique in the dataset. If there are duplicate keys, + an arbitrary value is returned. + + This is only supported for Arrow-format datasets. + + Args: + key: The key column over which records can be queried. + num_workers: The number of actors to use to serve random access queries. + By default, this is determined by multiplying the number of Ray nodes + in the cluster by four. As a rule of thumb, you can expect each worker + to provide ~3000 records / second via ``get_async()``, and + ~10000 records / second via ``multiget()``. + """ + if num_workers is None: + num_workers = 4 * len(ray.nodes()) + return RandomAccessDataset(self, key, num_workers=num_workers) + + @ConsumptionAPI + def repeat(self, times: Optional[int] = None) -> "DatasetPipeline": + """Convert this into a DatasetPipeline by looping over this dataset. + + Transformations prior to the call to ``repeat()`` are evaluated once. + Transformations done on the returned pipeline are evaluated on each + loop of the pipeline over the base dataset. + + Note that every repeat of the dataset is considered an "epoch" for + the purposes of ``DatasetPipeline.iter_epochs()``. 
+ + Examples: + >>> import ray + >>> ds = ray.data.range(5, parallelism=1) + >>> # Infinite pipeline of numbers [0, 5) + >>> ds.repeat().take_batch() + {'id': array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4, ...])} + >>> # Can shuffle each epoch (dataset) in the pipeline. + >>> ds.repeat().random_shuffle().take_batch() # doctest: +SKIP + {'id': array([2, 3, 0, 4, 1, 4, 0, 2, 1, 3, ...])} + + Args: + times: The number of times to loop over this dataset, or None + to repeat indefinitely. + """ + from ray.data._internal.plan import _rewrite_read_stage + from ray.data.dataset_pipeline import DatasetPipeline + + ctx = DataContext.get_current() + if self._plan.is_read_stage_equivalent() and ctx.optimize_fuse_read_stages: + blocks, _, stages = self._plan._get_source_blocks_and_stages() + blocks.clear() + blocks, outer_stats, stages = _rewrite_read_stage(blocks, stages) + read_stage = stages[0] + else: + blocks = self._plan.execute() + outer_stats = self._plan.stats() + read_stage = None + uuid = self._get_uuid() + outer_stats.dataset_uuid = uuid + + if times is not None and times < 1: + raise ValueError("`times` must be >= 1, got {}".format(times)) + + class Iterator: + def __init__(self, blocks): + self._blocks = blocks + self._i = 0 + + def __next__(self) -> Callable[[], "Dataset"]: + if times and self._i >= times: + raise StopIteration + epoch = self._i + blocks = self._blocks + self._i += 1 + + def gen(): + ds = Dataset( + ExecutionPlan( + blocks, + outer_stats, + dataset_uuid=uuid, + run_by_consumer=True, + ), + epoch, + lazy=False, + ) + ds._set_uuid(uuid) + return ds + + return gen + + class Iterable: + def __init__(self, blocks): + self._blocks = blocks + + def __iter__(self): + return Iterator(self._blocks) + + pipe = DatasetPipeline(Iterable(blocks), False, length=times or float("inf")) + if read_stage: + pipe = pipe.foreach_window( + lambda ds, read_stage=read_stage: Dataset( + ds._plan.with_stage(read_stage), ds._epoch, True + ) + ) + return pipe + + def window( + self, + 
*, + blocks_per_window: Optional[int] = None, + bytes_per_window: Optional[int] = None, + ) -> "DatasetPipeline": + """Convert this into a DatasetPipeline by windowing over data blocks. + + Transformations prior to the call to ``window()`` are evaluated in + bulk on the entire dataset. Transformations done on the returned + pipeline are evaluated incrementally per window of blocks as data is + read from the output of the pipeline. + + Windowing execution allows for output to be read sooner without + waiting for all transformations to fully execute, and can also improve + efficiency if transforms use different resources (e.g., GPUs). + + Without windowing:: + + [preprocessing......] + [inference.......] + [write........] + Time -----------------------------------------------------------> + + With windowing:: + + [prep1] [prep2] [prep3] + [infer1] [infer2] [infer3] + [write1] [write2] [write3] + Time -----------------------------------------------------------> + + Examples: + >>> import ray + >>> # Create an inference pipeline. + >>> ds = ray.data.read_binary_files(dir) # doctest: +SKIP + >>> infer = ... # doctest: +SKIP + >>> pipe = ds.window(blocks_per_window=10).map(infer) # doctest: +SKIP + DatasetPipeline(num_windows=40, num_stages=2) + >>> # The higher the stage parallelism, the shorter the pipeline. + >>> pipe = ds.window(blocks_per_window=20).map(infer) # doctest: +SKIP + DatasetPipeline(num_windows=20, num_stages=2) + >>> # Outputs can be incrementally read from the pipeline. + >>> for item in pipe.iter_rows(): # doctest: +SKIP + ... print(item) # doctest: +SKIP + + Args: + blocks_per_window: The window size (parallelism) in blocks. + Increasing window size increases pipeline throughput, but also + increases the latency to initial output, since it decreases the + length of the pipeline. Setting this to infinity effectively + disables pipelining. + bytes_per_window: Specify the window size in bytes instead of blocks. 
+ This will be treated as an upper bound for the window size, but each + window will still include at least one block. This is mutually + exclusive with ``blocks_per_window``. + """ + from ray.data._internal.plan import _rewrite_read_stage + from ray.data.dataset_pipeline import DatasetPipeline + + if blocks_per_window is not None and bytes_per_window is not None: + raise ValueError("Only one windowing scheme can be specified.") + + if blocks_per_window is None: + blocks_per_window = 10 + + ctx = DataContext.get_current() + if self._plan.is_read_stage_equivalent() and ctx.optimize_fuse_read_stages: + blocks, _, stages = self._plan._get_source_blocks_and_stages() + blocks.clear() + blocks, outer_stats, stages = _rewrite_read_stage(blocks, stages) + read_stage = stages[0] + else: + blocks = self._plan.execute() + outer_stats = self._plan.stats() + read_stage = None + + class Iterator: + def __init__(self, splits, epoch): + self._splits = splits.copy() + self._epoch = epoch + + def __next__(self) -> "Dataset": + if not self._splits: + raise StopIteration + + blocks = self._splits.pop(0) + + def gen(): + ds = Dataset( + ExecutionPlan(blocks, outer_stats, run_by_consumer=True), + self._epoch, + lazy=True, + ) + return ds + + return gen + + class Iterable: + def __init__(self, blocks, epoch): + if bytes_per_window: + self._splits = blocks.split_by_bytes(bytes_per_window) + else: + self._splits = blocks.split(split_size=blocks_per_window) + try: + sizes = [s.size_bytes() for s in self._splits] + num_blocks = [s.initial_num_blocks() for s in self._splits] + assert [s > 0 for s in sizes], sizes + + def fmt(size_bytes): + if size_bytes > 1024 * 1024 * 1024: + return "{}GiB".format( + round(size_bytes / (1024 * 1024 * 1024), 2) + ) + elif size_bytes > 10 * 1024: + return "{}MiB".format(round(size_bytes / (1024 * 1024), 2)) + else: + return "{}b".format(size_bytes) + + mean_bytes = int(np.mean(sizes)) + logger.info( + "Created DatasetPipeline with {} windows: " + "{} min, {} 
max, {} mean".format( + len(self._splits), + fmt(min(sizes)), + fmt(max(sizes)), + fmt(mean_bytes), + ) + ) + mean_num_blocks = int(np.mean(num_blocks)) + logger.info( + "Blocks per window: " + "{} min, {} max, {} mean".format( + min(num_blocks), + max(num_blocks), + mean_num_blocks, + ) + ) + # TODO(ekl) we should try automatically choosing the default + # windowing settings to meet these best-practice constraints. + avail_parallelism = _estimate_available_parallelism() + if mean_num_blocks < avail_parallelism: + logger.warning( + f"{WARN_PREFIX} This pipeline's parallelism is limited " + f"by its blocks per window to ~{mean_num_blocks} " + "concurrent tasks per window. To maximize " + "performance, increase the blocks per window to at least " + f"{avail_parallelism}. This may require increasing the " + "base dataset's parallelism and/or adjusting the " + "windowing parameters." + ) + else: + logger.info( + f"{OK_PREFIX} This pipeline's per-window parallelism " + "is high enough to fully utilize the cluster." + ) + obj_store_mem = ray.cluster_resources().get( + "object_store_memory", 0 + ) + safe_mem_bytes = int(obj_store_mem * ESTIMATED_SAFE_MEMORY_FRACTION) + if mean_bytes > safe_mem_bytes: + logger.warning( + f"{WARN_PREFIX} This pipeline's windows are " + f"~{fmt(mean_bytes)} in size each and may not fit in " + "object store memory without spilling. To improve " + "performance, consider reducing the size of each window " + f"to {fmt(safe_mem_bytes)} or less." + ) + else: + logger.info( + f"{OK_PREFIX} This pipeline's windows likely fit in " + "object store memory without spilling." 
+ ) + except Exception as e: + logger.info( + "Created DatasetPipeline with {} windows; " + "error getting sizes: {}".format( + len(self._splits), + e, + ) + ) + self._epoch = epoch + + def __iter__(self): + return Iterator(self._splits, self._epoch) + + it = Iterable(blocks, self._epoch) + pipe = DatasetPipeline(it, False, length=len(it._splits)) + if read_stage: + pipe = pipe.foreach_window( + lambda ds, read_stage=read_stage: Dataset( + ds._plan.with_stage(read_stage), ds._epoch, True + ) + ) + return pipe + + @Deprecated(message="Use `Dataset.materialize()` instead.") + def fully_executed(self) -> "MaterializedDataset": + logger.warning( + "Deprecation warning: use Dataset.materialize() instead of " + "fully_executed()." + ) + self._plan.execute(force_read=True) + return self + + @Deprecated(message="Check `isinstance(Dataset, MaterializedDataset)` instead.") + def is_fully_executed(self) -> bool: + logger.warning( + "Deprecation warning: Check " + "`isinstance(Dataset, MaterializedDataset)` " + "instead of using is_fully_executed()." + ) + return self._plan.has_computed_output() + + @ConsumptionAPI(pattern="store memory.", insert_after=True) + def materialize(self) -> "MaterializedDataset": + """Execute and materialize this dataset into object store memory. + + This can be used to read all blocks into memory. By default, Dataset + doesn't read blocks from the datasource until the first transform. + + Note that this does not mutate the original Dataset. Only the blocks of the + returned MaterializedDataset class are pinned in memory. + + Returns: + A MaterializedDataset holding the materialized data blocks. + """ + copy = Dataset.copy(self, _deep_copy=True, _as=MaterializedDataset) + copy._plan.execute(force_read=True) + return copy + + @ConsumptionAPI(pattern="timing information.", insert_after=True) + def stats(self) -> str: + """Returns a string containing execution timing information. 
+ + Note that this does not trigger execution, so if the dataset has not yet + executed, an empty string will be returned. + """ + return self._get_stats_summary().to_string() + + def _get_stats_summary(self) -> DatasetStatsSummary: + return self._plan.stats_summary() + + @ConsumptionAPI(pattern="Time complexity:") + @DeveloperAPI + def get_internal_block_refs(self) -> List[ObjectRef[Block]]: + """Get a list of references to the underlying blocks of this dataset. + + This function can be used for zero-copy access to the data. It blocks + until the underlying blocks are computed. + + Time complexity: O(1) + + Returns: + A list of references to this dataset's blocks. + """ + blocks = self._plan.execute().get_blocks() + self._synchronize_progress_bar() + return blocks + + @Deprecated( + message="Dataset is lazy by default, so this conversion call is no longer " + "needed and this API will be removed in a future release" + ) + def lazy(self) -> "Dataset": + """Enable lazy evaluation. + + Dataset is lazy by default, so this is only useful for datasets created + from :func:`ray.data.from_items() `, which is + eager. + + The returned dataset is a lazy dataset, where all subsequent operations + on the stream won't be executed until the dataset is consumed + (e.g. ``.take()``, ``.iter_batches()``, ``.to_torch()``, ``.to_tf()``, etc.) + or execution is manually triggered via ``.materialize()``. + """ + ds = Dataset( + self._plan, self._epoch, lazy=True, logical_plan=self._logical_plan + ) + ds._set_uuid(self._get_uuid()) + return ds + + def has_serializable_lineage(self) -> bool: + """Whether this dataset's lineage is able to be serialized for storage and + later deserialized, possibly on a different cluster. + + Only datasets that are created from data that we know will still exist at + deserialization time, e.g. data external to this Ray cluster such as persistent + cloud object stores, support lineage-based serialization. 
All of the + ray.data.read_*() APIs support lineage-based serialization. + """ + return self._plan.has_lazy_input() + + @DeveloperAPI + def serialize_lineage(self) -> bytes: + """ + Serialize this dataset's lineage, not the actual data or the existing data + futures, to bytes that can be stored and later deserialized, possibly on a + different cluster. + + Note that this will drop all computed data, and that everything will be + recomputed from scratch after deserialization. + + Use :py:meth:`Dataset.deserialize_lineage` to deserialize the serialized + bytes returned from this method into a Dataset. + + .. note:: + Unioned and zipped datasets, produced by :py:meth`Dataset.union` and + :py:meth:`Dataset.zip`, are not lineage-serializable. + + Returns: + Serialized bytes containing the lineage of this dataset. + """ + if not self.has_serializable_lineage(): + raise ValueError( + "Lineage-based serialization is not supported for this stream, which " + "means that it cannot be used as a tunable hyperparameter. " + "Lineage-based serialization is explicitly NOT supported for unioned " + "or zipped datasets (see docstrings for those methods), and is only " + "supported for datasets created from data that we know will still " + "exist at deserialization time, e.g. external data in persistent cloud " + "object stores or in-memory data from long-lived clusters. Concretely, " + "all ray.data.read_*() APIs should support lineage-based " + "serialization, while all of the ray.data.from_*() APIs do not. To " + "allow this stream to be serialized to storage, write the data to an " + "external store (such as AWS S3, GCS, or Azure Blob Storage) using the " + "Dataset.write_*() APIs, and serialize a new dataset reading " + "from the external store using the ray.data.read_*() APIs." + ) + # Copy Dataset and clear the blocks from the execution plan so only the + # Dataset's lineage is serialized. 
+ plan_copy = self._plan.deep_copy(preserve_uuid=True) + ds = Dataset(plan_copy, self._get_epoch(), self._lazy) + ds._plan.clear_block_refs() + ds._set_uuid(self._get_uuid()) + + def _reduce_remote_fn(rf: ray.remote_function.RemoteFunction): + # Custom reducer for Ray remote function handles that allows for + # cross-cluster serialization. + # This manually unsets the last export session and job to force re-exporting + # of the function when the handle is deserialized on a new cluster. + # TODO(Clark): Fix this in core Ray, see issue: + # https://github.com/ray-project/ray/issues/24152. + reconstructor, args, state = rf.__reduce__() + state["_last_export_session_and_job"] = None + return reconstructor, args, state + + context = ray._private.worker.global_worker.get_serialization_context() + try: + context._register_cloudpickle_reducer( + ray.remote_function.RemoteFunction, _reduce_remote_fn + ) + serialized = pickle.dumps(ds) + finally: + context._unregister_cloudpickle_reducer(ray.remote_function.RemoteFunction) + return serialized + + @staticmethod + @DeveloperAPI + def deserialize_lineage(serialized_ds: bytes) -> "Dataset": + """ + Deserialize the provided lineage-serialized Dataset. + + This assumes that the provided serialized bytes were serialized using + :py:meth:`Dataset.serialize_lineage`. + + Args: + serialized_ds: The serialized Dataset that we wish to deserialize. + + Returns: + A deserialized ``Dataset`` instance. 
+ """ + return pickle.loads(serialized_ds) + + @property + @DeveloperAPI + def context(self) -> DataContext: + """Return the DataContext used to create this Dataset.""" + return self._plan._context + + def _divide(self, block_idx: int) -> ("Dataset", "Dataset"): + block_list = self._plan.execute() + left, right = block_list.divide(block_idx) + l_ds = Dataset( + ExecutionPlan( + left, self._plan.stats(), run_by_consumer=block_list._owned_by_consumer + ), + self._epoch, + self._lazy, + ) + r_ds = Dataset( + ExecutionPlan( + right, self._plan.stats(), run_by_consumer=block_list._owned_by_consumer + ), + self._epoch, + self._lazy, + ) + return l_ds, r_ds + + @Deprecated(message="The batch format is no longer exposed as a public API.") + def default_batch_format(self) -> Type: + context = DataContext.get_current() + if context.strict_mode: + raise StrictModeError("default_batch_format() is not allowed in Ray 2.5") + + import pandas as pd + import pyarrow as pa + + schema = self.schema() + assert isinstance(schema, (type, PandasBlockSchema, pa.Schema)) + + if isinstance(schema, type): + return list + + if isinstance(schema, (PandasBlockSchema, pa.Schema)): + if schema.names == [TENSOR_COLUMN_NAME]: + return np.ndarray + return pd.DataFrame + + @Deprecated(message="The dataset format is no longer exposed as a public API.") + def dataset_format(self) -> BlockFormat: + context = DataContext.get_current() + if context.strict_mode: + raise StrictModeError("dataset_format() is not allowed in Ray 2.5") + + if context.use_streaming_executor: + raise DeprecationWarning( + "`dataset_format` is deprecated for streaming execution. To use " + "`dataset_format`, you must explicitly enable bulk execution by " + "setting `use_streaming_executor` to False in the `DataContext`" + ) + + # We need schema to properly validate, so synchronously + # fetch it if necessary. 
+ schema = self.schema(fetch_if_missing=True) + if schema is None: + raise ValueError( + "Dataset is empty or cleared, can't determine the format of " + "the dataset." + ) + + try: + import pyarrow as pa + + if isinstance(schema, pa.Schema): + return BlockFormat.ARROW + except ModuleNotFoundError: + pass + from ray.data._internal.pandas_block import PandasBlockSchema + + if isinstance(schema, PandasBlockSchema): + return BlockFormat.PANDAS + return BlockFormat.SIMPLE + + def _aggregate_on( + self, agg_cls: type, on: Optional[Union[str, List[str]]], *args, **kwargs + ): + """Helper for aggregating on a particular subset of the dataset. + + This validates the `on` argument, and converts a list of column names + or lambdas to a multi-aggregation. A null `on` results in a + multi-aggregation on all columns for an Arrow Dataset, and a single + aggregation on the entire row for a simple Dataset. + """ + aggs = self._build_multicolumn_aggs(agg_cls, on, *args, **kwargs) + return self.aggregate(*aggs) + + def _build_multicolumn_aggs( + self, + agg_cls: type, + on: Optional[Union[str, List[str]]], + ignore_nulls: bool, + *args, + skip_cols: Optional[List[str]] = None, + **kwargs, + ): + """Build set of aggregations for applying a single aggregation to + multiple columns. + """ + # Expand None into an aggregation for each column. 
+ if on is None: + schema = self.schema(fetch_if_missing=True) + if schema is not None and not isinstance(schema, type): + if not skip_cols: + skip_cols = [] + if len(schema.names) > 0: + on = [col for col in schema.names if col not in skip_cols] + + if not isinstance(on, list): + on = [on] + return [agg_cls(on_, *args, ignore_nulls=ignore_nulls, **kwargs) for on_ in on] + + def _aggregate_result(self, result: Union[Tuple, Mapping]) -> U: + if result is not None and len(result) == 1: + if isinstance(result, tuple): + return result[0] + else: + # NOTE (kfstorm): We cannot call `result[0]` directly on + # `PandasRow` because indexing a column with position is not + # supported by pandas. + return list(result.values())[0] + else: + return result + + @ensure_ipywidgets_dep("8") + @repr_fallback_if_colab + def _repr_mimebundle_(self, **kwargs): + """Return a mimebundle with an ipywidget repr and a simple text repr. + + Depending on the frontend where the data is being displayed, + different mimetypes will be used from this bundle. + See https://ipython.readthedocs.io/en/stable/config/integrating.html + for information about this method, and + https://ipywidgets.readthedocs.io/en/latest/embedding.html + for more information about the jupyter widget mimetype. + + Returns: + A mimebundle containing an ipywidget repr and a simple text repr. + """ + import ipywidgets + + title = ipywidgets.HTML(f"

    {self.__class__.__name__}

    ") + tab = self._tab_repr_() + widget = ipywidgets.VBox([title, tab], layout=ipywidgets.Layout(width="100%")) + + # Get the widget mime bundle, but replace the plaintext + # with the Datastream repr + bundle = widget._repr_mimebundle_(**kwargs) + bundle.update( + { + "text/plain": repr(self), + } + ) + return bundle + + def _tab_repr_(self): + from ipywidgets import Tab, HTML + + metadata = { + "num_blocks": self._plan.initial_num_blocks(), + "num_rows": self._meta_count(), + } + # Show metadata if available, but don't trigger execution. + schema = self.schema(fetch_if_missing=False) + if schema is None: + schema_repr = Template("rendered_html_common.html.j2").render( + content="
    Unknown schema
    " + ) + elif isinstance(schema, type): + schema_repr = Template("rendered_html_common.html.j2").render( + content=f"
    Data type: {html.escape(str(schema))}
    " + ) + else: + schema_data = {} + for sname, stype in zip(schema.names, schema.types): + schema_data[sname] = getattr(stype, "__name__", str(stype)) + + schema_repr = Template("scrollableTable.html.j2").render( + table=tabulate( + tabular_data=schema_data.items(), + tablefmt="html", + showindex=False, + headers=["Name", "Type"], + ), + max_height="300px", + ) + + children = [] + children.append( + HTML( + Template("scrollableTable.html.j2").render( + table=tabulate( + tabular_data=metadata.items(), + tablefmt="html", + showindex=False, + headers=["Field", "Value"], + ), + max_height="300px", + ) + ) + ) + children.append(HTML(schema_repr)) + return Tab(children, titles=["Metadata", "Schema"]) + + def __repr__(self) -> str: + return self._plan.get_plan_as_string(self.__class__.__name__) + + def __str__(self) -> str: + return repr(self) + + def __bool__(self) -> bool: + # Prevents `__len__` from being called to check if it is None + # see: issue #25152 + return True + + def __len__(self) -> int: + raise AttributeError( + "Use `ds.count()` to compute the length of a distributed Dataset. " + "This may be an expensive operation." + ) + + def __iter__(self): + raise TypeError( + "`Dataset` objects aren't iterable. To iterate records, call " + "`ds.iter_rows()` or `ds.iter_batches()`. For more information, read " + "https://docs.ray.io/en/latest/data/consuming-datasets.html." 
+ ) + + def _block_num_rows(self) -> List[int]: + get_num_rows = cached_remote_fn(_get_num_rows) + return ray.get([get_num_rows.remote(b) for b in self.get_internal_block_refs()]) + + def _block_size_bytes(self) -> List[int]: + get_size_bytes = cached_remote_fn(_get_size_bytes) + return ray.get( + [get_size_bytes.remote(b) for b in self.get_internal_block_refs()] + ) + + def _meta_count(self) -> Optional[int]: + return self._plan.meta_count() + + def _get_uuid(self) -> str: + return self._uuid + + def _set_uuid(self, uuid: str) -> None: + self._uuid = uuid + + def _get_epoch(self) -> int: + return self._epoch + + def _set_epoch(self, epoch: int) -> None: + self._epoch = epoch + + def _warn_slow(self): + if ray.util.log_once("dataset_slow_warned"): + logger.warning( + "The `map`, `flat_map`, and `filter` operations are unvectorized and " + "can be very slow. If you're using a vectorized transformation, " + "consider using `.map_batches()` instead." + ) + + def _synchronize_progress_bar(self): + """Flush progress bar output by shutting down the current executor. + + This should be called at the end of all blocking APIs (e.g., `take`), but not + async APIs (e.g., `iter_batches`). + + The streaming executor runs in a separate generator / thread, so it is + possible the shutdown logic runs even after a call to retrieve rows from the + stream has finished. Explicit shutdown avoids this, which can clobber console + output (https://github.com/ray-project/ray/issues/32414). + """ + if self._current_executor: + self._current_executor.shutdown() + self._current_executor = None + + def __getstate__(self): + # Note: excludes _current_executor which is not serializable. 
+ return { + "plan": self._plan, + "uuid": self._uuid, + "epoch": self._epoch, + "lazy": self._lazy, + "logical_plan": self._logical_plan, + } + + def __setstate__(self, state): + self._plan = state["plan"] + self._uuid = state["uuid"] + self._epoch = state["epoch"] + self._lazy = state["lazy"] + self._logical_plan = state["logical_plan"] + self._current_executor = None + + def __del__(self): + if self._current_executor and ray is not None and ray.is_initialized(): + self._current_executor.shutdown() + + +@PublicAPI +class MaterializedDataset(Dataset, Generic[T]): + """A Dataset materialized in Ray memory, e.g., via `.materialize()`. + + The blocks of a MaterializedDataset object are materialized into Ray object store + memory, which means that this class can be shared or iterated over by multiple Ray + tasks without re-executing the underlying computations for producing the stream. + """ + + pass + + +@PublicAPI(stability="beta") +class Schema: + """Dataset schema. + + Attributes: + names: List of column names of this Dataset. + types: List of Arrow types of the Dataset. Note that the "object" type is + not Arrow compatible and hence will be returned as `object`. + base_schema: The underlying Arrow or Pandas schema. + """ + + def __init__(self, base_schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"]): + self.base_schema = base_schema + + @property + def names(self) -> List[str]: + """Lists the columns of this Dataset.""" + return self.base_schema.names + + @property + def types(self) -> List[Union[Literal[object], "pyarrow.DataType"]]: + """Lists the types of this Dataset in Arrow format + + For non-Arrow compatible types, we return "object". 
+ """ + import pyarrow as pa + from ray.data.extensions import TensorDtype, ArrowTensorType + + if isinstance(self.base_schema, pa.lib.Schema): + return list(self.base_schema.types) + + arrow_types = [] + for dtype in self.base_schema.types: + if isinstance(dtype, TensorDtype): + # Manually convert our Pandas tensor extension type to Arrow. + arrow_types.append( + ArrowTensorType( + shape=dtype._shape, dtype=pa.from_numpy_dtype(dtype._dtype) + ) + ) + else: + try: + arrow_types.append(pa.from_numpy_dtype(dtype)) + except pa.ArrowNotImplementedError: + arrow_types.append(object) + except Exception: + logger.exception(f"Error converting dtype {dtype} to Arrow.") + arrow_types.append(None) + return arrow_types + + def __eq__(self, other): + return isinstance(other, Schema) and other.base_schema == self.base_schema + + def __repr__(self): + column_width = max([len(name) for name in self.names] + [len("Column")]) + padding = 2 + + output = "Column" + output += " " * ((column_width + padding) - len("Column")) + output += "Type\n" + + output += "-" * len("Column") + output += " " * ((column_width + padding) - len("Column")) + output += "-" * len("Type") + "\n" + + for name, type in zip(self.names, self.types): + output += name + output += " " * ((column_width + padding) - len(name)) + output += f"{type}\n" + + output = output.rstrip() + return output + + +def _get_size_bytes(block: Block) -> int: + block = BlockAccessor.for_block(block) + return block.size_bytes() + + +def _block_to_df(block: Block): + block = BlockAccessor.for_block(block) + return block.to_pandas() + + +def _block_to_ndarray(block: Block, column: Optional[str]): + block = BlockAccessor.for_block(block) + return block.to_numpy(column) + + +def _block_to_arrow(block: Block): + block = BlockAccessor.for_block(block) + return block.to_arrow() + + +def _sliding_window(iterable: Iterable, n: int): + """Creates an iterator consisting of n-width sliding windows over + iterable. 
The sliding windows are constructed lazily such that an + element on the base iterator (iterable) isn't consumed until the + first sliding window containing that element is reached. + + If n > len(iterable), then a single len(iterable) window is + returned. + + Args: + iterable: The iterable on which the sliding window will be + created. + n: The width of the sliding window. + + Returns: + An iterator of n-width windows over iterable. + If n > len(iterable), then a single len(iterable) window is + returned. + """ + it = iter(iterable) + window = collections.deque(itertools.islice(it, n), maxlen=n) + if len(window) > 0: + yield tuple(window) + for elem in it: + window.append(elem) + yield tuple(window) + + +def _do_write( + ds: Datasource, + ctx: DataContext, + blocks: List[Block], + meta: List[BlockMetadata], + ray_remote_args: Dict[str, Any], + write_args: Dict[str, Any], +) -> List[ObjectRef[WriteResult]]: + write_args = _unwrap_arrow_serialization_workaround(write_args) + DataContext._set_current(ctx) + return ds.do_write(blocks, meta, ray_remote_args=ray_remote_args, **write_args) diff --git a/python/ray/data/dataset_pipeline.py b/python/ray/data/dataset_pipeline.py index 1d3e2bb54151..4851b1c5792c 100644 --- a/python/ray/data/dataset_pipeline.py +++ b/python/ray/data/dataset_pipeline.py @@ -7,7 +7,6 @@ Any, Callable, Dict, - Generic, Iterable, Iterator, List, @@ -20,7 +19,6 @@ import ray from ray.air.util.data_batch_conversion import BlockFormat -from ray.data._internal import progress_bar from ray.data._internal.block_batching import batch_block_refs from ray.data._internal.block_list import BlockList from ray.data._internal.compute import ComputeStrategy @@ -32,26 +30,21 @@ PipelinedDataIterator, ) from ray.data._internal.plan import ExecutionPlan -from ray.data._internal.stats import DatasetPipelineStats, DatastreamStats +from ray.data._internal.stats import DatasetPipelineStats, DatasetStats from ray.data.block import ( - BatchUDF, + UserDefinedFunction, 
Block, DataBatch, - KeyFn, - RowUDF, - T, - U, _apply_strict_mode_batch_format, ) from ray.data.context import DataContext -from ray.data.datastream import Datastream +from ray.data.dataset import Dataset from ray.data.iterator import DataIterator from ray.data.datasource import Datasource from ray.data.datasource.file_based_datasource import ( BlockWritePathProvider, DefaultBlockWritePathProvider, ) -from ray.data.row import TableRow from ray.types import ObjectRef from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy @@ -73,35 +66,35 @@ @PublicAPI -class DatasetPipeline(Generic[T]): - """Implements a pipeline of Datastreams. +class DatasetPipeline: + """Implements a pipeline of Datasets. DatasetPipelines implement pipelined execution. This allows for the overlapped execution of data input (e.g., reading files), computation (e.g. feature preprocessing), and output (e.g., distributed ML training). - A DatasetPipeline can be created by either repeating a Datastream - (``ds.repeat(times=None)``), by turning a single Datastream into a pipeline + A DatasetPipeline can be created by either repeating a Dataset + (``ds.repeat(times=None)``), by turning a single Dataset into a pipeline (``ds.window(blocks_per_window=10)``), or defined explicitly using ``DatasetPipeline.from_iterable()``. - DatasetPipeline supports the all the per-record transforms of Datastreams + DatasetPipeline supports the all the per-record transforms of Datasets (e.g., map, flat_map, filter), holistic transforms (e.g., repartition), and output methods (e.g., iter_rows, to_tf, to_torch, write_datasource). 
""" def __init__( self, - base_iterable: Iterable[Callable[[], Datastream[T]]], - stages: List[Callable[[Datastream[Any]], Datastream[Any]]] = None, + base_iterable: Iterable[Callable[[], Dataset]], + stages: List[Callable[[Dataset], Dataset]] = None, length: Optional[int] = None, - progress_bars: bool = progress_bar._enabled, + progress_bars: bool = DataContext.get_current().enable_progress_bars, _executed: List[bool] = None, ): """Construct a DatasetPipeline (internal API). The constructor is not part of the DatasetPipeline API. Use the - ``Datastream.repeat()``, ``Datastream.window()``, or + ``Dataset.repeat()``, ``Dataset.window()``, or ``DatasetPipeline.from_iterable()`` methods to construct a pipeline. """ self._base_iterable = base_iterable @@ -113,19 +106,17 @@ def __init__( # Whether the pipeline execution has started. # This variable is shared across all pipelines descending from this. self._executed = _executed or [False] - self._first_datastream: Optional[Datastream] = None - self._remaining_datastreams_iter: Optional[ - Iterator[Callable[[], Datastream]] - ] = None + self._first_dataset: Optional[Dataset] = None + self._remaining_datasets_iter: Optional[Iterator[Callable[[], Dataset]]] = None self._schema = None self._stats = DatasetPipelineStats() def iterator(self) -> DataIterator: """Return a :class:`~ray.data.DataIterator` that - can be used to repeatedly iterate over the datastream. + can be used to repeatedly iterate over the dataset. - Note that each pass iterates over the entire original Datastream, even if - the datastream was windowed with ``.window()``. + Note that each pass iterates over the entire original Dataset, even if + the dataset was windowed with ``.window()``. 
Examples: >>> import ray @@ -140,13 +131,9 @@ def iterator(self) -> DataIterator: """ return PipelinedDataIterator(self) - def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]]: + def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Dict[str, Any]]: """Return a local row iterator over the data in the pipeline. - If the datastream is a tabular datastream (Arrow/Pandas blocks), dict-like - mappings :py:class:`~ray.data.row.TableRow` are yielded for each row by the - iterator. If the datastream is not tabular, the raw row is yielded. - Examples: >>> import ray >>> for i in ray.data.range(1000000).repeat(5).iter_rows(): # doctest: +SKIP @@ -162,7 +149,7 @@ def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]] A local iterator over the records in the pipeline. """ - def gen_rows() -> Iterator[Union[T, TableRow]]: + def gen_rows() -> Iterator[Dict[str, Any]]: time_start = time.perf_counter() for ds in self.iter_datasets(): @@ -208,12 +195,10 @@ def iter_batches( The final batch may include fewer than ``batch_size`` rows if ``drop_last`` is ``False``. Defaults to 256. batch_format: Specify ``"default"`` to use the default block format - (promotes tables to Pandas and tensors to NumPy), ``"pandas"`` to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or - ``"numpy"`` to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None to return - the underlying block exactly as is with no additional formatting. - The default is "default". + (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to + select ``pyarrow.Table``, or ``"numpy"`` to select + ``Dict[str, numpy.ndarray]``, or None to return the underlying block + exactly as is with no additional formatting. drop_last: Whether to drop the last batch if it's incomplete. 
local_shuffle_buffer_size: If non-None, the data will be randomly shuffled using a local in-memory shuffle buffer, and this value will serve as the @@ -238,9 +223,9 @@ def iter_batches( if self._executed[0]: raise RuntimeError("Pipeline cannot be read multiple times.") time_start = time.perf_counter() - if self._first_datastream is not None: + if self._first_dataset is not None: blocks_owned_by_consumer = ( - self._first_datastream._plan.execute()._owned_by_consumer + self._first_dataset._plan.execute()._owned_by_consumer ) else: blocks_owned_by_consumer = self._peek()._plan.execute()._owned_by_consumer @@ -267,7 +252,7 @@ def _iter_blocks(self) -> Iterator[ObjectRef[Block]]: def split( self, n: int, *, equal: bool = False, locality_hints: List[Any] = None - ) -> List["DatasetPipeline[T]"]: + ) -> List["DatasetPipeline"]: """Split the pipeline into ``n`` disjoint pipeline shards. This returns a list of sub-pipelines that can be passed to Ray tasks @@ -310,16 +295,16 @@ def split( ), ) - def split_at_indices(self, indices: List[int]) -> List["DatasetPipeline[T]"]: - """Split the datastreams within the pipeline at the given indices + def split_at_indices(self, indices: List[int]) -> List["DatasetPipeline"]: + """Split the datasets within the pipeline at the given indices (like np.split). - This will split each datastream contained within this pipeline, thereby + This will split each dataset contained within this pipeline, thereby producing len(indices) + 1 pipelines with the first pipeline containing - the [0, indices[0]) slice from each datastream, the second pipeline - containing the [indices[0], indices[1]) slice from each datastream, and so + the [0, indices[0]) slice from each dataset, the second pipeline + containing the [indices[0], indices[1]) slice from each dataset, and so on, with the final pipeline will containing the - [indices[-1], self.count()) slice from each datastream. + [indices[-1], self.count()) slice from each dataset. 
Examples: >>> import ray @@ -355,8 +340,8 @@ def split_at_indices(self, indices: List[int]) -> List["DatasetPipeline[T]"]: return self._split(len(indices) + 1, lambda ds: ds.split_at_indices(indices)) def _split( - self, n: int, splitter: Callable[[Datastream], List["Datastream[T]"]] - ) -> List["DatasetPipeline[T]"]: + self, n: int, splitter: Callable[[Dataset], List["Dataset"]] + ) -> List["DatasetPipeline"]: ctx = DataContext.get_current() scheduling_strategy = ctx.scheduling_strategy if not ray.util.client.ray.is_connected(): @@ -390,9 +375,7 @@ def __next__(self): tries = 0 while ds is None: ds = ray.get( - self.coordinator.next_datastream_if_ready.remote( - self.split_index - ) + self.coordinator.next_dataset_if_ready.remote(self.split_index) ) # Wait for other shards to catch up reading. if not ds: @@ -424,14 +407,14 @@ def __next__(self): def rewindow( self, *, blocks_per_window: int, preserve_epoch: bool = True - ) -> "DatasetPipeline[T]": - """Change the windowing (blocks per datastream) of this pipeline. + ) -> "DatasetPipeline": + """Change the windowing (blocks per dataset) of this pipeline. Changes the windowing of this pipeline to the specified size. For - example, if the current pipeline has two blocks per datastream, and - `.rewindow(blocks_per_window=4)` is requested, adjacent datastreams will - be merged until each datastream is 4 blocks. If - `.rewindow(blocks_per_window)` was requested the datastreams will be + example, if the current pipeline has two blocks per dataset, and + `.rewindow(blocks_per_window=4)` is requested, adjacent datasets will + be merged until each dataset is 4 blocks. If + `.rewindow(blocks_per_window)` was requested the datasets will be split into smaller windows. 
Args: @@ -443,9 +426,9 @@ def rewindow( class WindowIterator: def __init__(self, original_iter): self._original_iter = original_iter - self._buffer: Optional[Datastream[T]] = None + self._buffer: Optional[Dataset] = None - def __next__(self) -> Datastream[T]: + def __next__(self) -> Dataset: try: # Merge windows until we meet the requested window size. if self._buffer is None: @@ -490,7 +473,7 @@ def __iter__(self): length = None # The newly created DatasetPipeline will contain a PipelineExecutor (because - # this will execute the pipeline so far to iter the datastreams). In order to + # this will execute the pipeline so far to iter the datasets). In order to # make this new DatasetPipeline serializable, we need to make sure the # PipelineExecutor has not been iterated. So this uses # _iter_datasets_without_peek() instead of iter_datasets(). @@ -499,7 +482,7 @@ def __iter__(self): length=length, ) - def repeat(self, times: int = None) -> "DatasetPipeline[T]": + def repeat(self, times: int = None) -> "DatasetPipeline": """Repeat this pipeline a given number or times, or indefinitely. This operation is only allowed for pipelines of a finite length. An @@ -527,7 +510,7 @@ def __init__(self, original_iter): # This is calculated later. self._max_i = None - def __next__(self) -> Callable[[], Datastream[T]]: + def __next__(self) -> Callable[[], Dataset]: # Still going through the original pipeline. if self._original_iter: try: @@ -585,10 +568,10 @@ def __iter__(self): def schema( self, fetch_if_missing: bool = False ) -> Union[type, "pyarrow.lib.Schema"]: - """Return the schema of the datastream pipeline. + """Return the schema of the dataset pipeline. - For datastreams of Arrow records, this will return the Arrow schema. - For datastream of Python objects, this returns their Python type. + For datasets of Arrow records, this will return the Arrow schema. + For dataset of Python objects, this returns their Python type. 
Note: This is intended to be a method for peeking schema before the execution of DatasetPipeline. If execution has already started, @@ -610,7 +593,7 @@ def schema( return self._schema def dataset_format(self) -> BlockFormat: - """The format of the datastream pipeline's underlying data blocks. Possible + """The format of the dataset pipeline's underlying data blocks. Possible values are: "arrow", "pandas" and "simple". This may block; if the schema is unknown, this will synchronously fetch @@ -621,8 +604,8 @@ def dataset_format(self) -> BlockFormat: schema = self.schema(fetch_if_missing=True) if schema is None: raise ValueError( - "Datastream is empty or cleared, can't determine the format of " - "the datastream." + "Dataset is empty or cleared, can't determine the format of " + "the dataset." ) try: @@ -639,51 +622,57 @@ def dataset_format(self) -> BlockFormat: return BlockFormat.SIMPLE def count(self) -> int: - """Count the number of records in the datastream pipeline. + """Count the number of records in the dataset pipeline. This blocks until the entire pipeline is fully executed. - Time complexity: O(datastream size / parallelism) + Time complexity: O(dataset size / parallelism) Returns: - The number of records in the datastream pipeline. + The number of records in the dataset pipeline. """ if self._length == float("inf"): raise ValueError("Cannot count a pipeline of infinite length.") - pipe = self.map_batches(lambda batch: [len(batch)]) + def batch_len(batch): + key0 = list(batch.keys())[0] + return len(batch[key0]) + + pipe = self.map_batches(lambda batch: {"len": np.array([batch_len(batch)])}) total = 0 for elem in pipe.iter_rows(): - total += elem + total += elem["len"] return total def sum(self) -> int: - """Sum the records in the datastream pipeline. + """Sum the records in the dataset pipeline. This blocks until the entire pipeline is fully executed. 
- Time complexity: O(datastream size / parallelism) + Time complexity: O(dataset size / parallelism) Returns: - The sum of the records in the datastream pipeline. + The sum of the records in the dataset pipeline. """ if self._length == float("inf"): raise ValueError("Cannot sum a pipeline of infinite length.") - pipe = self.map_batches(lambda batch: [batch.sum()[0]], batch_format="pandas") + pipe = self.map_batches( + lambda batch: {"sum": np.array([batch.sum()[0]])}, batch_format="pandas" + ) total = 0 for elem in pipe.iter_rows(): - total += elem + total += elem["sum"] return total - def show_windows(self, limit_per_datastream: int = 10) -> None: - """Print up to the given number of records from each window/datastream. + def show_windows(self, limit_per_dataset: int = 10) -> None: + """Print up to the given number of records from each window/dataset. This is helpful as a debugging tool for understanding the structure of - datastream pipelines. + dataset pipelines. Args: - limit_per_datastream: Rows to print per window/datastream. + limit_per_dataset: Rows to print per window/dataset. """ epoch = None for i, ds in enumerate(self.iter_datasets()): @@ -691,12 +680,12 @@ def show_windows(self, limit_per_datastream: int = 10) -> None: epoch = ds._get_epoch() print("------ Epoch {} ------".format(epoch)) print("=== Window {} ===".format(i)) - ds.show(limit_per_datastream) + ds.show(limit_per_dataset) - def iter_epochs(self, max_epoch: int = -1) -> Iterator["DatasetPipeline[T]"]: + def iter_epochs(self, max_epoch: int = -1) -> Iterator["DatasetPipeline"]: """Split this pipeline up by epoch. - This allows reading of data per-epoch for repeated Datastreams, which is + This allows reading of data per-epoch for repeated Datasets, which is useful for ML training. For example, ``ray.data.range(10).repeat(50)`` generates a pipeline with 500 rows total split across 50 epochs. 
This method allows iterating over the data individually per epoch @@ -719,7 +708,7 @@ def iter_epochs(self, max_epoch: int = -1) -> Iterator["DatasetPipeline[T]"]: """ class Peekable: - def __init__(self, base_iter: Iterator[T]): + def __init__(self, base_iter: Iterator[Dataset]): self._iter = base_iter self._buffer = None @@ -731,13 +720,13 @@ def _fill_buffer_if_possible(self): except StopIteration: pass - def peek(self) -> T: + def peek(self) -> Dataset: self._fill_buffer_if_possible() if self._buffer is None: raise StopIteration return self._buffer - def __next__(self) -> T: + def __next__(self) -> Dataset: self._fill_buffer_if_possible() if self._buffer is None: raise StopIteration @@ -746,11 +735,11 @@ def __next__(self) -> T: return item class SingleEpochIterator: - def __init__(self, peekable_iter: Iterator[Datastream[T]], epoch: int): + def __init__(self, peekable_iter: Iterator[Dataset], epoch: int): self._iter = peekable_iter self._epoch = epoch - def __next__(self) -> Datastream[T]: + def __next__(self) -> Dataset: if self._iter.peek()._get_epoch() > self._epoch: raise StopIteration ds = next(self._iter) @@ -765,7 +754,7 @@ def __init__(self, pipe, max_epoch): self._cur_epoch = None self._max_epoch = max_epoch - def __next__(self) -> "DatasetPipeline[T]": + def __next__(self) -> "DatasetPipeline": if self._cur_epoch is None: self._cur_epoch = self._iter.peek()._get_epoch() else: @@ -793,12 +782,12 @@ def __iter__(self): def map( self, - fn: RowUDF, + fn: UserDefinedFunction[Dict[str, Any], Dict[str, Any]], *, compute: Union[str, ComputeStrategy] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.map ` to each datastream/window + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.map ` to each dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.map(fn, compute=compute, **ray_remote_args) @@ -806,7 +795,7 @@ def map( def map_batches( self, - fn: BatchUDF, + fn: UserDefinedFunction[DataBatch, 
DataBatch], *, batch_size: Optional[Union[int, Literal["default"]]] = "default", compute: Optional[Union[str, ComputeStrategy]] = None, @@ -816,9 +805,9 @@ def map_batches( fn_constructor_args: Optional[Iterable[Any]] = None, fn_constructor_kwargs: Optional[Dict[str, Any]] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.map_batches ` to each - datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.map_batches ` to each + dataset/window in this pipeline.""" batch_format = _apply_strict_mode_batch_format(batch_format) return self.foreach_window( @@ -837,26 +826,26 @@ def map_batches( def flat_map( self, - fn: RowUDF, + fn: UserDefinedFunction[Dict[str, Any], List[Dict[str, Any]]], *, compute: Union[str, ComputeStrategy] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.flat_map ` to each - datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.flat_map ` to each + dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.flat_map(fn, compute=compute, **ray_remote_args) ) def filter( self, - fn: RowUDF, + fn: UserDefinedFunction[Dict[str, Any], bool], *, compute: Union[str, ComputeStrategy] = None, **ray_remote_args, - ) -> "DatasetPipeline[T]": - """Apply :py:meth:`Datastream.filter ` to each - datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.filter ` to each + dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.filter(fn, compute=compute, **ray_remote_args) ) @@ -868,9 +857,9 @@ def add_column( *, compute: Optional[str] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.add_column ` to each - datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.add_column ` to each + dataset/window in this pipeline.""" return self.foreach_window( lambda ds: 
ds.add_column(col, fn, compute=compute, **ray_remote_args) ) @@ -881,9 +870,9 @@ def drop_columns( *, compute: Optional[str] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.drop_columns ` to - each datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.drop_columns ` to + each dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.drop_columns(cols, compute=compute, **ray_remote_args) ) @@ -894,18 +883,18 @@ def select_columns( *, compute: Optional[str] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.select_columns ` to - each datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.select_columns ` to + each dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.select_columns(cols, compute=compute, **ray_remote_args) ) def repartition_each_window( self, num_blocks: int, *, shuffle: bool = False - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.repartition ` to each - datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.repartition ` to each + dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.repartition(num_blocks, shuffle=shuffle) ) @@ -916,9 +905,9 @@ def random_shuffle_each_window( seed: Optional[int] = None, num_blocks: Optional[int] = None, **ray_remote_args, - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.random_shuffle ` to - each datastream/window in this pipeline.""" + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.random_shuffle ` to + each dataset/window in this pipeline.""" return self.foreach_window( lambda ds: ds.random_shuffle( seed=seed, num_blocks=num_blocks, **ray_remote_args @@ -926,17 +915,17 @@ def random_shuffle_each_window( ) def sort_each_window( - self, key: Optional[KeyFn] = None, descending: bool = False - ) -> "DatasetPipeline[U]": - 
"""Apply :py:meth:`Datastream.sort ` to each datastream/window + self, key: Optional[str] = None, descending: bool = False + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.sort ` to each dataset/window in this pipeline.""" return self.foreach_window(lambda ds: ds.sort(key, descending)) def randomize_block_order_each_window( self, *, seed: Optional[int] = None - ) -> "DatasetPipeline[U]": - """Apply :py:meth:`Datastream.randomize_block_order - ` to each datastream/window in this + ) -> "DatasetPipeline": + """Apply :py:meth:`Dataset.randomize_block_order + ` to each dataset/window in this pipeline.""" return self.foreach_window(lambda ds: ds.randomize_block_order(seed=seed)) @@ -952,9 +941,9 @@ def write_json( ray_remote_args: Dict[str, Any] = None, **pandas_json_args, ) -> None: - """Call :py:meth:`Datastream.write_json ` on each - output datastream of this pipeline.""" - self._write_each_datastream( + """Call :py:meth:`Dataset.write_json ` on each + output dataset of this pipeline.""" + self._write_each_dataset( lambda ds: ds.write_json( path, filesystem=filesystem, @@ -979,9 +968,9 @@ def write_csv( ray_remote_args: Dict[str, Any] = None, **arrow_csv_args, ) -> None: - """Call :py:meth:`Datastream.write_csv ` on each - output datastream of this pipeline.""" - self._write_each_datastream( + """Call :py:meth:`Dataset.write_csv ` on each + output dataset of this pipeline.""" + self._write_each_dataset( lambda ds: ds.write_csv( path, filesystem=filesystem, @@ -1006,9 +995,9 @@ def write_parquet( ray_remote_args: Dict[str, Any] = None, **arrow_parquet_args, ) -> None: - """Call :py:meth:`Datastream.write_parquet ` on - each output datastream of this pipeline.""" - self._write_each_datastream( + """Call :py:meth:`Dataset.write_parquet ` on + each output dataset of this pipeline.""" + self._write_each_dataset( lambda ds: ds.write_parquet( path, filesystem=filesystem, @@ -1031,9 +1020,9 @@ def write_tfrecords( block_path_provider: BlockWritePathProvider = 
DefaultBlockWritePathProvider(), ray_remote_args: Dict[str, Any] = None, ) -> None: - """Call :py:meth:`Datastream.write_tfrecords ` on - each output datastream of this pipeline.""" - self._write_each_datastream( + """Call :py:meth:`Dataset.write_tfrecords ` on + each output dataset of this pipeline.""" + self._write_each_dataset( lambda ds: ds.write_tfrecords( path, filesystem=filesystem, @@ -1046,14 +1035,14 @@ def write_tfrecords( def write_datasource( self, - datasource: Datasource[T], + datasource: Datasource, *, ray_remote_args: Dict[str, Any] = None, **write_args, ) -> None: - """Call :py:meth:`Datastream.write_datasource ` - on each output datastream of this pipeline.""" - self._write_each_datastream( + """Call :py:meth:`Dataset.write_datasource ` + on each output dataset of this pipeline.""" + self._write_each_dataset( lambda ds: ds.write_datasource( datasource, ray_remote_args=ray_remote_args, @@ -1061,20 +1050,27 @@ def write_datasource( ) ) - def take(self, limit: int = 20) -> List[T]: - """Call :py:meth:`Datastream.take ` over the stream of + def take(self, limit: int = 20) -> List[Dict[str, Any]]: + """Call :py:meth:`Dataset.take ` over the stream of output batches from the pipeline""" - return Datastream.take(self, limit) + return Dataset.take(self, limit) - def take_all(self, limit: Optional[int] = None) -> List[T]: - """Call :py:meth:`Datastream.take_all ` over the stream + def take_all(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """Call :py:meth:`Dataset.take_all ` over the stream of output batches from the pipeline""" - return Datastream.take_all(self, limit) + return Dataset.take_all(self, limit) + + def take_batch( + self, batch_size: int = 20, *, batch_format: Optional[str] = "default" + ) -> DataBatch: + """Call :py:meth:`Dataset.take_batch ` + over the stream of output batches from the pipeline""" + return Dataset.take_batch(self, batch_size, batch_format=batch_format) def show(self, limit: int = 20) -> None: - """Call 
:py:meth:`Datastream.show ` over the stream of + """Call :py:meth:`Dataset.show ` over the stream of output batches from the pipeline""" - return Datastream.show(self, limit) + return Dataset.show(self, limit) def iter_tf_batches( self, @@ -1087,7 +1083,7 @@ def iter_tf_batches( local_shuffle_seed: Optional[int] = None, ) -> Iterator[Union["tf.Tensor", Dict[str, "tf.Tensor"]]]: """Call - :py:meth:`Datastream.iter_tf_batches ` + :py:meth:`Dataset.iter_tf_batches ` over the stream of output batches from the pipeline.""" batch_format = _apply_strict_mode_batch_format(batch_format) return DataIterator.iter_tf_batches( @@ -1114,8 +1110,8 @@ def iter_torch_batches( local_shuffle_seed: Optional[int] = None, ) -> Iterator["TorchTensorBatchType"]: """Call - :py:meth:`Datastream.iter_torch_batches - ` over the stream of output batches + :py:meth:`Dataset.iter_torch_batches + ` over the stream of output batches from the pipeline.""" return DataIterator.iter_torch_batches( self, @@ -1140,7 +1136,7 @@ def to_tf( local_shuffle_buffer_size: Optional[int] = None, local_shuffle_seed: Optional[int] = None, ) -> "tf.data.Dataset": - """Call :py:meth:`Datastream.to_tf ` over the stream of + """Call :py:meth:`Dataset.to_tf ` over the stream of output batches from the pipeline""" return DataIterator.to_tf( self, @@ -1170,7 +1166,7 @@ def to_torch( unsqueeze_label_tensor: bool = True, unsqueeze_feature_tensors: bool = True, ) -> "torch.utils.data.IterableDataset": - """Call :py:meth:`Datastream.to_torch ` over the stream + """Call :py:meth:`Dataset.to_torch ` over the stream of output batches from the pipeline""" return DataIterator.to_torch( self, @@ -1190,17 +1186,17 @@ def _iter_datasets_without_peek(self): if self._executed[0]: raise RuntimeError("Pipeline cannot be read multiple times.") self._executed[0] = True - if self._first_datastream: + if self._first_dataset: raise RuntimeError("The pipeline has been peeked.") self._optimize_stages() return PipelineExecutor(self) 
@DeveloperAPI - def iter_datasets(self) -> Iterator[Datastream[T]]: - """Iterate over the output datastreams of this pipeline. + def iter_datasets(self) -> Iterator[Dataset]: + """Iterate over the output datasets of this pipeline. Returns: - Iterator over the datastreams outputted from this pipeline. + Iterator over the datasets outputted from this pipeline. """ if self._executed[0]: raise RuntimeError("Pipeline cannot be read multiple times.") @@ -1208,10 +1204,10 @@ def iter_datasets(self) -> Iterator[Datastream[T]]: self._optimize_stages() - # If the first datastream has already been executed (via a peek operation), then - # we don't re-execute the first datastream when iterating through the pipeline. - # We re-use the saved _first_datastream and _remaining_datastream_iter. - if self._first_datastream is not None: + # If the first dataset has already been executed (via a peek operation), then + # we don't re-execute the first dataset when iterating through the pipeline. + # We re-use the saved _first_dataset and _remaining_dataset_iter. + if self._first_dataset is not None: class _IterableWrapper(Iterable): """Wrapper that takes an iterator and converts it to an @@ -1223,26 +1219,24 @@ def __init__(self, base_iterator): def __iter__(self): return self.base_iterator - # Update the base iterable to skip the first datastream. + # Update the base iterable to skip the first dataset. # It is ok to update the base iterable here since # the pipeline can never be executed again. 
- self._base_iterable = _IterableWrapper(self._remaining_datastreams_iter) + self._base_iterable = _IterableWrapper(self._remaining_datasets_iter) - iter = itertools.chain([self._first_datastream], PipelineExecutor(self)) - self._first_datastream = None - self._remaining_datastreams_iter = None + iter = itertools.chain([self._first_dataset], PipelineExecutor(self)) + self._first_dataset = None + self._remaining_datasets_iter = None return iter else: return PipelineExecutor(self) @DeveloperAPI - def foreach_window( - self, fn: Callable[[Datastream[T]], Datastream[U]] - ) -> "DatasetPipeline[U]": - """Apply a transform to each datastream/window in this pipeline. + def foreach_window(self, fn: Callable[[Dataset], Dataset]) -> "DatasetPipeline": + """Apply a transform to each dataset/window in this pipeline. Args: - fn: The function to transform each datastream with. + fn: The function to transform each dataset with. Returns: The transformed DatasetPipeline. @@ -1271,13 +1265,13 @@ def stats(self, exclude_first_window: bool = True) -> str: @staticmethod def from_iterable( - iterable: Iterable[Callable[[], Datastream[T]]], - ) -> "DatasetPipeline[T]": - """Create a pipeline from an sequence of Datastream producing functions. + iterable: Iterable[Callable[[], Dataset]], + ) -> "DatasetPipeline": + """Create a pipeline from an sequence of Dataset producing functions. Args: iterable: A finite or infinite-length sequence of functions that - each produce a Datastream when called. + each produce a Dataset when called. """ if hasattr(iterable, "__len__"): length = len(iterable) @@ -1307,22 +1301,22 @@ def _optimize_stages(self): self._optimized_stages = self._stages return - # This dummy datastream will be used to get a set of optimized stages. - dummy_ds = Datastream( + # This dummy dataset will be used to get a set of optimized stages. 
+ dummy_ds = Dataset( ExecutionPlan( BlockList([], [], owned_by_consumer=True), - DatastreamStats(stages={}, parent=None), + DatasetStats(stages={}, parent=None), run_by_consumer=True, ), 0, True, ) - # Apply all pipeline operations to the dummy datastream. + # Apply all pipeline operations to the dummy dataset. for stage in self._stages: dummy_ds = stage(dummy_ds) # Get the optimized stages. _, _, stages = dummy_ds._plan._optimize() - # Apply these optimized stages to the datastreams underlying the pipeline. + # Apply these optimized stages to the datasets underlying the pipeline. # These optimized stages will be executed by the PipelineExecutor. optimized_stages = [] for stage in stages: @@ -1332,33 +1326,31 @@ def add_stage(ds, stage): return ds._plan.with_stage(stage) optimized_stages.append( - lambda ds, stage=stage: Datastream( - add_stage(ds, stage), ds._epoch, True - ) + lambda ds, stage=stage: Dataset(add_stage(ds, stage), ds._epoch, True) ) self._optimized_stages = optimized_stages - def _peek(self) -> Datastream[T]: - if self._first_datastream is None: - datastream_iter = iter(self._base_iterable) - first_datastream_gen = next(datastream_iter) + def _peek(self) -> Dataset: + if self._first_dataset is None: + dataset_iter = iter(self._base_iterable) + first_dataset_gen = next(dataset_iter) peek_pipe = DatasetPipeline( - base_iterable=[first_datastream_gen], + base_iterable=[first_dataset_gen], stages=self._stages.copy(), length=1, progress_bars=True, ) - # Cache the executed _first_datastream. - self._first_datastream = next(peek_pipe.iter_datasets()) - self._remaining_datastreams_iter = datastream_iter + # Cache the executed _first_dataset. + self._first_dataset = next(peek_pipe.iter_datasets()) + self._remaining_datasets_iter = dataset_iter # Store the stats from the peek pipeline. 
self._stats.add_pipeline_stats(peek_pipe._stats) - return self._first_datastream + return self._first_dataset - def _write_each_datastream(self, write_fn: Callable[[Datastream[T]], None]) -> None: - """Write output for each datastream. + def _write_each_dataset(self, write_fn: Callable[[Dataset], None]) -> None: + """Write output for each dataset. This is utility method used for write_json, write_csv, write_parquet, write_datasource, etc. diff --git a/python/ray/data/datasource/datasource.py b/python/ray/data/datasource/datasource.py index 5d39a96f158e..87761f0dbebb 100644 --- a/python/ray/data/datasource/datasource.py +++ b/python/ray/data/datasource/datasource.py @@ -1,11 +1,10 @@ import builtins from copy import copy -from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import numpy as np import ray -from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.util import _check_pyarrow_version @@ -13,7 +12,6 @@ Block, BlockAccessor, BlockMetadata, - T, ) from ray.data.context import DataContext from ray.types import ObjectRef @@ -23,11 +21,11 @@ @PublicAPI -class Datasource(Generic[T]): - """Interface for defining a custom ``ray.data.Datastream`` datasource. +class Datasource: + """Interface for defining a custom ``ray.data.Dataset`` datasource. - To read a datasource into a datastream, use ``ray.data.read_datasource()``. - To write to a writable datasource, use ``Datastream.write_datasource()``. + To read a datasource into a dataset, use ``ray.data.read_datasource()``. + To write to a writable datasource, use ``Dataset.write_datasource()``. See ``RangeDatasource`` and ``DummyOutputDatasource`` for examples of how to implement readable and writable datasources. 
@@ -36,7 +34,7 @@ class Datasource(Generic[T]): ``write()`` are called in remote tasks. """ - def create_reader(self, **read_args) -> "Reader[T]": + def create_reader(self, **read_args) -> "Reader": """Return a Reader for the given read arguments. The reader object will be responsible for querying the read metadata, and @@ -48,7 +46,7 @@ def create_reader(self, **read_args) -> "Reader[T]": return _LegacyDatasourceReader(self, **read_args) @Deprecated - def prepare_read(self, parallelism: int, **read_args) -> List["ReadTask[T]"]: + def prepare_read(self, parallelism: int, **read_args) -> List["ReadTask"]: """Deprecated: Please implement create_reader() instead.""" raise NotImplementedError @@ -131,11 +129,11 @@ def get_name(self) -> str: @PublicAPI -class Reader(Generic[T]): +class Reader: """A bound read operation for a datasource. This is a stateful class so that reads can be prepared in multiple stages. - For example, it is useful for Datastreams to know the in-memory size of the read + For example, it is useful for Datasets to know the in-memory size of the read prior to executing it. """ @@ -146,7 +144,7 @@ def estimate_inmemory_data_size(self) -> Optional[int]: """ raise NotImplementedError - def get_read_tasks(self, parallelism: int) -> List["ReadTask[T]"]: + def get_read_tasks(self, parallelism: int) -> List["ReadTask"]: """Execute the read and return read tasks. Args: @@ -169,13 +167,13 @@ def __init__(self, datasource: Datasource, **read_args): def estimate_inmemory_data_size(self) -> Optional[int]: return None - def get_read_tasks(self, parallelism: int) -> List["ReadTask[T]"]: + def get_read_tasks(self, parallelism: int) -> List["ReadTask"]: return self._datasource.prepare_read(parallelism, **self._read_args) @DeveloperAPI class ReadTask(Callable[[], Iterable[Block]]): - """A function used to read blocks from the datastream. + """A function used to read blocks from the dataset. 
Read tasks are generated by ``reader.get_read_tasks()``, and return a list of ``ray.data.Block`` when called. Initial metadata about the read @@ -223,7 +221,7 @@ def __call__(self) -> Iterable[Block]: @PublicAPI -class RangeDatasource(Datasource[Union[ArrowRow, int]]): +class RangeDatasource(Datasource): """An example datasource that generates ranges of numbers from [0..n). Examples: @@ -237,7 +235,7 @@ class RangeDatasource(Datasource[Union[ArrowRow, int]]): def create_reader( self, n: int, - block_format: str = "list", + block_format: str = "arrow", tensor_shape: Tuple = (1,), column_name: Optional[str] = None, ) -> List[ReadTask]: @@ -340,7 +338,7 @@ def make_block(start: int, count: int) -> Block: @DeveloperAPI -class DummyOutputDatasource(Datasource[Union[ArrowRow, int]]): +class DummyOutputDatasource(Datasource): """An example implementation of a writable datasource for testing. Examples: @@ -400,7 +398,7 @@ def on_write_failed( @DeveloperAPI -class RandomIntRowDatasource(Datasource[ArrowRow]): +class RandomIntRowDatasource(Datasource): """An example datasource that generates rows with random int64 columns. Examples: diff --git a/python/ray/data/datasource/file_based_datasource.py b/python/ray/data/datasource/file_based_datasource.py index 03f514fadc50..c73938b1ffea 100644 --- a/python/ray/data/datasource/file_based_datasource.py +++ b/python/ray/data/datasource/file_based_datasource.py @@ -21,7 +21,6 @@ import numpy as np from ray.air._internal.remote_storage import _is_local_windows_path -from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.output_buffer import BlockOutputBuffer @@ -63,7 +62,7 @@ @DeveloperAPI class BlockWritePathProvider: """Abstract callable that provides concrete output paths when writing - datastream blocks. + dataset blocks. 
Current subclasses: DefaultBlockWritePathProvider @@ -74,32 +73,32 @@ def _get_write_path_for_block( base_path: str, *, filesystem: Optional["pyarrow.fs.FileSystem"] = None, - datastream_uuid: Optional[str] = None, + dataset_uuid: Optional[str] = None, block: Optional[Block] = None, block_index: Optional[int] = None, file_format: Optional[str] = None, ) -> str: """ - Resolves and returns the write path for the given datastream block. When + Resolves and returns the write path for the given dataset block. When implementing this method, care should be taken to ensure that a unique - path is provided for every datastream block. + path is provided for every dataset block. Args: - base_path: The base path to write the datastream block out to. This is - expected to be the same for all blocks in the datastream, and may + base_path: The base path to write the dataset block out to. This is + expected to be the same for all blocks in the dataset, and may point to either a directory or file prefix. filesystem: The filesystem implementation that will be used to write a file out to the write path returned. - datastream_uuid: Unique identifier for the datastream that this block + dataset_uuid: Unique identifier for the dataset that this block belongs to. block: The block to write. block_index: Ordered index of the block to write within its parent - datastream. + dataset. file_format: File format string for the block that can be used as the file extension in the write path returned. Returns: - The datastream block write path. + The dataset block write path. 
""" raise NotImplementedError @@ -108,7 +107,7 @@ def __call__( base_path: str, *, filesystem: Optional["pyarrow.fs.FileSystem"] = None, - datastream_uuid: Optional[str] = None, + dataset_uuid: Optional[str] = None, block: Optional[Block] = None, block_index: Optional[int] = None, file_format: Optional[str] = None, @@ -116,7 +115,7 @@ def __call__( return self._get_write_path_for_block( base_path, filesystem=filesystem, - datastream_uuid=datastream_uuid, + dataset_uuid=dataset_uuid, block=block, block_index=block_index, file_format=file_format, @@ -126,8 +125,8 @@ def __call__( @DeveloperAPI class DefaultBlockWritePathProvider(BlockWritePathProvider): """Default block write path provider implementation that writes each - datastream block out to a file of the form: - {base_path}/{datastream_uuid}_{block_index}.{file_format} + dataset block out to a file of the form: + {base_path}/{dataset_uuid}_{block_index}.{file_format} """ def _get_write_path_for_block( @@ -135,12 +134,12 @@ def _get_write_path_for_block( base_path: str, *, filesystem: Optional["pyarrow.fs.FileSystem"] = None, - datastream_uuid: Optional[str] = None, + dataset_uuid: Optional[str] = None, block: Optional[ObjectRef[Block]] = None, block_index: Optional[int] = None, file_format: Optional[str] = None, ) -> str: - suffix = f"{datastream_uuid}_{block_index:06}.{file_format}" + suffix = f"{dataset_uuid}_{block_index:06}.{file_format}" # Uses POSIX path for cross-filesystem compatibility, since PyArrow # FileSystem paths are always forward slash separated, see: # https://arrow.apache.org/docs/python/filesystems.html @@ -190,7 +189,7 @@ def __repr__(self): @DeveloperAPI -class FileBasedDatasource(Datasource[Union[ArrowRow, Any]]): +class FileBasedDatasource(Datasource): """File-based datasource, for reading and writing files. 
This class should not be used directly, and should instead be subclassed @@ -277,7 +276,7 @@ def write( blocks: Iterable[Block], ctx: TaskContext, path: str, - datastream_uuid: str, + dataset_uuid: str, filesystem: Optional["pyarrow.fs.FileSystem"] = None, try_create_dir: bool = True, open_stream_args: Optional[Dict[str, Any]] = None, @@ -332,7 +331,7 @@ def write_block(write_path: str, block: Block): write_path = block_path_provider( path, filesystem=filesystem, - datastream_uuid=datastream_uuid, + dataset_uuid=dataset_uuid, block=block, block_index=ctx.task_idx, file_format=file_format, @@ -427,6 +426,7 @@ def estimate_inmemory_data_size(self) -> Optional[int]: def get_read_tasks(self, parallelism: int) -> List[ReadTask]: import numpy as np + ctx = DataContext.get_current() open_stream_args = self._open_stream_args reader_args = self._reader_args partitioning = self._partitioning @@ -447,9 +447,9 @@ def read_files( read_paths: List[str], fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper], ) -> Iterable[Block]: + DataContext._set_current(ctx) logger.debug(f"Reading {len(read_paths)} files.") fs = _unwrap_s3_serialization_workaround(filesystem) - ctx = DataContext.get_current() output_buffer = BlockOutputBuffer( block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size ) diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index 1242d6ba5aab..972053d77bab 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -25,7 +25,7 @@ @DeveloperAPI class FileMetadataProvider: - """Abstract callable that provides metadata for the files of a single datastream block. + """Abstract callable that provides metadata for the files of a single dataset block. Current subclasses: BaseFileMetadataProvider @@ -40,10 +40,10 @@ def _get_block_metadata( ) -> BlockMetadata: """Resolves and returns block metadata for files in the given paths. 
- All file paths provided should belong to a single datastream block. + All file paths provided should belong to a single dataset block. Args: - paths: The file paths for a single datastream block. + paths: The file paths for a single dataset block. schema: The user-provided or inferred schema for the given paths, if any. @@ -80,10 +80,10 @@ def _get_block_metadata( rows_per_file: Optional[int], file_sizes: List[Optional[int]], ) -> BlockMetadata: - """Resolves and returns block metadata for files of a single datastream block. + """Resolves and returns block metadata for files of a single dataset block. Args: - paths: The file paths for a single datastream block. These + paths: The file paths for a single dataset block. These paths will always be a subset of those previously returned from `expand_paths()`. schema: The user-provided or inferred schema for the given file @@ -206,7 +206,7 @@ def expand_paths( class ParquetMetadataProvider(FileMetadataProvider): """Abstract callable that provides block metadata for Arrow Parquet file fragments. - All file fragments should belong to a single datastream block. + All file fragments should belong to a single dataset block. Supports optional pre-fetching of ordered metadata for all file fragments in a single batch to help optimize metadata resolution. @@ -223,10 +223,10 @@ def _get_block_metadata( pieces: List["pyarrow.dataset.ParquetFileFragment"], prefetched_metadata: Optional[List[Any]], ) -> BlockMetadata: - """Resolves and returns block metadata for files of a single datastream block. + """Resolves and returns block metadata for files of a single dataset block. Args: - paths: The file paths for a single datastream block. + paths: The file paths for a single dataset block. schema: The user-provided or inferred schema for the given file paths, if any. pieces: The Parquet file fragments derived from the input file paths. 
@@ -269,7 +269,7 @@ class DefaultParquetMetadataProvider(ParquetMetadataProvider): """The default file metadata provider for ParquetDatasource. Aggregates total block bytes and number of rows using the Parquet file metadata - associated with a list of Arrow Parquet datastream file fragments. + associated with a list of Arrow Parquet dataset file fragments. """ def _get_block_metadata( diff --git a/python/ray/data/datasource/numpy_datasource.py b/python/ray/data/datasource/numpy_datasource.py index d5691ce68fa8..e81471f24e29 100644 --- a/python/ray/data/datasource/numpy_datasource.py +++ b/python/ray/data/datasource/numpy_datasource.py @@ -4,7 +4,6 @@ import numpy as np import ray -from ray.air.constants import TENSOR_COLUMN_NAME from ray.data.block import BlockAccessor from ray.data.datasource.file_based_datasource import FileBasedDatasource from typing import Optional @@ -40,7 +39,7 @@ def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args): data = f.readall() buf.write(data) buf.seek(0) - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: return BlockAccessor.batch_to_block( {"data": np.load(buf, allow_pickle=True)} @@ -55,7 +54,6 @@ def _convert_block_to_tabular_block( column_name = self._COLUMN_NAME column_names = block.column_names - assert column_names[0] == TENSOR_COLUMN_NAME column_names[0] = column_name return block.rename_columns(column_names) diff --git a/python/ray/data/datasource/parquet_datasource.py b/python/ray/data/datasource/parquet_datasource.py index 668f8ea69074..c98eaeec52b2 100644 --- a/python/ray/data/datasource/parquet_datasource.py +++ b/python/ray/data/datasource/parquet_datasource.py @@ -41,7 +41,7 @@ # compared to Parquet encoded representation. Parquet file statistics only record # encoded (i.e. uncompressed) data size information. 
# -# To estimate real-time in-memory data size, Datastreams will try to estimate the +# To estimate real-time in-memory data size, Datasets will try to estimate the # correct inflation ratio from Parquet to Arrow, using this constant as the default # value for safety. See https://github.com/ray-project/ray/pull/26516 for more context. PARQUET_ENCODING_RATIO_ESTIMATE_DEFAULT = 5 @@ -49,11 +49,11 @@ # The lower bound size to estimate Parquet encoding ratio. PARQUET_ENCODING_RATIO_ESTIMATE_LOWER_BOUND = 2 -# The percentage of files (1% by default) to be sampled from the datastream to estimate +# The percentage of files (1% by default) to be sampled from the dataset to estimate # Parquet encoding ratio. PARQUET_ENCODING_RATIO_ESTIMATE_SAMPLING_RATIO = 0.01 -# The minimal and maximal number of file samples to take from the datastream to estimate +# The minimal and maximal number of file samples to take from the dataset to estimate # Parquet encoding ratio. # This is to restrict `PARQUET_ENCODING_RATIO_ESTIMATE_SAMPLING_RATIO` within the # proper boundary. @@ -146,8 +146,8 @@ class ParquetDatasource(ParquetBaseDatasource): """Parquet datasource, for reading and writing Parquet files. The primary difference from ParquetBaseDatasource is that this uses - PyArrow's `ParquetDataset` abstraction for datastream reads, and thus offers - automatic Arrow datastream schema inference and row count collection at the + PyArrow's `ParquetDataset` abstraction for dataset reads, and thus offers + automatic Arrow dataset schema inference and row count collection at the cost of some potential performance and/or compatibility penalties. Examples: @@ -217,14 +217,14 @@ def __init__( ) if _block_udf is not None: - # Try to infer datastream schema by passing dummy table through UDF. + # Try to infer dataset schema by passing dummy table through UDF. 
dummy_table = schema.empty_table() try: inferred_schema = _block_udf(dummy_table).schema inferred_schema = inferred_schema.with_metadata(schema.metadata) except Exception: logger.debug( - "Failed to infer schema of datastream by passing dummy table " + "Failed to infer schema of dataset by passing dummy table " "through UDF due to the following exception:", exc_info=True, ) diff --git a/python/ray/data/datasource/partitioning.py b/python/ray/data/datasource/partitioning.py index 0554a50c4a77..30462514c36a 100644 --- a/python/ray/data/datasource/partitioning.py +++ b/python/ray/data/datasource/partitioning.py @@ -17,7 +17,7 @@ @DeveloperAPI class PartitionStyle(str, Enum): - """Supported datastream partition styles. + """Supported dataset partition styles. Inherits from `str` to simplify plain text serialization/deserialization. @@ -41,7 +41,7 @@ class Partitioning: """Partition scheme used to describe path-based partitions. Path-based partition formats embed all partition keys and values directly in - their datastream file paths. + their dataset file paths. """ #: The partition style - may be either HIVE or DIRECTORY. @@ -53,7 +53,7 @@ class Partitioning: #: directories. base_dir: Optional[str] = None #: The partition key field names (i.e. column names for tabular - #: datastreams). When non-empty, the order and length of partition key + #: datasets). When non-empty, the order and length of partition key #: field names must match the order and length of partition values. #: Required when parsing DIRECTORY partitioned paths or generating #: HIVE partitioned paths. @@ -112,7 +112,7 @@ class PathPartitionEncoder: """Callable that generates directory path strings for path-based partition formats. Path-based partition formats embed all partition keys and values directly in - their datastream file paths. + their dataset file paths. Two path partition formats are currently supported - HIVE and DIRECTORY. 
@@ -140,7 +140,7 @@ def of( base_dir: "/"-delimited base directory that all partition paths will be generated under (exclusive). field_names: The partition key field names (i.e. column names for tabular - datastreams). Required for HIVE partition paths, optional for DIRECTORY + datasets). Required for HIVE partition paths, optional for DIRECTORY partition paths. When non-empty, the order and length of partition key field names must match the order and length of partition values. filesystem: Filesystem that will be used for partition path file I/O. @@ -229,7 +229,7 @@ class PathPartitionParser: """Partition parser for path-based partition formats. Path-based partition formats embed all partition keys and values directly in - their datastream file paths. + their dataset file paths. Two path partition formats are currently supported - HIVE and DIRECTORY. @@ -274,7 +274,7 @@ def of( Optional for HIVE partitioning. When non-empty, the order and length of partition key field names must match the order and length of partition directories discovered. Partition key field names are not required to - exist in the datastream schema. + exist in the dataset schema. filesystem: Filesystem that will be used for partition path file I/O. Returns: @@ -452,7 +452,7 @@ def do_assert(val, msg): Optional for HIVE partitioning. When non-empty, the order and length of partition key field names must match the order and length of partition directories discovered. Partition key field names are not required to - exist in the datastream schema. + exist in the dataset schema. filesystem: Filesystem that will be used for partition path file I/O. 
Returns: diff --git a/python/ray/data/datasource/sql_datasource.py b/python/ray/data/datasource/sql_datasource.py index 9071069dc136..a46ce81383e7 100644 --- a/python/ray/data/datasource/sql_datasource.py +++ b/python/ray/data/datasource/sql_datasource.py @@ -2,7 +2,6 @@ from contextlib import contextmanager from typing import Any, Callable, Iterator, Iterable, List, Optional -from ray.data._internal.arrow_block import ArrowRow from ray.data.block import Block, BlockAccessor, BlockMetadata from ray.data.datasource.datasource import Datasource, Reader, ReadTask from ray.util.annotations import PublicAPI @@ -23,7 +22,7 @@ def _cursor_to_block(cursor) -> Block: @PublicAPI(stability="alpha") -class SQLDatasource(Datasource[ArrowRow]): +class SQLDatasource(Datasource): def __init__(self, connection_factory: Callable[[], Connection]): self.connection_factory = connection_factory diff --git a/python/ray/data/datasource/webdataset_datasource.py b/python/ray/data/datasource/webdataset_datasource.py index 431dea55e6d6..6020cda47717 100644 --- a/python/ray/data/datasource/webdataset_datasource.py +++ b/python/ray/data/datasource/webdataset_datasource.py @@ -295,17 +295,17 @@ def _make_iterable(block: BlockAccessor): This is a placeholder for dealing with more complex blocks. Args: - block: Ray Datastream block + block: Ray Dataset block Returns: Iterable[Dict[str,Any]]: Iterable of samples """ - return block.iter_rows() + return block.iter_rows(public_row_format=False) @PublicAPI(stability="alpha") class WebDatasetDatasource(FileBasedDatasource): - """A Datasource for WebDataset datastreams (tar format with naming conventions).""" + """A Datasource for WebDataset datasets (tar format with naming conventions).""" _FILE_EXTENSION = "tar" @@ -337,6 +337,7 @@ def _read_stream( Yields: List[Dict[str, Any]]: List of sample (list of length 1). 
""" + import pandas as pd files = _tar_file_iterator( stream, @@ -348,7 +349,7 @@ def _read_stream( for sample in samples: if decoder is not None: sample = _apply_list(decoder, sample, default=_default_decoder) - yield [sample] + yield pd.DataFrame({k: [v] for k, v in sample.items()}) def _write_block( self, diff --git a/python/ray/data/datastream.py b/python/ray/data/datastream.py deleted file mode 100644 index 82c0dcc0be28..000000000000 --- a/python/ray/data/datastream.py +++ /dev/null @@ -1,4749 +0,0 @@ -import collections -import itertools -import logging -import sys -import time -import html -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Generic, - Iterable, - Iterator, - List, - Type, - Optional, - Tuple, - Union, -) -from uuid import uuid4 - -import numpy as np - -import ray -from ray.air.util.tensor_extensions.utils import _create_possibly_ragged_ndarray -import ray.cloudpickle as pickle -from ray._private.usage import usage_lib -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.util.data_batch_conversion import BlockFormat -from ray.data._internal.logical.operators.all_to_all_operator import ( - RandomShuffle, - RandomizeBlocks, - Repartition, - Sort, -) -from ray.data._internal.logical.operators.n_ary_operator import Zip -from ray.data._internal.logical.optimizers import LogicalPlan -from ray.data._internal.logical.operators.map_operator import ( - Filter, - FlatMap, - MapRows, - MapBatches, -) -from ray.data._internal.logical.operators.write_operator import Write -from ray.data._internal.planner.filter import generate_filter_fn -from ray.data._internal.planner.flat_map import generate_flat_map_fn -from ray.data._internal.planner.map_batches import generate_map_batches_fn -from ray.data._internal.planner.map_rows import generate_map_rows_fn -from ray.data._internal.planner.write import generate_write_fn -from ray.data.iterator import DataIterator -from ray.data._internal.block_list import BlockList -from 
ray.data._internal.iterator.iterator_impl import ( - DataIteratorImpl, -) -from ray.data._internal.iterator.stream_split_iterator import ( - StreamSplitDataIterator, -) -from ray.data._internal.compute import ( - ActorPoolStrategy, - CallableClass, - ComputeStrategy, - TaskPoolStrategy, -) -from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder -from ray.data._internal.equalize import _equalize -from ray.data._internal.lazy_block_list import LazyBlockList -from ray.data._internal.util import ( - _estimate_available_parallelism, - _is_local_scheme, - ConsumptionAPI, -) -from ray.data._internal.pandas_block import PandasBlockSchema -from ray.data._internal.plan import ( - ExecutionPlan, - OneToOneStage, -) -from ray.data._internal.stage_impl import ( - RandomizeBlocksStage, - RepartitionStage, - RandomShuffleStage, - ZipStage, - SortStage, -) -from ray.data._internal.progress_bar import ProgressBar -from ray.data._internal.remote_fn import cached_remote_fn -from ray.data._internal.split import _split_at_index, _split_at_indices, _get_num_rows -from ray.data._internal.stats import DatastreamStats, DatastreamStatsSummary -from ray.data.aggregate import AggregateFn, Max, Mean, Min, Std, Sum -from ray.data.block import ( - VALID_BATCH_FORMATS, - _apply_strict_mode_batch_format, - BatchUDF, - Block, - BlockAccessor, - BlockMetadata, - BlockPartition, - DataBatch, - FlatMapUDF, - KeyFn, - RowUDF, - T, - U, - _validate_key_fn, -) -from ray.data.context import ( - DataContext, - WARN_PREFIX, - OK_PREFIX, - ESTIMATED_SAFE_MEMORY_FRACTION, - DEFAULT_BATCH_SIZE, -) -from ray.data.datasource import ( - BlockWritePathProvider, - CSVDatasource, - Datasource, - DefaultBlockWritePathProvider, - JSONDatasource, - NumpyDatasource, - ParquetDatasource, - ReadTask, - TFRecordDatasource, - WriteResult, -) -from ray.data.datasource.file_based_datasource import ( - _unwrap_arrow_serialization_workaround, - _wrap_arrow_serialization_workaround, -) -from 
ray.data.random_access_dataset import RandomAccessDataset -from ray.data.row import TableRow -from ray.types import ObjectRef -from ray.util.annotations import DeveloperAPI, PublicAPI, Deprecated -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -from ray.widgets import Template -from ray.widgets.util import ensure_notebook_deps, fallback_if_colab - -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - -if TYPE_CHECKING: - import dask - import mars - import modin - import pandas - import pyarrow - import pyspark - import tensorflow as tf - import torch - import torch.utils.data - - from ray.data.dataset_pipeline import DatasetPipeline - from ray.data.grouped_dataset import GroupedData - from ray.data._internal.execution.interfaces import Executor, NodeIdStr - from ray.data._internal.torch_iterable_dataset import TorchTensorBatchType - from tensorflow_metadata.proto.v0 import schema_pb2 - - -logger = logging.getLogger(__name__) - -TensorflowFeatureTypeSpec = Union[ - "tf.TypeSpec", List["tf.TypeSpec"], Dict[str, "tf.TypeSpec"] -] - -TensorFlowTensorBatchType = Union["tf.Tensor", Dict[str, "tf.Tensor"]] - - -@PublicAPI -class Datastream(Generic[T]): - """A Datastream is a distributed data collection for data loading and processing. - - Datastreams are distributed streams that produce ``ObjectRef[Block]`` outputs, - where each block holds an ordered collection of items, representing a shard of the - overall data collection. The block can be either a ``pyarrow.Table``, or Python - list. The block also determines the unit of parallelism. - - Datastreams can be created in multiple ways: from synthetic data via ``range_*()`` - APIs, from existing memory data via ``from_*()`` APIs (this creates a subclass - of Datastream called ``MaterializedDatastream``), or from external storage - systems such as local disk, S3, HDFS etc. via the ``read_*()`` APIs. 
The - (potentially processed) Datastream can be saved back to external storage systems - via the ``write_*()`` APIs. - - Examples: - >>> import ray - >>> # Create datastream from synthetic data. - >>> ds = ray.data.range(1000) - >>> # Create datastream from in-memory data. - >>> ds = ray.data.from_items( - ... [{"col1": i, "col2": i * 2} for i in range(1000)]) - >>> # Create datastream from external storage system. - >>> ds = ray.data.read_parquet("s3://bucket/path") # doctest: +SKIP - >>> # Save datastream back to external storage system. - >>> ds.write_csv("s3://bucket/output") # doctest: +SKIP - - Datastream has two kinds of operations: transformation, which takes in Datastream - and outputs a new Datastream (e.g. :py:meth:`.map_batches()`); and consumption, - which produces values (not Datatream) as output (e.g. :py:meth:`.iter_batches()`). - - Datastream transformations are lazy, with execution of the transformations being - triggered by downstream consumption. - - Datastream supports parallel processing at scale: transformations such as - :py:meth:`.map_batches()`, aggregations such as - :py:meth:`.min()`/:py:meth:`.max()`/:py:meth:`.mean()`, grouping via - :py:meth:`.groupby()`, shuffling operations such as :py:meth:`.sort()`, - :py:meth:`.random_shuffle()`, and :py:meth:`.repartition()`. - - Examples: - >>> import ray - >>> ds = ray.data.range(1000) - >>> # Transform in parallel with map_batches(). - >>> ds.map_batches(lambda batch: [v * 2 for v in batch]) - MapBatches() - +- Datastream(num_blocks=17, num_rows=1000, schema=) - >>> # Compute maximum - >>> ds.max() - 999 - >>> # Group the data. - >>> ds.groupby(lambda x: x % 3).count() - Aggregate - +- Datastream(num_blocks=..., num_rows=1000, schema=) - >>> # Shuffle this datastream randomly. - >>> ds.random_shuffle() - RandomShuffle - +- Datastream(num_blocks=..., num_rows=1000, schema=) - >>> # Sort it back in order. 
- >>> ds.sort() - Sort - +- Datastream(num_blocks=..., num_rows=1000, schema=) - - Both unexecuted and materialized Datastreams can be passed between Ray tasks and - actors without incurring a copy. Datastream supports conversion to/from several more - featureful dataframe libraries (e.g., Spark, Dask, Modin, MARS), and are also - compatible with distributed - TensorFlow / PyTorch. - """ - - def __init__( - self, - plan: ExecutionPlan, - epoch: int, - lazy: bool = True, - logical_plan: Optional[LogicalPlan] = None, - ): - """Construct a Datastream (internal API). - - The constructor is not part of the Datastream API. Use the ``ray.data.*`` - read methods to construct a datastream. - """ - assert isinstance(plan, ExecutionPlan) - usage_lib.record_library_usage("dataset") # Legacy telemetry name. - - self._plan = plan - self._uuid = uuid4().hex - self._epoch = epoch - self._lazy = lazy - self._logical_plan = logical_plan - if logical_plan is not None: - self._plan.link_logical_plan(logical_plan) - - if not lazy: - self._plan.execute(allow_clear_input_blocks=False) - - # Handle to currently running executor for this datastream. - self._current_executor: Optional["Executor"] = None - - @staticmethod - def copy( - ds: "Datastream[T]", _deep_copy: bool = False, _as: Optional[type] = None - ) -> "Datastream[T]": - if not _as: - _as = Datastream - if _deep_copy: - return _as(ds._plan.deep_copy(), ds._epoch, ds._lazy, ds._logical_plan) - else: - return _as(ds._plan.copy(), ds._epoch, ds._lazy, ds._logical_plan) - - def map( - self, - fn: RowUDF[T, U], - *, - compute: Union[str, ComputeStrategy] = None, - **ray_remote_args, - ) -> "Datastream[U]": - """Apply the given function to each record of this datastream. - - Note that mapping individual records can be quite slow. Consider using - `.map_batches()` for performance. - - Examples: - >>> import ray - >>> # Transform python objects. 
- >>> ds = ray.data.range(1000) - >>> ds.map(lambda x: x * 2) - Map - +- Datastream(num_blocks=..., num_rows=1000, schema=) - >>> # Transform Arrow records. - >>> ds = ray.data.from_items( - ... [{"value": i} for i in range(1000)]) - >>> ds.map(lambda record: {"v2": record["value"] * 2}) - Map - +- Datastream(num_blocks=200, num_rows=1000, schema={value: int64}) - >>> # Define a callable class that persists state across - >>> # function invocations for efficiency. - >>> init_model = ... # doctest: +SKIP - >>> class CachedModel: - ... def __init__(self): - ... self.model = init_model() - ... def __call__(self, batch): - ... return self.model(batch) - >>> # Apply the transform in parallel on GPUs. Since - >>> # compute=ActorPoolStrategy(size=8) the transform will be applied on a - >>> # pool of 8 Ray actors, each allocated 1 GPU by Ray. - >>> from ray.data._internal.compute import ActorPoolStrategy - >>> ds.map(CachedModel, # doctest: +SKIP - ... compute=ActorPoolStrategy(size=8), - ... num_gpus=1) - - Time complexity: O(datastream size / parallelism) - - Args: - fn: The function to apply to each record, or a class type - that can be instantiated to create such a callable. Callable classes are - only supported for the actor compute strategy. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). - - .. seealso:: - - :meth:`~Datastream.flat_map`: - Call this method to create new records from existing ones. Unlike - :meth:`~Datastream.map`, a function passed to - :meth:`~Datastream.flat_map` can return multiple records. - - :meth:`~Datastream.flat_map` isn't recommended because it's slow; call - :meth:`~Datastream.map_batches` instead. 
- - :meth:`~Datastream.map_batches` - Call this method to transform batches of data. It's faster and more - flexible than :meth:`~Datastream.map` and :meth:`~Datastream.flat_map`. - """ - if isinstance(fn, CallableClass) and ( - compute is None - or compute == "tasks" - or isinstance(compute, TaskPoolStrategy) - ): - raise ValueError( - "``compute`` must be specified when using a CallableClass, and must " - f"specify the actor compute strategy, but got: {compute}. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." - ) - - self._warn_slow() - - transform_fn = generate_map_rows_fn() - - plan = self._plan.with_stage( - OneToOneStage( - "Map", - transform_fn, - compute, - ray_remote_args, - fn=fn, - ) - ) - - logical_plan = self._logical_plan - if logical_plan is not None: - map_op = MapRows( - logical_plan.dag, - fn, - compute=compute, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(map_op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def map_batches( - self, - fn: BatchUDF, - *, - batch_size: Optional[Union[int, Literal["default"]]] = "default", - compute: Optional[Union[str, ComputeStrategy]] = None, - batch_format: Optional[str] = "default", - zero_copy_batch: bool = False, - fn_args: Optional[Iterable[Any]] = None, - fn_kwargs: Optional[Dict[str, Any]] = None, - fn_constructor_args: Optional[Iterable[Any]] = None, - fn_constructor_kwargs: Optional[Dict[str, Any]] = None, - **ray_remote_args, - ) -> "Datastream[Any]": - """Apply the given function to batches of data. - - This applies the ``fn`` in parallel with map tasks, with each task handling - a block or a bundle of blocks of the datastream. Each batch is executed serially - at Ray level (at lower level, the processing of the batch is usually - vectorized). - - Batches are represented as dataframes, ndarrays, or lists. The default batch - type is determined by your datastream's schema. 
To determine the default batch - type, call :meth:`~Datastream.default_batch_format`. Alternatively, set the batch - type with ``batch_format``. - - To learn more about writing functions for :meth:`~Datastream.map_batches`, read - :ref:`writing user-defined functions `. - - .. tip:: - If you have a small number of big blocks, it may limit parallelism. You may - consider increasing the number of blocks via ``.repartition()`` before - applying ``.map_batches()``. - - .. tip:: - If ``fn`` does not mutate its input, set ``zero_copy_batch=True`` to elide a - batch copy, which can improve performance and decrease memory utilization. - ``fn`` will then receive zero-copy read-only batches. - If ``fn`` mutates its input, you will need to ensure that the batch provided - to ``fn`` is writable by setting ``zero_copy_batch=False`` (default). This - will create an extra, mutable copy of each batch before handing it to - ``fn``. - - .. note:: - The size of the batches provided to ``fn`` may be smaller than the provided - ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent to - a given map task. When ``batch_size`` is specified, each map task will be - sent a single block if the block is equal to or larger than ``batch_size``, - and will be sent a bundle of blocks up to (but not exceeding) - ``batch_size`` if blocks are smaller than ``batch_size``. - - Examples: - - >>> import pandas as pd - >>> import ray - >>> df = pd.DataFrame({ - ... "name": ["Luna", "Rory", "Scout"], - ... "age": [4, 14, 9] - ... }) - >>> ds = ray.data.from_pandas(df) - >>> ds # doctest: +SKIP - MaterializedDatastream( - num_blocks=1, - num_rows=3, - schema={name: object, age: int64} - ) - - Call :meth:`.default_batch_format` to determine the default batch - type. - - >>> ds.default_batch_format() - - - .. tip:: - - Datastreams created from tabular data like Arrow tables and Parquet files - yield ``pd.DataFrame`` batches. 
- - Once you know the batch type, define a function that transforms batches - of data. ``ds.map_batches`` applies the function in parallel. - - >>> def map_fn(batch: pd.DataFrame) -> pd.DataFrame: - ... batch["age_in_dog_years"] = 7 * batch["age"] - ... return batch - >>> ds = ds.map_batches(map_fn) - >>> ds - MapBatches(map_fn) - +- Datastream(num_blocks=1, num_rows=3, schema={name: object, age: int64}) - - Your ``fn`` can return a different type than the input type. To learn more - about supported output types, read - :ref:`user-defined function output types `. - - >>> from typing import List - >>> def map_fn(batch: pd.DataFrame) -> List[int]: - ... return list(batch["age_in_dog_years"]) - >>> ds = ds.map_batches(map_fn) - >>> ds - MapBatches(map_fn) - +- MapBatches(map_fn) - +- Datastream(num_blocks=1, num_rows=3, schema={name: object, age: int64}) - - :ref:`Actors ` can improve the performance of some workloads. - For example, you can use :ref:`actors ` to load a model once - per worker instead of once per inference. - - To transform batches with :ref:`actors `, pass a callable type - to ``fn`` and specify an :class:`~ray.data.ActorPoolStrategy>`. - - In the example below, ``CachedModel`` is called on an autoscaling pool of - two to eight :ref:`actors `, each allocated one GPU by Ray. - - >>> from ray.data import ActorPoolStrategy - >>> init_large_model = ... # doctest: +SKIP - >>> class CachedModel: - ... def __init__(self): - ... self.model = init_large_model() - ... def __call__(self, item): - ... return self.model(item) - >>> ds.map_batches( # doctest: +SKIP - ... CachedModel, # doctest: +SKIP - ... batch_size=256, # doctest: +SKIP - ... compute=ActorPoolStrategy(size=8), # doctest: +SKIP - ... num_gpus=1, - ... ) # doctest: +SKIP - - ``fn`` can also be a generator, yielding multiple batches in a single - invocation. This is useful when returning large objects. 
Instead of - returning a very large output batch, ``fn`` can instead yield the - output batch in chunks. - - >>> from typing import Iterator - >>> def map_fn_with_large_output(batch: List[int]) -> Iterator[List[int]]: - ... for i in range(3): - ... yield batch * 100 - >>> ds = ray.data.from_items([1]) - >>> ds = ds.map_batches(map_fn_with_large_output) - >>> ds - MapBatches(map_fn_with_large_output) - +- Datastream(num_blocks=1, num_rows=1, schema=) - - - Args: - fn: The function or generator to apply to each record batch, or a class type - that can be instantiated to create such a callable. Callable classes are - only supported for the actor compute strategy. Note ``fn`` must be - pickle-able. - batch_size: The desired number of rows in each batch, or None to use entire - blocks as batches (blocks may contain different number of rows). - The actual size of the batch provided to ``fn`` may be smaller than - ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent - to a given map task. Default batch_size is 4096 with "default". - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - batch_format: Specify ``"default"`` to use the default block format - (promotes tables to Pandas and tensors to NumPy), ``"pandas"`` to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or - ``"numpy"`` to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None to return - the underlying block exactly as is with no additional formatting. - The default is "default". - zero_copy_batch: Whether ``fn`` should be provided zero-copy, read-only - batches. 
If this is ``True`` and no copy is required for the - ``batch_format`` conversion, the batch will be a zero-copy, read-only - view on data in Ray's object store, which can decrease memory - utilization and improve performance. If this is ``False``, the batch - will be writable, which will require an extra copy to guarantee. - If ``fn`` mutates its input, this will need to be ``False`` in order to - avoid "assignment destination is read-only" or "buffer source array is - read-only" errors. Default is ``False``. See - :ref:`batch format docs ` for details - on which format conversion always require a copy. - fn_args: Positional arguments to pass to ``fn`` after the first argument. - These arguments are top-level arguments to the underlying Ray task. - fn_kwargs: Keyword arguments to pass to ``fn``. These arguments are - top-level arguments to the underlying Ray task. - fn_constructor_args: Positional arguments to pass to ``fn``'s constructor. - You can only provide this if ``fn`` is a callable class. These arguments - are top-level arguments in the underlying Ray actor construction task. - fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor. - This can only be provided if ``fn`` is a callable class. These arguments - are top-level arguments in the underlying Ray actor construction task. - ray_remote_args: Additional resource requirements to request from - ray (e.g., ``num_gpus=1`` to request GPUs for the map tasks). - - .. seealso:: - - :meth:`~Datastream.iter_batches` - Call this function to iterate over batches of data. - - :meth:`~Datastream.default_batch_format` - Call this function to determine the default batch type. - - :meth:`~Datastream.flat_map`: - Call this method to create new records from existing ones. Unlike - :meth:`~Datastream.map`, a function passed to :meth:`~Datastream.flat_map` - can return multiple records. - - :meth:`~Datastream.flat_map` isn't recommended because it's slow; call - :meth:`~Datastream.map_batches` instead. 
- - :meth:`~Datastream.map` - Call this method to transform one record at time. - - This method isn't recommended because it's slow; call - :meth:`~Datastream.map_batches` instead. - """ # noqa: E501 - - batch_format = _apply_strict_mode_batch_format(batch_format) - if batch_format == "native": - logger.warning("The 'native' batch format has been renamed 'default'.") - - target_block_size = None - if batch_size == "default": - batch_size = DEFAULT_BATCH_SIZE - elif batch_size is not None: - if batch_size < 1: - raise ValueError("Batch size cannot be negative or 0") - # Enable blocks bundling when batch_size is specified by caller. - target_block_size = batch_size - - if batch_format not in VALID_BATCH_FORMATS: - raise ValueError( - f"The batch format must be one of {VALID_BATCH_FORMATS}, got: " - f"{batch_format}" - ) - - if isinstance(fn, CallableClass) and ( - compute is None - or compute == "tasks" - or isinstance(compute, TaskPoolStrategy) - ): - raise ValueError( - "``compute`` must be specified when using a CallableClass, and must " - f"specify the actor compute strategy, but got: {compute}. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." - ) - - if fn_constructor_args is not None or fn_constructor_kwargs is not None: - if compute is None or ( - compute != "actors" and not isinstance(compute, ActorPoolStrategy) - ): - raise ValueError( - "fn_constructor_args and fn_constructor_kwargs can only be " - "specified if using the actor pool compute strategy, but got: " - f"{compute}" - ) - if not isinstance(fn, CallableClass): - raise ValueError( - "fn_constructor_args and fn_constructor_kwargs can only be " - "specified if providing a CallableClass instance for fn, but got: " - f"{fn}" - ) - - transform_fn = generate_map_batches_fn( - batch_size=batch_size, - batch_format=batch_format, - zero_copy_batch=zero_copy_batch, - ) - - # TODO(chengsu): pass function name to MapBatches logical operator. 
- if hasattr(fn, "__self__") and isinstance( - fn.__self__, ray.data.preprocessor.Preprocessor - ): - stage_name = fn.__self__.__class__.__name__ - else: - stage_name = f'MapBatches({getattr(fn, "__name__", type(fn))})' - - stage = OneToOneStage( - stage_name, - transform_fn, - compute, - ray_remote_args, - # TODO(Clark): Add a strict cap here. - target_block_size=target_block_size, - fn=fn, - fn_args=fn_args, - fn_kwargs=fn_kwargs, - fn_constructor_args=fn_constructor_args, - fn_constructor_kwargs=fn_constructor_kwargs, - ) - plan = self._plan.with_stage(stage) - - logical_plan = self._logical_plan - if logical_plan is not None: - map_batches_op = MapBatches( - logical_plan.dag, - fn, - batch_size=batch_size, - batch_format=batch_format, - zero_copy_batch=zero_copy_batch, - target_block_size=target_block_size, - fn_args=fn_args, - fn_kwargs=fn_kwargs, - fn_constructor_args=fn_constructor_args, - fn_constructor_kwargs=fn_constructor_kwargs, - compute=compute, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(map_batches_op) - - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def add_column( - self, - col: str, - fn: Callable[["pandas.DataFrame"], "pandas.Series"], - *, - compute: Optional[str] = None, - **ray_remote_args, - ) -> "Datastream[T]": - """Add the given column to the datastream. - - This is only supported for datastreams convertible to pandas format. - A function generating the new column values given the batch in pandas - format must be specified. - - Examples: - >>> import ray - >>> ds = ray.data.range_table(100) - >>> # Add a new column equal to value * 2. - >>> ds = ds.add_column( - ... "new_col", lambda df: df["value"] * 2) - >>> # Overwrite the existing "value" with zeros. - >>> ds = ds.add_column("value", lambda df: 0) - - Time complexity: O(datastream size / parallelism) - - Args: - col: Name of the column to add. If the name already exists, the - column will be overwritten. 
- fn: Map function generating the column values given a batch of - records in pandas format. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). - """ - - def process_batch(batch: "pandas.DataFrame") -> "pandas.DataFrame": - batch.loc[:, col] = fn(batch) - return batch - - if not callable(fn): - raise ValueError("`fn` must be callable, got {}".format(fn)) - - return self.map_batches( - process_batch, - batch_format="pandas", # TODO(ekl) we should make this configurable. - compute=compute, - zero_copy_batch=False, - **ray_remote_args, - ) - - def drop_columns( - self, - cols: List[str], - *, - compute: Optional[str] = None, - **ray_remote_args, - ) -> "Datastream[U]": - """Drop one or more columns from the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range_table(100) - >>> # Add a new column equal to value * 2. - >>> ds = ds.add_column( - ... "new_col", lambda df: df["value"] * 2) - >>> # Drop the existing "value" column. - >>> ds = ds.drop_columns(["value"]) - - - Time complexity: O(datastream size / parallelism) - - Args: - cols: Names of the columns to drop. If any name does not exist, - an exception will be raised. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). 
- """ - - return self.map_batches( - lambda batch: batch.drop(columns=cols), - batch_format="pandas", - zero_copy_batch=True, - compute=compute, - **ray_remote_args, - ) - - def select_columns( - self, - cols: List[str], - *, - compute: Union[str, ComputeStrategy] = None, - **ray_remote_args, - ) -> "Datastream[T]": - """Select one or more columns from the datastream. - - All input columns used to select need to be in the schema of the datastream. - - Examples: - >>> import ray - >>> # Create a datastream with 3 columns - >>> ds = ray.data.from_items([{"col1": i, "col2": i+1, "col3": i+2} - ... for i in range(10)]) - >>> # Select only "col1" and "col2" columns. - >>> ds = ds.select_columns(cols=["col1", "col2"]) - >>> ds - MapBatches() - +- Datastream( - num_blocks=10, - num_rows=10, - schema={col1: int64, col2: int64, col3: int64} - ) - - - Time complexity: O(datastream size / parallelism) - - Args: - cols: Names of the columns to select. If any name is not included in the - datastream schema, an exception will be raised. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). - """ # noqa: E501 - return self.map_batches( - lambda batch: BlockAccessor.for_block(batch).select(columns=cols), - zero_copy_batch=True, - compute=compute, - **ray_remote_args, - ) - - def flat_map( - self, - fn: FlatMapUDF[T, U], - *, - compute: Union[str, ComputeStrategy] = None, - **ray_remote_args, - ) -> "Datastream[U]": - """Apply the given function to each record and then flatten results. - - Consider using ``.map_batches()`` for better performance (the batch size can be - altered in map_batches). 
- - Examples: - >>> import ray - >>> ds = ray.data.range(1000) - >>> ds.flat_map(lambda x: [x, x ** 2, x ** 3]) - FlatMap - +- Datastream(num_blocks=..., num_rows=1000, schema=) - - Time complexity: O(datastream size / parallelism) - - Args: - fn: The function or generator to apply to each record, or a class type - that can be instantiated to create such a callable. Callable classes are - only supported for the actor compute strategy. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). - - .. seealso:: - - :meth:`~Datastream.map_batches` - Call this method to transform batches of data. It's faster and more - flexible than :meth:`~Datastream.map` and :meth:`~Datastream.flat_map`. - - :meth:`~Datastream.map` - Call this method to transform one record at time. - - This method isn't recommended because it's slow; call - :meth:`~Datastream.map_batches` instead. - """ - if isinstance(fn, CallableClass) and ( - compute is None - or compute == "tasks" - or isinstance(compute, TaskPoolStrategy) - ): - raise ValueError( - "``compute`` must be specified when using a CallableClass, and must " - f"specify the actor compute strategy, but got: {compute}. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." 
- ) - - self._warn_slow() - - transform_fn = generate_flat_map_fn() - - plan = self._plan.with_stage( - OneToOneStage("FlatMap", transform_fn, compute, ray_remote_args, fn=fn) - ) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = FlatMap( - input_op=logical_plan.dag, - fn=fn, - compute=compute, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def filter( - self, - fn: RowUDF[T, U], - *, - compute: Union[str, ComputeStrategy] = None, - **ray_remote_args, - ) -> "Datastream[T]": - """Filter out records that do not satisfy the given predicate. - - Consider using ``.map_batches()`` for better performance (you can implement - filter by dropping records). - - Examples: - >>> import ray - >>> ds = ray.data.range(100) - >>> ds.filter(lambda x: x % 2 == 0) - Filter - +- Datastream(num_blocks=..., num_rows=100, schema=) - - Time complexity: O(datastream size / parallelism) - - Args: - fn: The predicate to apply to each record, or a class type - that can be instantiated to create such a callable. Callable classes are - only supported for the actor compute strategy. - compute: The compute strategy, either "tasks" (default) to use Ray - tasks, ``ray.data.ActorPoolStrategy(size=n)`` to use a fixed-size actor - pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an - autoscaling actor pool. - ray_remote_args: Additional resource requirements to request from - ray (e.g., num_gpus=1 to request GPUs for the map tasks). - """ - if isinstance(fn, CallableClass) and ( - compute is None - or compute == "tasks" - or isinstance(compute, TaskPoolStrategy) - ): - raise ValueError( - "``compute`` must be specified when using a CallableClass, and must " - f"specify the actor compute strategy, but got: {compute}. " - "For example, use ``compute=ActorPoolStrategy(size=n)``." 
- ) - - self._warn_slow() - - transform_fn = generate_filter_fn() - - plan = self._plan.with_stage( - OneToOneStage("Filter", transform_fn, compute, ray_remote_args, fn=fn) - ) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = Filter( - input_op=logical_plan.dag, - fn=fn, - compute=compute, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(op) - - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def repartition(self, num_blocks: int, *, shuffle: bool = False) -> "Datastream[T]": - """Repartition the datastream into exactly this number of blocks. - - After repartitioning, all blocks in the returned datastream will have - approximately the same number of rows. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) - >>> # Set the number of output partitions to write to disk. - >>> ds.repartition(10).write_parquet("/tmp/test") - - Time complexity: O(datastream size / parallelism) - - Args: - num_blocks: The number of blocks. - shuffle: Whether to perform a distributed shuffle during the - repartition. When shuffle is enabled, each output block - contains a subset of data rows from each input block, which - requires all-to-all data movement. When shuffle is disabled, - output blocks are created from adjacent input blocks, - minimizing data movement. - - Returns: - The repartitioned datastream. - """ - - plan = self._plan.with_stage(RepartitionStage(num_blocks, shuffle)) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = Repartition( - logical_plan.dag, - num_outputs=num_blocks, - shuffle=shuffle, - ) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def random_shuffle( - self, - *, - seed: Optional[int] = None, - num_blocks: Optional[int] = None, - **ray_remote_args, - ) -> "Datastream[T]": - """Randomly shuffle the elements of this datastream. 
- - Examples: - >>> import ray - >>> ds = ray.data.range(100) - >>> # Shuffle this datastream randomly. - >>> ds.random_shuffle() - RandomShuffle - +- Datastream(num_blocks=..., num_rows=100, schema=) - >>> # Shuffle this datastream with a fixed random seed. - >>> ds.random_shuffle(seed=12345) - RandomShuffle - +- Datastream(num_blocks=..., num_rows=100, schema=) - - Time complexity: O(datastream size / parallelism) - - Args: - seed: Fix the random seed to use, otherwise one will be chosen - based on system randomness. - num_blocks: The number of output blocks after the shuffle, or None - to retain the number of blocks. - - Returns: - The shuffled datastream. - """ - - plan = self._plan.with_stage( - RandomShuffleStage(seed, num_blocks, ray_remote_args) - ) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = RandomShuffle( - logical_plan.dag, - seed=seed, - num_outputs=num_blocks, - ray_remote_args=ray_remote_args, - ) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def randomize_block_order( - self, - *, - seed: Optional[int] = None, - ) -> "Datastream[T]": - """Randomly shuffle the blocks of this datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> # Randomize the block order. - >>> ds.randomize_block_order() # doctest: +SKIP - >>> # Randomize the block order with a fixed random seed. - >>> ds.randomize_block_order(seed=12345) # doctest: +SKIP - - Args: - seed: Fix the random seed to use, otherwise one will be chosen - based on system randomness. - - Returns: - The block-shuffled datastream. 
- """ - - plan = self._plan.with_stage(RandomizeBlocksStage(seed)) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = RandomizeBlocks( - logical_plan.dag, - seed=seed, - ) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def random_sample( - self, fraction: float, *, seed: Optional[int] = None - ) -> "Datastream[T]": - """Randomly samples a fraction of the elements of this datastream. - - Note that the exact number of elements returned is not guaranteed, - and that the number of elements being returned is roughly fraction * total_rows. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.random_sample(0.1) # doctest: +SKIP - >>> ds.random_sample(0.2, seed=12345) # doctest: +SKIP - - Args: - fraction: The fraction of elements to sample. - seed: Seeds the python random pRNG generator. - - Returns: - Returns a Datastream containing the sampled elements. - """ - import random - - import pandas as pd - import pyarrow as pa - - if self.num_blocks() == 0: - raise ValueError("Cannot sample from an empty Datastream.") - - if fraction < 0 or fraction > 1: - raise ValueError("Fraction must be between 0 and 1.") - - if seed is not None: - random.seed(seed) - - def process_batch(batch): - if isinstance(batch, list): - return [row for row in batch if random.random() <= fraction] - if isinstance(batch, pa.Table): - # Lets the item pass if weight generated for that item <= fraction - return batch.filter( - pa.array(random.random() <= fraction for _ in range(len(batch))) - ) - if isinstance(batch, pd.DataFrame): - return batch.sample(frac=fraction) - if isinstance(batch, np.ndarray): - return _create_possibly_ragged_ndarray( - [row for row in batch if random.random() <= fraction] - ) - raise ValueError(f"Unsupported batch type: {type(batch)}") - - return self.map_batches(process_batch) - - @ConsumptionAPI - def streaming_split( - self, - n: int, - *, - equal: bool = 
False, - locality_hints: Optional[List["NodeIdStr"]] = None, - ) -> List[DataIterator]: - """Returns ``n`` :class:`DataIterators ` that can - be used to read disjoint subsets of the datastream in parallel. - - This method is the recommended way to consume Datastreams from multiple - processes (e.g., for distributed training), and requires streaming execution - mode. - - Streaming split works by delegating the execution of this Datastream to a - coordinator actor. The coordinator pulls block references from the executed - stream, and divides those blocks among `n` output iterators. Iterators pull - blocks from the coordinator actor to return to their caller on `next`. - - The returned iterators are also repeatable; each iteration will trigger a - new execution of the Datastream. There is an implicit barrier at the start of - each iteration, which means that `next` must be called on all iterators before - the iteration starts. - - Warning: because iterators are pulling blocks from the same Datastream - execution, if one iterator falls behind other iterators may be stalled. - - Examples: - >>> import ray - >>> ds = ray.data.range(1000000) - >>> it1, it2 = ds.streaming_split(2, equal=True) - - >>> # Can consume from both iterators in parallel. - >>> @ray.remote - ... def consume(it): - ... for batch in it.iter_batches(): - ... print(batch) - >>> ray.get([consume.remote(it1), consume.remote(it2)]) # doctest: +SKIP - - >>> # Can loop over the iterators multiple times (multiple epochs). - >>> @ray.remote - ... def train(it): - ... NUM_EPOCHS = 100 - ... for _ in range(NUM_EPOCHS): - ... for batch in it.iter_batches(): - ... print(batch) - >>> ray.get([train.remote(it1), train.remote(it2)]) # doctest: +SKIP - - >>> # ERROR: this will block waiting for a read on `it2` to start. - >>> ray.get(train.remote(it1)) # doctest: +SKIP - - Args: - n: Number of output iterators to return. 
- equal: If True, each output iterator will see an exactly equal number - of rows, dropping data if necessary. If False, some iterators may see - slightly more or less rows than other, but no data will be dropped. - locality_hints: Specify the node ids corresponding to each iterator - location. Datastream will try to minimize data movement based on the - iterator output locations. This list must have length ``n``. You can - get the current node id of a task or actor by calling - ``ray.get_runtime_context().get_node_id()``. - - Returns: - The output iterator splits. These iterators are Ray-serializable and can - be freely passed to any Ray task or actor. - """ - return StreamSplitDataIterator.create(self, n, equal, locality_hints) - - @ConsumptionAPI - def split( - self, n: int, *, equal: bool = False, locality_hints: Optional[List[Any]] = None - ) -> List["MaterializedDatastream[T]"]: - """Materialize and split the datastream into ``n`` disjoint pieces. - - This returns a list of MaterializedDatastreams that can be passed to Ray tasks - and actors and used to read the datastream records in parallel. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> workers = ... # doctest: +SKIP - >>> # Split up a datastream to process over `n` worker actors. - >>> shards = ds.split(len(workers), locality_hints=workers) # doctest: +SKIP - >>> for shard, worker in zip(shards, workers): # doctest: +SKIP - ... worker.consume.remote(shard) # doctest: +SKIP - - Time complexity: O(1) - - See also: ``Datastream.split_at_indices``, ``Datastream.split_proportionately``, - and ``Datastream.streaming_split``. - - Args: - n: Number of child datastreams to return. - equal: Whether to guarantee each split has an equal - number of records. This may drop records if they cannot be - divided equally among the splits. - locality_hints: [Experimental] A list of Ray actor handles of size ``n``. 
- The system will try to co-locate the blocks of the i-th datastream - with the i-th actor to maximize data locality. - - Returns: - A list of ``n`` disjoint datastream splits. - """ - if n <= 0: - raise ValueError(f"The number of splits {n} is not positive.") - - # fallback to split_at_indices for equal split without locality hints. - # simple benchmarks shows spilit_at_indices yields more stable performance. - # https://github.com/ray-project/ray/pull/26641 for more context. - if equal and locality_hints is None: - count = self.count() - split_index = count // n - # we are creating n split_indices which will generate - # n + 1 splits; the last split will at most contains (n - 1) - # rows, which could be safely dropped. - split_indices = [split_index * i for i in range(1, n + 1)] - shards = self.split_at_indices(split_indices) - return shards[:n] - - if locality_hints and len(locality_hints) != n: - raise ValueError( - f"The length of locality_hints {len(locality_hints)} " - f"doesn't equal the number of splits {n}." - ) - # TODO: this is unreachable code. - if len(set(locality_hints)) != len(locality_hints): - raise ValueError( - "locality_hints must not contain duplicate actor handles" - ) - - blocks = self._plan.execute() - owned_by_consumer = blocks._owned_by_consumer - stats = self._plan.stats() - block_refs, metadata = zip(*blocks.get_blocks_with_metadata()) - - if locality_hints is None: - blocks = np.array_split(block_refs, n) - meta = np.array_split(metadata, n) - return [ - MaterializedDatastream( - ExecutionPlan( - BlockList( - b.tolist(), m.tolist(), owned_by_consumer=owned_by_consumer - ), - stats, - run_by_consumer=owned_by_consumer, - ), - self._epoch, - self._lazy, - ) - for b, m in zip(blocks, meta) - ] - - metadata_mapping = {b: m for b, m in zip(block_refs, metadata)} - - # If the locality_hints is set, we use a two-round greedy algorithm - # to co-locate the blocks with the actors based on block - # and actor's location (node_id). 
- # - # The split algorithm tries to allocate equally-sized blocks regardless - # of locality. Thus we first calculate the expected number of blocks - # for each split. - # - # In the first round, for each actor, we look for all blocks that - # match the actor's node_id, then allocate those matched blocks to - # this actor until we reach the limit(expected number). - # - # In the second round: fill each actor's allocation with - # remaining unallocated blocks until we reach the limit. - - def build_allocation_size_map( - num_blocks: int, actors: List[Any] - ) -> Dict[Any, int]: - """Given the total number of blocks and a list of actors, calcuate - the expected number of blocks to allocate for each actor. - """ - num_actors = len(actors) - num_blocks_per_actor = num_blocks // num_actors - num_blocks_left = num_blocks - num_blocks_per_actor * n - num_blocks_by_actor = {} - for i, actor in enumerate(actors): - num_blocks_by_actor[actor] = num_blocks_per_actor - if i < num_blocks_left: - num_blocks_by_actor[actor] += 1 - return num_blocks_by_actor - - def build_block_refs_by_node_id( - blocks: List[ObjectRef[Block]], - ) -> Dict[str, List[ObjectRef[Block]]]: - """Build the reverse index from node_id to block_refs. For - simplicity, if the block is stored on multiple nodes we - only pick the first one. 
- """ - block_ref_locations = ray.experimental.get_object_locations(blocks) - block_refs_by_node_id = collections.defaultdict(list) - for block_ref in blocks: - node_ids = block_ref_locations.get(block_ref, {}).get("node_ids", []) - node_id = node_ids[0] if node_ids else None - block_refs_by_node_id[node_id].append(block_ref) - return block_refs_by_node_id - - def build_node_id_by_actor(actors: List[Any]) -> Dict[Any, str]: - """Build a map from a actor to its node_id.""" - actors_state = ray._private.state.actors() - return { - actor: actors_state.get(actor._actor_id.hex(), {}) - .get("Address", {}) - .get("NodeID") - for actor in actors - } - - # expected number of blocks to be allocated for each actor - expected_block_count_by_actor = build_allocation_size_map( - len(block_refs), locality_hints - ) - # the reverse index from node_id to block_refs - block_refs_by_node_id = build_block_refs_by_node_id(block_refs) - # the map from actor to its node_id - node_id_by_actor = build_node_id_by_actor(locality_hints) - - allocation_per_actor = collections.defaultdict(list) - - # In the first round, for each actor, we look for all blocks that - # match the actor's node_id, then allocate those matched blocks to - # this actor until we reach the limit(expected number) - for actor in locality_hints: - node_id = node_id_by_actor[actor] - matching_blocks = block_refs_by_node_id[node_id] - expected_block_count = expected_block_count_by_actor[actor] - allocation = [] - while matching_blocks and len(allocation) < expected_block_count: - allocation.append(matching_blocks.pop()) - allocation_per_actor[actor] = allocation - - # In the second round: fill each actor's allocation with - # remaining unallocated blocks until we reach the limit - remaining_block_refs = list( - itertools.chain.from_iterable(block_refs_by_node_id.values()) - ) - for actor in locality_hints: - while ( - len(allocation_per_actor[actor]) < expected_block_count_by_actor[actor] - ): - 
allocation_per_actor[actor].append(remaining_block_refs.pop()) - - assert len(remaining_block_refs) == 0, len(remaining_block_refs) - - per_split_block_lists = [ - BlockList( - allocation_per_actor[actor], - [metadata_mapping[b] for b in allocation_per_actor[actor]], - owned_by_consumer=owned_by_consumer, - ) - for actor in locality_hints - ] - - if equal: - # equalize the splits - per_split_block_lists = _equalize(per_split_block_lists, owned_by_consumer) - - return [ - MaterializedDatastream( - ExecutionPlan( - block_split, - stats, - run_by_consumer=owned_by_consumer, - ), - self._epoch, - self._lazy, - ) - for block_split in per_split_block_lists - ] - - @ConsumptionAPI - def split_at_indices(self, indices: List[int]) -> List["MaterializedDatastream[T]"]: - """Materialize and split the datastream at the given indices (like np.split). - - Examples: - >>> import ray - >>> ds = ray.data.range(10) - >>> d1, d2, d3 = ds.split_at_indices([2, 5]) - >>> d1.take() - [0, 1] - >>> d2.take() - [2, 3, 4] - >>> d3.take() - [5, 6, 7, 8, 9] - - Time complexity: O(num splits) - - See also: ``Datastream.split_at_indices``, ``Datastream.split_proportionately``, - and ``Datastream.streaming_split``. - - Args: - indices: List of sorted integers which indicate where the datastream - will be split. If an index exceeds the length of the datastream, - an empty datastream will be returned. - - Returns: - The datastream splits. 
- """ - - if len(indices) < 1: - raise ValueError("indices must be at least of length 1") - if sorted(indices) != indices: - raise ValueError("indices must be sorted") - if indices[0] < 0: - raise ValueError("indices must be positive") - start_time = time.perf_counter() - block_list = self._plan.execute() - blocks, metadata = _split_at_indices( - block_list.get_blocks_with_metadata(), - indices, - block_list._owned_by_consumer, - ) - split_duration = time.perf_counter() - start_time - parent_stats = self._plan.stats() - splits = [] - for bs, ms in zip(blocks, metadata): - stats = DatastreamStats(stages={"Split": ms}, parent=parent_stats) - stats.time_total_s = split_duration - splits.append( - MaterializedDatastream( - ExecutionPlan( - BlockList( - bs, ms, owned_by_consumer=block_list._owned_by_consumer - ), - stats, - run_by_consumer=block_list._owned_by_consumer, - ), - self._epoch, - self._lazy, - ) - ) - return splits - - @ConsumptionAPI - def split_proportionately( - self, proportions: List[float] - ) -> List["MaterializedDatastream[T]"]: - """Materialize and split the datastream using proportions. - - A common use case for this would be splitting the datastream into train - and test sets (equivalent to eg. scikit-learn's ``train_test_split``). - See also ``Datastream.train_test_split`` for a higher level abstraction. - - The indices to split at will be calculated in such a way so that all splits - always contains at least one element. If that is not possible, - an exception will be raised. - - This is equivalent to caulculating the indices manually and calling - ``Datastream.split_at_indices``. 
- - Examples: - >>> import ray - >>> ds = ray.data.range(10) - >>> d1, d2, d3 = ds.split_proportionately([0.2, 0.5]) - >>> d1.take() - [0, 1] - >>> d2.take() - [2, 3, 4, 5, 6] - >>> d3.take() - [7, 8, 9] - - Time complexity: O(num splits) - - See also: ``Datastream.split``, ``Datastream.split_at_indices``, - ``Datastream.train_test_split`` - - Args: - proportions: List of proportions to split the datastream according to. - Must sum up to less than 1, and each proportion has to be bigger - than 0. - - Returns: - The datastream splits. - """ - - if len(proportions) < 1: - raise ValueError("proportions must be at least of length 1") - if sum(proportions) >= 1: - raise ValueError("proportions must sum to less than 1") - if any(p <= 0 for p in proportions): - raise ValueError("proportions must be bigger than 0") - - datastream_length = self.count() - cumulative_proportions = np.cumsum(proportions) - split_indices = [ - int(datastream_length * proportion) for proportion in cumulative_proportions - ] - - # Ensure each split has at least one element - subtract = 0 - for i in range(len(split_indices) - 2, -1, -1): - split_indices[i] -= subtract - if split_indices[i] == split_indices[i + 1]: - subtract += 1 - split_indices[i] -= 1 - if any(i <= 0 for i in split_indices): - raise ValueError( - "Couldn't create non-empty splits with the given proportions." - ) - - return self.split_at_indices(split_indices) - - @ConsumptionAPI - def train_test_split( - self, - test_size: Union[int, float], - *, - shuffle: bool = False, - seed: Optional[int] = None, - ) -> Tuple["MaterializedDatastream[T]", "MaterializedDatastream[T]"]: - """Materialize and split the datastream into train and test subsets. 
- - Examples: - - >>> import ray - >>> ds = ray.data.range(8) - >>> train, test = ds.train_test_split(test_size=0.25) - >>> train.take() - [0, 1, 2, 3, 4, 5] - >>> test.take() - [6, 7] - - Args: - test_size: If float, should be between 0.0 and 1.0 and represent the - proportion of the datastream to include in the test split. If int, - represents the absolute number of test samples. The train split will - always be the compliment of the test split. - shuffle: Whether or not to globally shuffle the datastream before splitting. - Defaults to False. This may be a very expensive operation with large - datastream. - seed: Fix the random seed to use for shuffle, otherwise one will be chosen - based on system randomness. Ignored if ``shuffle=False``. - - Returns: - Train and test subsets as two MaterializedDatastreams. - """ - ds = self - - if shuffle: - ds = ds.random_shuffle(seed=seed) - - if not isinstance(test_size, (int, float)): - raise TypeError(f"`test_size` must be int or float got {type(test_size)}.") - if isinstance(test_size, float): - if test_size <= 0 or test_size >= 1: - raise ValueError( - "If `test_size` is a float, it must be bigger than 0 and smaller " - f"than 1. Got {test_size}." - ) - return ds.split_proportionately([1 - test_size]) - else: - ds_length = ds.count() - if test_size <= 0 or test_size >= ds_length: - raise ValueError( - "If `test_size` is an int, it must be bigger than 0 and smaller " - f"than the size of the datastream ({ds_length}). " - f"Got {test_size}." - ) - return ds.split_at_indices([ds_length - test_size]) - - @ConsumptionAPI(pattern="Args:") - def union(self, *other: List["Datastream[T]"]) -> "Datastream[T]": - """Materialize and combine this datastream with others of the same type. - - The order of the blocks in the datastreams is preserved, as is the - relative ordering between the datastreams passed in the argument list. - - .. note:: - Unioned datastreams are not lineage-serializable, i.e. 
they can not be - used as a tunable hyperparameter in Ray Tune. - - Args: - other: List of datastreams to combine with this one. The datastreams - must have the same schema as this datastream, otherwise the - behavior is undefined. - - Returns: - A new datastream holding the union of their data. - """ - - start_time = time.perf_counter() - - owned_by_consumer = self._plan.execute()._owned_by_consumer - datastreams = [self] + list(other) - bls = [] - has_nonlazy = False - for ds in datastreams: - bl = ds._plan.execute() - if not isinstance(bl, LazyBlockList): - has_nonlazy = True - bls.append(bl) - if has_nonlazy: - blocks = [] - metadata = [] - for bl in bls: - if isinstance(bl, LazyBlockList): - bs, ms = bl._get_blocks_with_metadata() - else: - bs, ms = bl._blocks, bl._metadata - blocks.extend(bs) - metadata.extend(ms) - blocklist = BlockList(blocks, metadata, owned_by_consumer=owned_by_consumer) - else: - tasks: List[ReadTask] = [] - block_partition_refs: List[ObjectRef[BlockPartition]] = [] - block_partition_meta_refs: List[ObjectRef[BlockMetadata]] = [] - - # Gather read task names from input blocks of unioned Datastreams, - # and concat them before passing to resulting LazyBlockList - read_task_names = [] - self_read_name = self._plan._in_blocks._read_stage_name or "Read" - read_task_names.append(self_read_name) - other_read_names = [ - o._plan._in_blocks._read_stage_name or "Read" for o in other - ] - read_task_names.extend(other_read_names) - - for bl in bls: - tasks.extend(bl._tasks) - block_partition_refs.extend(bl._block_partition_refs) - block_partition_meta_refs.extend(bl._block_partition_meta_refs) - blocklist = LazyBlockList( - tasks, - f"Union({','.join(read_task_names)})", - block_partition_refs, - block_partition_meta_refs, - owned_by_consumer=owned_by_consumer, - ) - - epochs = [ds._get_epoch() for ds in datastreams] - max_epoch = max(*epochs) - if len(set(epochs)) > 1: - if ray.util.log_once("datastream_epoch_warned"): - logger.warning( - 
"Datastream contains data from multiple epochs: {}, " - "likely due to a `rewindow()` call. The higher epoch " - "number {} will be used. This warning will not " - "be shown again.".format(set(epochs), max_epoch) - ) - stats = DatastreamStats( - stages={"Union": []}, - parent=[d._plan.stats() for d in datastreams], - ) - stats.time_total_s = time.perf_counter() - start_time - return Datastream( - ExecutionPlan(blocklist, stats, run_by_consumer=owned_by_consumer), - max_epoch, - self._lazy, - ) - - def groupby(self, key: Optional[KeyFn]) -> "GroupedData[T]": - """Group the datastream by the key function or column name. - - Examples: - >>> import ray - >>> # Group by a key function and aggregate. - >>> ray.data.range(100).groupby(lambda x: x % 3).count() - Aggregate - +- Datastream(num_blocks=..., num_rows=100, schema=) - >>> # Group by an Arrow table column and aggregate. - >>> ray.data.from_items([ - ... {"A": x % 3, "B": x} for x in range(100)]).groupby( - ... "A").count() - Aggregate - +- Datastream(num_blocks=100, num_rows=100, schema={A: int64, B: int64}) - - Time complexity: O(datastream size * log(datastream size / parallelism)) - - Args: - key: A key function or Arrow column name. If this is None, the - grouping is global. - - Returns: - A lazy GroupedData that can be aggregated later. - """ - from ray.data.grouped_dataset import GroupedData - - # Always allow None since groupby interprets that as grouping all - # records into a single global group. - if key is not None: - _validate_key_fn(self.schema(fetch_if_missing=True), key) - - return GroupedData(self, key) - - @ConsumptionAPI - def aggregate(self, *aggs: AggregateFn) -> U: - """Aggregate the entire datastream as one group. - - Examples: - >>> import ray - >>> from ray.data.aggregate import Max, Mean - >>> ray.data.range(100).aggregate(Max()) - (99,) - >>> ray.data.range_table(100).aggregate( - ... 
Max("value"), Mean("value")) - {'max(value)': 99, 'mean(value)': 49.5} - - Time complexity: O(datastream size / parallelism) - - Args: - aggs: Aggregations to do. - - Returns: - If the input datastream is a simple datastream then the output is - a tuple of ``(agg1, agg2, ...)`` where each tuple element is - the corresponding aggregation result. - If the input datastream is an Arrow datastream then the output is - an ``ArrowRow`` where each column is the corresponding - aggregation result. - If the datastream is empty, return ``None``. - """ - ret = self.groupby(None).aggregate(*aggs).take(1) - return ret[0] if len(ret) > 0 else None - - @ConsumptionAPI - def sum( - self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True - ) -> U: - """Compute sum over entire datastream. - - Examples: - >>> import ray - >>> ray.data.range(100).sum() - 4950 - >>> ray.data.from_items([ - ... (i, i**2) - ... for i in range(100)]).sum(lambda x: x[1]) - 328350 - >>> ray.data.range_table(100).sum("value") - 4950 - >>> ray.data.from_items([ - ... {"A": i, "B": i**2} - ... for i in range(100)]).sum(["A", "B"]) - {'sum(A)': 4950, 'sum(B)': 328350} - - Args: - on: The data subset on which to compute the sum. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to return a scalar sum of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to return an ``ArrowRow`` - containing the column-wise sum of all columns. - ignore_nulls: Whether to ignore null values. If ``True``, null - values will be ignored when computing the sum; if ``False``, - if a null value is encountered, the output will be None. - We consider np.nan, None, and pd.NaT to be null values. - Default is ``True``. - - Returns: - The sum result. 
- - For a simple datastream, the output is: - - - ``on=None``: a scalar representing the sum of all rows, - - ``on=callable``: a scalar representing the sum of the outputs of - the callable called on each row, - - ``on=[callable_1, ..., calalble_n]``: a tuple of - ``(sum_1, ..., sum_n)`` representing the sum of the outputs of - the corresponding callables called on each row. - - For an Arrow datastream, the output is: - - - ``on=None``: an ArrowRow containing the column-wise sum of all - columns, - - ``on="col"``: a scalar representing the sum of all items in - column ``"col"``, - - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow`` - containing the column-wise sum of the provided columns. - - If the datastream is empty, all values are null, or any value is null - AND ``ignore_nulls`` is ``False``, then the output will be None. - """ - ret = self._aggregate_on(Sum, on, ignore_nulls) - return self._aggregate_result(ret) - - @ConsumptionAPI - def min( - self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True - ) -> U: - """Compute minimum over entire datastream. - - Examples: - >>> import ray - >>> ray.data.range(100).min() - 0 - >>> ray.data.from_items([ - ... (i, i**2) - ... for i in range(100)]).min(lambda x: x[1]) - 0 - >>> ray.data.range_table(100).min("value") - 0 - >>> ray.data.from_items([ - ... {"A": i, "B": i**2} - ... for i in range(100)]).min(["A", "B"]) - {'min(A)': 0, 'min(B)': 0} - - Args: - on: The data subset on which to compute the min. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to return a scalar min of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to return an ``ArrowRow`` - containing the column-wise min of all columns. - ignore_nulls: Whether to ignore null values. If ``True``, null - values will be ignored when computing the min; if ``False``, - if a null value is encountered, the output will be None. 
- We consider np.nan, None, and pd.NaT to be null values. - Default is ``True``. - - Returns: - The min result. - - For a simple datastream, the output is: - - - ``on=None``: a scalar representing the min of all rows, - - ``on=callable``: a scalar representing the min of the outputs - of the callable called on each row, - - ``on=[callable_1, ..., calalble_n]``: a tuple of - ``(min_1, ..., min_n)`` representing the min of the outputs - of the corresponding callables called on each row. - - For an Arrow datastream, the output is: - - - ``on=None``: an ``ArrowRow`` containing the column-wise min of - all columns, - - ``on="col"``: a scalar representing the min of all items in - column ``"col"``, - - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow`` - containing the column-wise min of the provided columns. - - If the datastream is empty, all values are null, or any value is null - AND ``ignore_nulls`` is ``False``, then the output will be None. - """ - ret = self._aggregate_on(Min, on, ignore_nulls) - return self._aggregate_result(ret) - - @ConsumptionAPI - def max( - self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True - ) -> U: - """Compute maximum over entire datastream. - - Examples: - >>> import ray - >>> ray.data.range(100).max() - 99 - >>> ray.data.from_items([ - ... (i, i**2) - ... for i in range(100)]).max(lambda x: x[1]) - 9801 - >>> ray.data.range_table(100).max("value") - 99 - >>> ray.data.from_items([ - ... {"A": i, "B": i**2} - ... for i in range(100)]).max(["A", "B"]) - {'max(A)': 99, 'max(B)': 9801} - - Args: - on: The data subset on which to compute the max. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to return a scalar max of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to return an ``ArrowRow`` - containing the column-wise max of all columns. - ignore_nulls: Whether to ignore null values. 
If ``True``, null - values will be ignored when computing the max; if ``False``, - if a null value is encountered, the output will be None. - We consider np.nan, None, and pd.NaT to be null values. - Default is ``True``. - - Returns: - The max result. - - For a simple datastream, the output is: - - - ``on=None``: a scalar representing the max of all rows, - - ``on=callable``: a scalar representing the max of the outputs of - the callable called on each row, - - ``on=[callable_1, ..., calalble_n]``: a tuple of - ``(max_1, ..., max_n)`` representing the max of the outputs of - the corresponding callables called on each row. - - For an Arrow datastream, the output is: - - - ``on=None``: an ``ArrowRow`` containing the column-wise max of - all columns, - - ``on="col"``: a scalar representing the max of all items in - column ``"col"``, - - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow`` - containing the column-wise max of the provided columns. - - If the datastream is empty, all values are null, or any value is null - AND ``ignore_nulls`` is ``False``, then the output will be None. - """ - ret = self._aggregate_on(Max, on, ignore_nulls) - return self._aggregate_result(ret) - - @ConsumptionAPI - def mean( - self, on: Optional[Union[KeyFn, List[KeyFn]]] = None, ignore_nulls: bool = True - ) -> U: - """Compute mean over entire datastream. - - Examples: - >>> import ray - >>> ray.data.range(100).mean() - 49.5 - >>> ray.data.from_items([ - ... (i, i**2) - ... for i in range(100)]).mean(lambda x: x[1]) - 3283.5 - >>> ray.data.range_table(100).mean("value") - 49.5 - >>> ray.data.from_items([ - ... {"A": i, "B": i**2} - ... for i in range(100)]).mean(["A", "B"]) - {'mean(A)': 49.5, 'mean(B)': 3283.5} - - Args: - on: The data subset on which to compute the mean. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to return a scalar mean of all rows. 
- - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to return an ``ArrowRow`` - containing the column-wise mean of all columns. - ignore_nulls: Whether to ignore null values. If ``True``, null - values will be ignored when computing the mean; if ``False``, - if a null value is encountered, the output will be None. - We consider np.nan, None, and pd.NaT to be null values. - Default is ``True``. - - Returns: - The mean result. - - For a simple datastream, the output is: - - - ``on=None``: a scalar representing the mean of all rows, - - ``on=callable``: a scalar representing the mean of the outputs - of the callable called on each row, - - ``on=[callable_1, ..., calalble_n]``: a tuple of - ``(mean_1, ..., mean_n)`` representing the mean of the outputs - of the corresponding callables called on each row. - - For an Arrow datastream, the output is: - - - ``on=None``: an ``ArrowRow`` containing the column-wise mean of - all columns, - - ``on="col"``: a scalar representing the mean of all items in - column ``"col"``, - - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow`` - containing the column-wise mean of the provided columns. - - If the datastream is empty, all values are null, or any value is null - AND ``ignore_nulls`` is ``False``, then the output will be None. - """ - ret = self._aggregate_on(Mean, on, ignore_nulls) - return self._aggregate_result(ret) - - @ConsumptionAPI - def std( - self, - on: Optional[Union[KeyFn, List[KeyFn]]] = None, - ddof: int = 1, - ignore_nulls: bool = True, - ) -> U: - """Compute standard deviation over entire datastream. - - Examples: - >>> import ray - >>> round(ray.data.range(100).std(), 5) - 29.01149 - >>> ray.data.from_items([ - ... (i, i**2) - ... for i in range(100)]).std(lambda x: x[1]) - 2968.1748039269296 - >>> round(ray.data.range_table(100).std("value", ddof=0), 5) - 28.86607 - >>> ray.data.from_items([ - ... {"A": i, "B": i**2} - ... 
for i in range(100)]).std(["A", "B"]) - {'std(A)': 29.011491975882016, 'std(B)': 2968.1748039269296} - - .. note:: This uses Welford's online method for an accumulator-style computation - of the standard deviation. This method was chosen due to it's numerical - stability, and it being computable in a single pass. This may give different - (but more accurate) results than NumPy, Pandas, and sklearn, which use a - less numerically stable two-pass algorithm. - See - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - - Args: - on: The data subset on which to compute the std. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to return a scalar std of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to return an ``ArrowRow`` - containing the column-wise std of all columns. - ddof: Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - ignore_nulls: Whether to ignore null values. If ``True``, null - values will be ignored when computing the std; if ``False``, - if a null value is encountered, the output will be None. - We consider np.nan, None, and pd.NaT to be null values. - Default is ``True``. - - Returns: - The standard deviation result. - - For a simple datastream, the output is: - - - ``on=None``: a scalar representing the std of all rows, - - ``on=callable``: a scalar representing the std of the outputs of - the callable called on each row, - - ``on=[callable_1, ..., calalble_n]``: a tuple of - ``(std_1, ..., std_n)`` representing the std of the outputs of - the corresponding callables called on each row. 
- - For an Arrow datastream, the output is: - - - ``on=None``: an ``ArrowRow`` containing the column-wise std of - all columns, - - ``on="col"``: a scalar representing the std of all items in - column ``"col"``, - - ``on=["col_1", ..., "col_n"]``: an n-column ``ArrowRow`` - containing the column-wise std of the provided columns. - - If the datastream is empty, all values are null, or any value is null - AND ``ignore_nulls`` is ``False``, then the output will be None. - """ - ret = self._aggregate_on(Std, on, ignore_nulls, ddof=ddof) - return self._aggregate_result(ret) - - def sort( - self, key: Optional[KeyFn] = None, descending: bool = False - ) -> "Datastream[T]": - # TODO ds.sort(lambda ...) fails with: - # Callable key ' at 0x1b07a4cb0>' requires - # datastream format to be 'simple', was 'arrow'. - # How do I create something "simple" here? - """Sort the datastream by the specified key column or key function. - - Examples: - >>> import ray - >>> # Sort using the entire record as the key. - >>> ds = ray.data.range(100) - >>> ds.sort() - Sort - +- Datastream(num_blocks=..., num_rows=100, schema=) - >>> # Sort by a single column in descending order. - >>> ds = ray.data.from_items( - ... [{"value": i} for i in range(1000)]) - >>> ds.sort("value", descending=True) - Sort - +- Datastream(num_blocks=200, num_rows=1000, schema={value: int64}) - >>> # Sort by a key function. - >>> ds.sort(lambda record: record["value"]) # doctest: +SKIP - - Time complexity: O(datastream size * log(datastream size / parallelism)) - - Args: - key: - - For Arrow tables, key must be a single column name. - - For datastreams of Python objects, key can be either a lambda - function that returns a comparison key to sort by, or None - to sort by the original value. - descending: Whether to sort in descending order. - - Returns: - A new, sorted datastream. 
- """ - - plan = self._plan.with_stage(SortStage(self, key, descending)) - - logical_plan = self._logical_plan - if logical_plan is not None: - op = Sort( - logical_plan.dag, - key=key, - descending=descending, - ) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - def zip(self, other: "Datastream[U]") -> "Datastream[(T, U)]": - """Materialize and zip this datastream with the elements of another. - - The datastreams must have the same number of rows. For tabular datastreams, the - datastreams will be concatenated horizontally; namely, their column sets will be - merged, and any duplicate column names disambiguated with _1, _2, etc. suffixes. - - .. note:: - The smaller of the two datastreams will be repartitioned to align the number - of rows per block with the larger datastream. - - .. note:: - Zipped datastreams are not lineage-serializable, i.e. they can not be used - as a tunable hyperparameter in Ray Tune. - - Examples: - >>> import ray - >>> ds1 = ray.data.range(5) - >>> ds2 = ray.data.range(5, parallelism=2).map(lambda x: x + 1) - >>> ds1.zip(ds2).take() - [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)] - - Time complexity: O(datastream size / parallelism) - - Args: - other: The datastream to zip with on the right hand side. - - Returns: - If the inputs are simple datastreams, this returns a ``Datastream`` - containing (k, v) pairs, where k comes from the first datastream and v - comes from the second. - If the inputs are tabular datastreams, this returns a ``Datastream`` - containing the columns of the second datastream concatenated horizontally - with the columns of the first datastream, with duplicate column names - disambiguated with _1, _2, etc. suffixes. 
- """ - - plan = self._plan.with_stage(ZipStage(other)) - - logical_plan = self._logical_plan - other_logical_plan = other._logical_plan - if logical_plan is not None and other_logical_plan is not None: - op = Zip(logical_plan.dag, other_logical_plan.dag) - logical_plan = LogicalPlan(op) - return Datastream(plan, self._epoch, self._lazy, logical_plan) - - @ConsumptionAPI - def limit(self, limit: int) -> "Datastream[T]": - """Materialize and truncate the datastream to the first ``limit`` records. - - Contrary to :meth`.take`, this will not move any data to the caller's - machine. Instead, it will return a new ``Datastream`` pointing to the truncated - distributed data. - - Examples: - >>> import ray - >>> ds = ray.data.range(1000) - >>> ds.limit(100).map(lambda x: x * 2).take() - [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38] - - Time complexity: O(limit specified) - - Args: - limit: The size of the datastream to truncate to. - - Returns: - The truncated datastream. - """ - start_time = time.perf_counter() - # Truncate the block list to the minimum number of blocks that contains at least - # `limit` rows. 
- block_list = self._plan.execute().truncate_by_rows(limit) - blocks, metadata, _, _ = _split_at_index(block_list, limit) - split_duration = time.perf_counter() - start_time - meta_for_stats = [ - BlockMetadata( - num_rows=m.num_rows, - size_bytes=m.size_bytes, - schema=m.schema, - input_files=m.input_files, - exec_stats=None, - ) - for m in metadata - ] - datastream_stats = DatastreamStats( - stages={"Limit": meta_for_stats}, - parent=self._plan.stats(), - ) - datastream_stats.time_total_s = split_duration - return Datastream( - ExecutionPlan( - BlockList( - blocks, - metadata, - owned_by_consumer=block_list._owned_by_consumer, - ), - datastream_stats, - run_by_consumer=block_list._owned_by_consumer, - ), - self._epoch, - self._lazy, - ) - - @ConsumptionAPI(pattern="Time complexity:") - def take_batch( - self, batch_size: int = 20, *, batch_format: Optional[str] = "default" - ) -> DataBatch: - """Return up to ``batch_size`` records from the datastream in a batch. - - Unlike take(), the records are returned in the same format as used for - `iter_batches` and `map_batches`. - - This will move up to ``batch_size`` records to the caller's machine; if - ``batch_size`` is very large, this can result in an OutOfMemory crash on - the caller. - - Time complexity: O(batch_size specified) - - Args: - batch_size: The max number of records to return. - batch_format: Specify ``"default"`` to use the default block format - (promotes tables to Pandas and tensors to NumPy), ``"pandas"`` to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or - ``"numpy"`` to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None - to return the underlying block exactly as is with no additional - formatting. The default is "default". - - Returns: - A batch of up to ``batch_size`` records from the datastream. - - Raises: - ValueError if the datastream is empty. 
- """ - batch_format = _apply_strict_mode_batch_format(batch_format) - try: - res = next( - self.iter_batches( - batch_size=batch_size, prefetch_batches=0, batch_format=batch_format - ) - ) - except StopIteration: - raise ValueError("The datastream is empty.") - self._synchronize_progress_bar() - return res - - @ConsumptionAPI(pattern="Time complexity:") - def take(self, limit: int = 20) -> List[T]: - """Return up to ``limit`` records from the datastream. - - This will move up to ``limit`` records to the caller's machine; if - ``limit`` is very large, this can result in an OutOfMemory crash on - the caller. - - Time complexity: O(limit specified) - - Args: - limit: The max number of records to return. - - Returns: - A list of up to ``limit`` records from the datastream. - """ - if ray.util.log_once("datastream_take"): - logger.info( - "Tip: Use `take_batch()` instead of `take() / show()` to return " - "records in pandas or numpy batch format." - ) - output = [] - for row in self.iter_rows(): - output.append(row) - if len(output) >= limit: - break - self._synchronize_progress_bar() - return output - - @ConsumptionAPI(pattern="Time complexity:") - def take_all(self, limit: Optional[int] = None) -> List[T]: - """Return all of the records in the datastream. - - This will move the entire datastream to the caller's machine; if the - datastream is very large, this can result in an OutOfMemory crash on - the caller. - - Time complexity: O(datastream size) - - Args: - limit: Raise an error if the size exceeds the specified limit. - - Returns: - A list of all the records in the datastream. - """ - output = [] - for row in self.iter_rows(): - output.append(row) - if limit is not None and len(output) > limit: - raise ValueError( - f"The datastream has more than the given limit of {limit} records." 
- ) - self._synchronize_progress_bar() - return output - - @ConsumptionAPI(pattern="Time complexity:") - def show(self, limit: int = 20) -> None: - """Print up to the given number of records from the datastream. - - Time complexity: O(limit specified) - - Args: - limit: The max number of records to print. - """ - for row in self.take(limit): - print(row) - - @ConsumptionAPI( - if_more_than_read=True, - datasource_metadata="row count", - pattern="Time complexity:", - ) - def count(self) -> int: - """Count the number of records in the datastream. - - Time complexity: O(datastream size / parallelism), O(1) for parquet - - Returns: - The number of records in the datastream. - """ - # Handle empty datastream. - if self.num_blocks() == 0: - return 0 - - # For parquet, we can return the count directly from metadata. - meta_count = self._meta_count() - if meta_count is not None: - return meta_count - - get_num_rows = cached_remote_fn(_get_num_rows) - - return sum( - ray.get( - [get_num_rows.remote(block) for block in self.get_internal_block_refs()] - ) - ) - - @ConsumptionAPI( - if_more_than_read=True, - datasource_metadata="schema", - extra_condition="or if ``fetch_if_missing=True`` (the default)", - pattern="Time complexity:", - ) - def schema( - self, fetch_if_missing: bool = True - ) -> Union[type, "pyarrow.lib.Schema"]: - """Return the schema of the datastream. - - For datastream of Arrow records, this will return the Arrow schema. - For datastream of Python objects, this returns their Python type. - - Time complexity: O(1) - - Args: - fetch_if_missing: If True, synchronously fetch the schema if it's - not known. If False, None is returned if the schema is not known. - Default is True. - - Returns: - The Python type or Arrow schema of the records, or None if the - schema is not known and fetch_if_missing is False. 
- """ - ctx = DataContext.get_current() - base_schema = self._plan.schema(fetch_if_missing=fetch_if_missing) - if ctx.strict_mode: - return Schema(base_schema) - else: - return base_schema - - def num_blocks(self) -> int: - """Return the number of blocks of this datastream. - - Note that during read and transform operations, the number of blocks - may be dynamically adjusted to respect memory limits, increasing the - number of blocks at runtime. - - Time complexity: O(1) - - Returns: - The number of blocks of this datastream. - """ - return self._plan.initial_num_blocks() - - @ConsumptionAPI(if_more_than_read=True, pattern="Time complexity:") - def size_bytes(self) -> int: - """Return the in-memory size of the datastream. - - Time complexity: O(1) - - Returns: - The in-memory size of the datastream in bytes, or None if the - in-memory size is not known. - """ - metadata = self._plan.execute().get_metadata() - if not metadata or metadata[0].size_bytes is None: - return None - return sum(m.size_bytes for m in metadata) - - @ConsumptionAPI(if_more_than_read=True, pattern="Time complexity:") - def input_files(self) -> List[str]: - """Return the list of input files for the datastream. - - Time complexity: O(num input files) - - Returns: - The list of input files used to create the datastream, or an empty - list if the input files is not known. - """ - metadata = self._plan.execute().get_metadata() - files = set() - for m in metadata: - for f in m.input_files: - files.add(f) - return list(files) - - @ConsumptionAPI - def write_parquet( - self, - path: str, - *, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, - ray_remote_args: Dict[str, Any] = None, - **arrow_parquet_args, - ) -> None: - """Write the datastream to parquet. 
- - This is only supported for datastream convertible to Arrow records. - To control the number of files, use ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {uuid}_{block_idx}.parquet, where ``uuid`` is an unique - id for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.write_parquet("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where Parquet - files will be written to. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. Does nothing if all directories already exist. - arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - arrow_parquet_args_fn: Callable that returns a dictionary of write - arguments to use when writing each block to a file. Overrides - any duplicate keys from arrow_parquet_args. This should be used - instead of arrow_parquet_args if any of your write arguments - cannot be pickled, or if you'd like to lazily resolve the write - arguments for each datastream block. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - arrow_parquet_args: Options to pass to - pyarrow.parquet.write_table(), which is used to write out each - block to a file. 
- """ - self.write_datasource( - ParquetDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - write_args_fn=arrow_parquet_args_fn, - **arrow_parquet_args, - ) - - @ConsumptionAPI - def write_json( - self, - path: str, - *, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - pandas_json_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, - ray_remote_args: Dict[str, Any] = None, - **pandas_json_args, - ) -> None: - """Write the datastream to json. - - This is only supported for datastreams convertible to Arrow records. - To control the number of files, use ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {self._uuid}_{block_idx}.json, where ``uuid`` is an - unique id for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.write_json("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where json - files will be written to. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. Does nothing if all directories already exist. - arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - pandas_json_args_fn: Callable that returns a dictionary of write - arguments to use when writing each block to a file. Overrides - any duplicate keys from pandas_json_args. 
This should be used - instead of pandas_json_args if any of your write arguments - cannot be pickled, or if you'd like to lazily resolve the write - arguments for each datastream block. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - pandas_json_args: These args will be passed to - pandas.DataFrame.to_json(), which we use under the hood to - write out each Datastream block. These - are dict(orient="records", lines=True) by default. - """ - self.write_datasource( - JSONDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - write_args_fn=pandas_json_args_fn, - **pandas_json_args, - ) - - @ConsumptionAPI - def write_csv( - self, - path: str, - *, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - arrow_csv_args_fn: Callable[[], Dict[str, Any]] = lambda: {}, - ray_remote_args: Dict[str, Any] = None, - **arrow_csv_args, - ) -> None: - """Write the datastream to csv. - - This is only supported for datastreams convertible to Arrow records. - To control the number of files, use ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {uuid}_{block_idx}.csv, where ``uuid`` is an unique id - for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.write_csv("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where csv - files will be written to. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. 
Does nothing if all directories already exist. - arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - arrow_csv_args_fn: Callable that returns a dictionary of write - arguments to use when writing each block to a file. Overrides - any duplicate keys from arrow_csv_args. This should be used - instead of arrow_csv_args if any of your write arguments - cannot be pickled, or if you'd like to lazily resolve the write - arguments for each datastream block. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - arrow_csv_args: Other CSV write options to pass to pyarrow. - """ - self.write_datasource( - CSVDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - write_args_fn=arrow_csv_args_fn, - **arrow_csv_args, - ) - - @ConsumptionAPI - def write_tfrecords( - self, - path: str, - *, - tf_schema: Optional["schema_pb2.Schema"] = None, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - ray_remote_args: Dict[str, Any] = None, - ) -> None: - """Write the datastream to TFRecord files. - - The `TFRecord `_ - files will contain - `tf.train.Example `_ # noqa: E501 - records, with one Example record for each row in the datastream. - - .. warning:: - tf.train.Feature only natively stores ints, floats, and bytes, - so this function only supports datastreams with these data types, - and will error if the datastream contains unsupported types. - - This is only supported for datastreams convertible to Arrow records. 
- To control the number of files, use ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {uuid}_{block_idx}.tfrecords, where ``uuid`` is an unique id - for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.from_items([ - ... { "name": "foo", "score": 42 }, - ... { "name": "bar", "score": 43 }, - ... ]) - >>> ds.write_tfrecords("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where tfrecords - files will be written to. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. Does nothing if all directories already exist. - arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - - """ - - self.write_datasource( - TFRecordDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - tf_schema=tf_schema, - ) - - @PublicAPI(stability="alpha") - @ConsumptionAPI - def write_webdataset( - self, - path: str, - *, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - ray_remote_args: Dict[str, Any] = None, - encoder: Optional[Union[bool, str, callable, list]] = True, - ) -> None: - """Write the datastream to WebDataset files. 
- - The `TFRecord `_ - files will contain - `tf.train.Example `_ # noqa: E501 - records, with one Example record for each row in the datastream. - - .. warning:: - tf.train.Feature only natively stores ints, floats, and bytes, - so this function only supports datastreams with these data types, - and will error if the datastream contains unsupported types. - - This is only supported for datastreams convertible to Arrow records. - To control the number of files, use ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {uuid}_{block_idx}.tfrecords, where ``uuid`` is an unique id - for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.from_items([ - ... { "name": "foo", "score": 42 }, - ... { "name": "bar", "score": 43 }, - ... ]) - >>> ds.write_webdataset("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where tfrecords - files will be written to. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. Does nothing if all directories already exist. - arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. 
- - """ - - from ray.data.datasource.webdataset_datasource import WebDatasetDatasource - - self.write_datasource( - WebDatasetDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - encoder=encoder, - ) - - @ConsumptionAPI - def write_numpy( - self, - path: str, - *, - column: str = TENSOR_COLUMN_NAME, - filesystem: Optional["pyarrow.fs.FileSystem"] = None, - try_create_dir: bool = True, - arrow_open_stream_args: Optional[Dict[str, Any]] = None, - block_path_provider: BlockWritePathProvider = DefaultBlockWritePathProvider(), - ray_remote_args: Dict[str, Any] = None, - ) -> None: - """Write a tensor column of the datastream to npy files. - - This is only supported for datastreams convertible to Arrow records that - contain a TensorArray column. To control the number of files, use - ``.repartition()``. - - Unless a custom block path provider is given, the format of the output - files will be {self._uuid}_{block_idx}.npy, where ``uuid`` is an unique - id for the datastream. - - Examples: - >>> import ray - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.write_numpy("s3://bucket/path") # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - path: The path to the destination root directory, where npy - files will be written to. - column: The name of the table column that contains the tensor to - be written. The default is ``"__value__"``, the column name that - Datastream uses for storing tensors in single-column tables. - filesystem: The filesystem implementation to write to. - try_create_dir: Try to create all directories in destination path - if True. Does nothing if all directories already exist. 
- arrow_open_stream_args: kwargs passed to - pyarrow.fs.FileSystem.open_output_stream - block_path_provider: BlockWritePathProvider implementation to - write each datastream block to a custom output path. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - """ - self.write_datasource( - NumpyDatasource(), - ray_remote_args=ray_remote_args, - path=path, - datastream_uuid=self._uuid, - column=column, - filesystem=filesystem, - try_create_dir=try_create_dir, - open_stream_args=arrow_open_stream_args, - block_path_provider=block_path_provider, - ) - - @ConsumptionAPI - def write_mongo( - self, - uri: str, - database: str, - collection: str, - ray_remote_args: Dict[str, Any] = None, - ) -> None: - """Write the datastream to a MongoDB datasource. - - This is only supported for datastreams convertible to Arrow records. - To control the number of parallel write tasks, use ``.repartition()`` - before calling this method. - - .. note:: - Currently, this supports only a subset of the pyarrow's types, due to the - limitation of pymongoarrow which is used underneath. Writing unsupported - types will fail on type checking. See all the supported types at: - https://mongo-arrow.readthedocs.io/en/latest/supported_types.html. - - .. note:: - The records will be inserted into MongoDB as new documents. If a record has - the _id field, this _id must be non-existent in MongoDB, otherwise the write - will be rejected and fail (hence preexisting documents are protected from - being mutated). It's fine to not have _id field in record and MongoDB will - auto generate one at insertion. 
- - Examples: - >>> import ray - >>> import pandas as pd - >>> docs = [{"title": "MongoDB Datasource test"} for key in range(4)] - >>> ds = ray.data.from_pandas(pd.DataFrame(docs)) - >>> ds.write_mongo( # doctest: +SKIP - >>> MongoDatasource(), # doctest: +SKIP - >>> uri="mongodb://username:password@mongodb0.example.com:27017/?authSource=admin", # noqa: E501 # doctest: +SKIP - >>> database="my_db", # doctest: +SKIP - >>> collection="my_collection", # doctest: +SKIP - >>> ) # doctest: +SKIP - - Args: - uri: The URI to the destination MongoDB where the datastream will be - written to. For the URI format, see details in - https://www.mongodb.com/docs/manual/reference/connection-string/. - database: The name of the database. This database must exist otherwise - ValueError will be raised. - collection: The name of the collection in the database. This collection - must exist otherwise ValueError will be raised. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - """ - from ray.data.datasource import MongoDatasource - - self.write_datasource( - MongoDatasource(), - ray_remote_args=ray_remote_args, - uri=uri, - database=database, - collection=collection, - ) - - @ConsumptionAPI - def write_datasource( - self, - datasource: Datasource[T], - *, - ray_remote_args: Dict[str, Any] = None, - **write_args, - ) -> None: - """Write the datastream to a custom datasource. - - Examples: - >>> import ray - >>> from ray.data.datasource import Datasource - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> class CustomDatasource(Datasource): # doctest: +SKIP - ... # define custom data source - ... pass # doctest: +SKIP - >>> ds.write_datasource(CustomDatasource(...)) # doctest: +SKIP - - Time complexity: O(datastream size / parallelism) - - Args: - datasource: The datasource to write to. - ray_remote_args: Kwargs passed to ray.remote in the write tasks. - write_args: Additional write args to pass to the datasource. 
- """ - if ray_remote_args is None: - ray_remote_args = {} - path = write_args.get("path", None) - if path and _is_local_scheme(path): - if ray.util.client.ray.is_connected(): - raise ValueError( - f"The local scheme paths {path} are not supported in Ray Client." - ) - ray_remote_args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( - ray.get_runtime_context().get_node_id(), - soft=False, - ) - - if type(datasource).write != Datasource.write: - write_fn = generate_write_fn(datasource, **write_args) - - def write_fn_wrapper(blocks: Iterator[Block], ctx, fn) -> Iterator[Block]: - return write_fn(blocks, ctx) - - plan = self._plan.with_stage( - OneToOneStage( - "Write", - write_fn_wrapper, - "tasks", - ray_remote_args, - fn=lambda x: x, - ) - ) - - logical_plan = self._logical_plan - if logical_plan is not None: - write_op = Write( - logical_plan.dag, - datasource, - ray_remote_args=ray_remote_args, - **write_args, - ) - logical_plan = LogicalPlan(write_op) - - try: - import pandas as pd - - self._write_ds = Datastream( - plan, self._epoch, self._lazy, logical_plan - ).materialize() - blocks = ray.get(self._write_ds._plan.execute().get_blocks()) - assert all( - isinstance(block, pd.DataFrame) and len(block) == 1 - for block in blocks - ) - write_results = [block["write_result"][0] for block in blocks] - datasource.on_write_complete(write_results) - except Exception as e: - datasource.on_write_failed([], e) - raise - else: - logger.warning( - "The Datasource.do_write() is deprecated in " - "Ray 2.4 and will be removed in future release. Use " - "Datasource.write() instead." - ) - - ctx = DataContext.get_current() - blocks, metadata = zip(*self._plan.execute().get_blocks_with_metadata()) - # Prepare write in a remote task so that in Ray client mode, we - # don't do metadata resolution from the client machine. 
- do_write = cached_remote_fn(_do_write, retry_exceptions=False, num_cpus=0) - write_results: List[ObjectRef[WriteResult]] = ray.get( - do_write.remote( - datasource, - ctx, - blocks, - metadata, - ray_remote_args, - _wrap_arrow_serialization_workaround(write_args), - ) - ) - - progress = ProgressBar("Write Progress", len(write_results)) - try: - progress.block_until_complete(write_results) - datasource.on_write_complete(ray.get(write_results)) - except Exception as e: - datasource.on_write_failed(write_results, e) - raise - finally: - progress.close() - - @ConsumptionAPI( - delegate=( - "Calling any of the consumption methods on the returned ``DataIterator``" - ) - ) - def iterator(self) -> DataIterator: - """Return a :class:`~ray.data.DataIterator` that - can be used to repeatedly iterate over the datastream. - - Examples: - >>> import ray - >>> for batch in ray.data.range( - ... 1000000 - ... ).iterator().iter_batches(): # doctest: +SKIP - ... print(batch) # doctest: +SKIP - - .. note:: - It is recommended to use ``DataIterator`` methods over directly - calling methods such as ``iter_batches()``. - """ - return DataIteratorImpl(self) - - @ConsumptionAPI - def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]]: - """Return a local row iterator over the datastream. - - If the datastream is a tabular datastream (Arrow/Pandas blocks), dict-like - mappings :py:class:`~ray.data.row.TableRow` are yielded for each row by the - iterator. If the datastream is not tabular, the raw row is yielded. - - Examples: - >>> import ray - >>> for i in ray.data.range(1000000).iter_rows(): # doctest: +SKIP - ... print(i) # doctest: +SKIP - - Time complexity: O(1) - - Args: - prefetch_blocks: The number of blocks to prefetch ahead of the - current block during the scan. - - Returns: - A local iterator over the entire datastream. 
- """ - - return self.iterator().iter_rows(prefetch_blocks=prefetch_blocks) - - @ConsumptionAPI - def iter_batches( - self, - *, - prefetch_batches: int = 1, - batch_size: Optional[int] = 256, - batch_format: Optional[str] = "default", - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - _collate_fn: Optional[Callable[[DataBatch], Any]] = None, - # Deprecated. - prefetch_blocks: int = 0, - ) -> Iterator[DataBatch]: - """Return a local batched iterator over the datastream. - - Examples: - >>> import ray - >>> for batch in ray.data.range(1000000).iter_batches(): # doctest: +SKIP - ... print(batch) # doctest: +SKIP - - Time complexity: O(1) - - Args: - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool will be used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. You can revert back to the old - prefetching behavior that uses `prefetch_blocks` by setting - `use_legacy_iter_batches` to True in the datastreamContext. - batch_size: The number of rows in each batch, or None to use entire blocks - as batches (blocks may contain different number of rows). - The final batch may include fewer than ``batch_size`` rows if - ``drop_last`` is ``False``. Defaults to 256. - batch_format: Specify ``"default"`` to use the default block format - (promotes tables to Pandas and tensors to NumPy), ``"pandas"`` to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or - ``"numpy"`` to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None - to return the underlying block exactly as is with no additional - formatting. The default is "default". - drop_last: Whether to drop the last batch if it's incomplete. 
- local_shuffle_buffer_size: If non-None, the data will be randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer will be drained. - local_shuffle_seed: The seed to use for the local random shuffle. - - Returns: - An iterator over record batches. - """ - batch_format = _apply_strict_mode_batch_format(batch_format) - if batch_format == "native": - logger.warning("The 'native' batch format has been renamed 'default'.") - return self.iterator().iter_batches( - prefetch_batches=prefetch_batches, - prefetch_blocks=prefetch_blocks, - batch_size=batch_size, - batch_format=batch_format, - drop_last=drop_last, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - _collate_fn=_collate_fn, - ) - - @ConsumptionAPI - def iter_torch_batches( - self, - *, - prefetch_batches: int = 1, - batch_size: Optional[int] = 256, - dtypes: Optional[Union["torch.dtype", Dict[str, "torch.dtype"]]] = None, - device: Optional[str] = None, - collate_fn: Optional[ - Callable[[Union[np.ndarray, Dict[str, np.ndarray]]], Any] - ] = None, - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - # Deprecated - prefetch_blocks: int = 0, - ) -> Iterator["TorchTensorBatchType"]: - """Return a local batched iterator of Torch Tensors over the datastream. - - This iterator will yield single-tensor batches if the underlying datastream - consists of a single column; otherwise, it will yield a dictionary of - column-tensors. If looking for more flexibility in the tensor conversion (e.g. - casting dtypes) or the batch format, try use `.iter_batches` directly, which is - a lower-level API. - - Examples: - >>> import ray - >>> for batch in ray.data.range( # doctest: +SKIP - ... 
12, - ... ).iter_torch_batches(batch_size=4): - ... print(batch.shape) # doctest: +SKIP - torch.Size([4, 1]) - torch.Size([4, 1]) - torch.Size([4, 1]) - - Time complexity: O(1) - - Args: - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool will be used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. You can revert back to the old - prefetching behavior that uses `prefetch_blocks` by setting - `use_legacy_iter_batches` to True in the datastreamContext. - batch_size: The number of rows in each batch, or None to use entire blocks - as batches (blocks may contain different number of rows). - The final batch may include fewer than ``batch_size`` rows if - ``drop_last`` is ``False``. Defaults to 256. - dtypes: The Torch dtype(s) for the created tensor(s); if None, the dtype - will be inferred from the tensor data. - device: The device on which the tensor should be placed; if None, the Torch - tensor will be constructed on the CPU. - collate_fn: A function to convert a Numpy batch to a PyTorch tensor batch. - Potential use cases include collating along a dimension other than the - first, padding sequences of various lengths, or generally handling - batches of different length tensors. If not provided, the default - collate function is used which simply converts the batch of numpy - arrays to a batch of PyTorch tensors. This API is still experimental - and is subject to change. - drop_last: Whether to drop the last batch if it's incomplete. - local_shuffle_buffer_size: If non-None, the data will be randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer will be drained. 
This - buffer size must be greater than or equal to ``batch_size``, and - therefore ``batch_size`` must also be specified when using local - shuffling. - local_shuffle_seed: The seed to use for the local random shuffle. - - Returns: - An iterator over Torch Tensor batches. - """ - return self.iterator().iter_torch_batches( - prefetch_batches=prefetch_batches, - prefetch_blocks=prefetch_blocks, - batch_size=batch_size, - dtypes=dtypes, - device=device, - collate_fn=collate_fn, - drop_last=drop_last, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - ) - - @ConsumptionAPI - def iter_tf_batches( - self, - *, - prefetch_batches: int = 1, - batch_size: Optional[int] = 256, - dtypes: Optional[Union["tf.dtypes.DType", Dict[str, "tf.dtypes.DType"]]] = None, - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - # Deprecated - prefetch_blocks: int = 0, - ) -> Iterator[TensorFlowTensorBatchType]: - """Return a local batched iterator of TensorFlow Tensors over the datastream. - - This iterator will yield single-tensor batches of the underlying datastream - consists of a single column; otherwise, it will yield a dictionary of - column-tensors. - - .. tip:: - If you don't need the additional flexibility provided by this method, - consider using :meth:`~ray.data.Datastream.to_tf` instead. It's easier - to use. - - Examples: - >>> import ray - >>> for batch in ray.data.range( # doctest: +SKIP - ... 12, - ... ).iter_tf_batches(batch_size=4): - ... print(batch.shape) # doctest: +SKIP - (4, 1) - (4, 1) - (4, 1) - - Time complexity: O(1) - - Args: - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool will be used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. 
You can revert back to the old - prefetching behavior that uses `prefetch_blocks` by setting - `use_legacy_iter_batches` to True in the datastreamContext. - batch_size: The number of rows in each batch, or None to use entire blocks - as batches (blocks may contain different number of rows). - The final batch may include fewer than ``batch_size`` rows if - ``drop_last`` is ``False``. Defaults to 256. - dtypes: The TensorFlow dtype(s) for the created tensor(s); if None, the - dtype will be inferred from the tensor data. - drop_last: Whether to drop the last batch if it's incomplete. - local_shuffle_buffer_size: If non-None, the data will be randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer will be drained. This - buffer size must be greater than or equal to ``batch_size``, and - therefore ``batch_size`` must also be specified when using local - shuffling. - local_shuffle_seed: The seed to use for the local random shuffle. - - Returns: - An iterator over TensorFlow Tensor batches. 
- """ - return self.iterator().iter_tf_batches( - prefetch_batches=prefetch_batches, - prefetch_blocks=prefetch_blocks, - batch_size=batch_size, - dtypes=dtypes, - drop_last=drop_last, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - ) - - @ConsumptionAPI(pattern="Time complexity:") - def to_torch( - self, - *, - label_column: Optional[str] = None, - feature_columns: Optional[ - Union[List[str], List[List[str]], Dict[str, List[str]]] - ] = None, - label_column_dtype: Optional["torch.dtype"] = None, - feature_column_dtypes: Optional[ - Union["torch.dtype", List["torch.dtype"], Dict[str, "torch.dtype"]] - ] = None, - batch_size: int = 1, - prefetch_batches: int = 1, - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - unsqueeze_label_tensor: bool = True, - unsqueeze_feature_tensors: bool = True, - # Deprecated - prefetch_blocks: int = 0, - ) -> "torch.utils.data.IterableDataset": - """Return a Torch IterableDataset over this datastream. - - This is only supported for datastreams convertible to Arrow records. - - It is recommended to use the returned ``IterableDataset`` directly - instead of passing it into a torch ``DataLoader``. - - Each element in IterableDataset will be a tuple consisting of 2 - elements. The first item contains the feature tensor(s), and the - second item is the label tensor. Those can take on different - forms, depending on the specified arguments. 
- - For the features tensor (N is the ``batch_size`` and n, m, k - are the number of features per tensor): - - * If ``feature_columns`` is a ``List[str]``, the features will be - a tensor of shape (N, n), with columns corresponding to - ``feature_columns`` - - * If ``feature_columns`` is a ``List[List[str]]``, the features will be - a list of tensors of shape [(N, m),...,(N, k)], with columns of each - tensor corresponding to the elements of ``feature_columns`` - - * If ``feature_columns`` is a ``Dict[str, List[str]]``, the features - will be a dict of key-tensor pairs of shape - {key1: (N, m),..., keyN: (N, k)}, with columns of each - tensor corresponding to the value of ``feature_columns`` under the - key. - - If ``unsqueeze_label_tensor=True`` (default), the label tensor will be - of shape (N, 1). Otherwise, it will be of shape (N,). - If ``label_column`` is specified as ``None``, then no column from the - ``Datastream`` will be treated as the label, and the output label tensor - will be ``None``. - - Note that you probably want to call ``.split()`` on this datastream if - there are to be multiple Torch workers consuming the data. - - Time complexity: O(1) - - Args: - label_column: The name of the column used as the - label (second element of the output list). Can be None for - prediction, in which case the second element of returned - tuple will also be None. - feature_columns: The names of the columns - to use as the features. Can be a list of lists or - a dict of string-list pairs for multi-tensor output. - If None, then use all columns except the label column as - the features. - label_column_dtype: The torch dtype to - use for the label column. If None, then automatically infer - the dtype. - feature_column_dtypes: The dtypes to use for the feature - tensors. This should match the format of ``feature_columns``, - or be a single dtype, in which case it will be applied to - all tensors. If None, then automatically infer the dtype. 
- batch_size: How many samples per batch to yield at a time. - Defaults to 1. - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool will be used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. You can revert back to the old - prefetching behavior that uses `prefetch_blocks` by setting - `use_legacy_iter_batches` to True in the datastreamContext. - drop_last: Set to True to drop the last incomplete batch, - if the datastream size is not divisible by the batch size. If - False and the size of the stream is not divisible by the batch - size, then the last batch will be smaller. Defaults to False. - local_shuffle_buffer_size: If non-None, the data will be randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer will be drained. This - buffer size must be greater than or equal to ``batch_size``, and - therefore ``batch_size`` must also be specified when using local - shuffling. - local_shuffle_seed: The seed to use for the local random shuffle. - unsqueeze_label_tensor: If set to True, the label tensor - will be unsqueezed (reshaped to (N, 1)). Otherwise, it will - be left as is, that is (N, ). In general, regression loss - functions expect an unsqueezed tensor, while classification - loss functions expect a squeezed one. Defaults to True. - unsqueeze_feature_tensors: If set to True, the features tensors - will be unsqueezed (reshaped to (N, 1)) before being concatenated into - the final features tensor. Otherwise, they will be left as is, that is - (N, ). Defaults to True. - - Returns: - A torch IterableDataset. 
- """ - - return self.iterator().to_torch( - label_column=label_column, - feature_columns=feature_columns, - label_column_dtype=label_column_dtype, - feature_column_dtypes=feature_column_dtypes, - batch_size=batch_size, - prefetch_blocks=prefetch_blocks, - prefetch_batches=prefetch_batches, - drop_last=drop_last, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - unsqueeze_label_tensor=unsqueeze_label_tensor, - unsqueeze_feature_tensors=unsqueeze_feature_tensors, - ) - - @ConsumptionAPI - def to_tf( - self, - feature_columns: Union[str, List[str]], - label_columns: Union[str, List[str]], - *, - prefetch_batches: int = 1, - batch_size: int = 1, - drop_last: bool = False, - local_shuffle_buffer_size: Optional[int] = None, - local_shuffle_seed: Optional[int] = None, - # Deprecated - prefetch_blocks: int = 0, - ) -> "tf.data.Dataset": - """Return a TF Dataset over this datastream. - - .. warning:: - If your datastream contains ragged tensors, this method errors. To prevent - errors, resize tensors or - :ref:`disable tensor extension casting `. - - Examples: - >>> import ray - >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") - >>> ds - Datastream( - num_blocks=1, - num_rows=150, - schema={ - sepal length (cm): double, - sepal width (cm): double, - petal length (cm): double, - petal width (cm): double, - target: int64 - } - ) - - If your model accepts a single tensor as input, specify a single feature column. - - >>> ds.to_tf(feature_columns="sepal length (cm)", label_columns="target") # doctest: +SKIP - <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> - - If your model accepts a dictionary as input, specify a list of feature columns. 
- - >>> ds.to_tf(["sepal length (cm)", "sepal width (cm)"], "target") # doctest: +SKIP - <_OptionsDataset element_spec=({'sepal length (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), 'sepal width (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal width (cm)')}, TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> - - If your datastream contains multiple features but your model accepts a single - tensor as input, combine features with - :class:`~ray.data.preprocessors.Concatenator`. - - >>> from ray.data.preprocessors import Concatenator - >>> preprocessor = Concatenator(output_column_name="features", exclude="target") - >>> ds = preprocessor.transform(ds) - >>> ds - Concatenator - +- Datastream( - num_blocks=1, - num_rows=150, - schema={ - sepal length (cm): double, - sepal width (cm): double, - petal length (cm): double, - petal width (cm): double, - target: int64 - } - ) - >>> ds.to_tf("features", "target") # doctest: +SKIP - <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> - - Args: - feature_columns: Columns that correspond to model inputs. If this is a - string, the input data is a tensor. If this is a list, the input data - is a ``dict`` that maps column names to their tensor representation. - label_column: Columns that correspond to model targets. If this is a - string, the target data is a tensor. If this is a list, the target data - is a ``dict`` that maps column names to their tensor representation. - prefetch_batches: The number of batches to fetch ahead of the current batch - to fetch. If set to greater than 0, a separate threadpool will be used - to fetch the objects to the local node, format the batches, and apply - the collate_fn. Defaults to 1. You can revert back to the old - prefetching behavior that uses `prefetch_blocks` by setting - `use_legacy_iter_batches` to True in the datastreamContext. 
- batch_size: Record batch size. Defaults to 1. - drop_last: Set to True to drop the last incomplete batch, - if the datastream size is not divisible by the batch size. If - False and the size of the stream is not divisible by the batch - size, then the last batch will be smaller. Defaults to False. - local_shuffle_buffer_size: If non-None, the data will be randomly shuffled - using a local in-memory shuffle buffer, and this value will serve as the - minimum number of rows that must be in the local in-memory shuffle - buffer in order to yield a batch. When there are no more rows to add to - the buffer, the remaining rows in the buffer will be drained. This - buffer size must be greater than or equal to ``batch_size``, and - therefore ``batch_size`` must also be specified when using local - shuffling. - local_shuffle_seed: The seed to use for the local random shuffle. - - Returns: - A ``tf.data.Dataset`` that yields inputs and targets. - - .. seealso:: - - :meth:`~ray.data.Datastream.iter_tf_batches` - Call this method if you need more flexibility. - - """ # noqa: E501 - - return self.iterator().to_tf( - feature_columns=feature_columns, - label_columns=label_columns, - prefetch_batches=prefetch_batches, - prefetch_blocks=prefetch_blocks, - drop_last=drop_last, - batch_size=batch_size, - local_shuffle_buffer_size=local_shuffle_buffer_size, - local_shuffle_seed=local_shuffle_seed, - ) - - @ConsumptionAPI(pattern="Time complexity:") - def to_dask( - self, - meta: Union[ - "pandas.DataFrame", - "pandas.Series", - Dict[str, Any], - Iterable[Any], - Tuple[Any], - None, - ] = None, - ) -> "dask.DataFrame": - """Convert this datastream into a Dask DataFrame. - - This is only supported for datastreams convertible to Arrow records. - - Note that this function will set the Dask scheduler to Dask-on-Ray - globally, via the config. 
- - Time complexity: O(datastream size / parallelism) - - Args: - meta: An empty pandas DataFrame or Series that matches the dtypes and column - names of the stream. This metadata is necessary for many algorithms in - dask dataframe to work. For ease of use, some alternative inputs are - also available. Instead of a DataFrame, a dict of ``{name: dtype}`` or - iterable of ``(name, dtype)`` can be provided (note that the order of - the names should match the order of the columns). Instead of a series, a - tuple of ``(name, dtype)`` can be used. - By default, this will be inferred from the underlying Datastream schema, - with this argument supplying an optional override. - - Returns: - A Dask DataFrame created from this datastream. - """ - import dask - import dask.dataframe as dd - import pandas as pd - - try: - import pyarrow as pa - except Exception: - pa = None - - from ray.data._internal.pandas_block import PandasBlockSchema - from ray.util.client.common import ClientObjectRef - from ray.util.dask import ray_dask_get - - dask.config.set(scheduler=ray_dask_get) - - @dask.delayed - def block_to_df(block: Block): - if isinstance(block, (ray.ObjectRef, ClientObjectRef)): - raise ValueError( - "Datastream.to_dask() must be used with Dask-on-Ray, please " - "set the Dask scheduler to ray_dask_get (located in " - "ray.util.dask)." - ) - return _block_to_df(block) - - if meta is None: - from ray.data.extensions import TensorDtype - - # Infer Dask metadata from Datastream schema. 
- schema = self.schema(fetch_if_missing=True) - if isinstance(schema, PandasBlockSchema): - meta = pd.DataFrame( - { - col: pd.Series( - dtype=( - dtype - if not isinstance(dtype, TensorDtype) - else np.object_ - ) - ) - for col, dtype in zip(schema.names, schema.types) - } - ) - elif pa is not None and isinstance(schema, pa.Schema): - from ray.data.extensions import ArrowTensorType - - if any(isinstance(type_, ArrowTensorType) for type_ in schema.types): - meta = pd.DataFrame( - { - col: pd.Series( - dtype=( - dtype.to_pandas_dtype() - if not isinstance(dtype, ArrowTensorType) - else np.object_ - ) - ) - for col, dtype in zip(schema.names, schema.types) - } - ) - else: - meta = schema.empty_table().to_pandas() - - ddf = dd.from_delayed( - [block_to_df(block) for block in self.get_internal_block_refs()], - meta=meta, - ) - return ddf - - @ConsumptionAPI(pattern="Time complexity:") - def to_mars(self) -> "mars.DataFrame": - """Convert this datastream into a MARS dataframe. - - Time complexity: O(datastream size / parallelism) - - Returns: - A MARS dataframe created from this datastream. 
- """ - import pandas as pd - import pyarrow as pa - from mars.dataframe.datasource.read_raydataset import DataFrameReadRayDataset - from mars.dataframe.utils import parse_index - - from ray.data._internal.pandas_block import PandasBlockSchema - - refs = self.to_pandas_refs() - # remove this when https://github.com/mars-project/mars/issues/2945 got fixed - schema = self.schema() - if isinstance(schema, PandasBlockSchema): - dtypes = pd.Series(schema.types, index=schema.names) - elif isinstance(schema, pa.Schema): - dtypes = schema.empty_table().to_pandas().dtypes - else: - raise NotImplementedError(f"Unsupported format of schema {schema}") - index_value = parse_index(pd.RangeIndex(-1)) - columns_value = parse_index(dtypes.index, store_data=True) - op = DataFrameReadRayDataset(refs=refs) - return op(index_value=index_value, columns_value=columns_value, dtypes=dtypes) - - @ConsumptionAPI(pattern="Time complexity:") - def to_modin(self) -> "modin.DataFrame": - """Convert this datastream into a Modin dataframe. - - This works by first converting this datastream into a distributed set of - Pandas dataframes (using ``.to_pandas_refs()``). Please see caveats - there. Then the individual dataframes are used to create the modin - DataFrame using - ``modin.distributed.dataframe.pandas.partitions.from_partitions()``. - - This is only supported for datastreams convertible to Arrow records. - This function induces a copy of the data. For zero-copy access to the - underlying data, consider using ``.to_arrow()`` or - ``.get_internal_block_refs()``. - - Time complexity: O(datastream size / parallelism) - - Returns: - A Modin dataframe created from this datastream. 
- """ - - from modin.distributed.dataframe.pandas.partitions import from_partitions - - pd_objs = self.to_pandas_refs() - return from_partitions(pd_objs, axis=0) - - @ConsumptionAPI(pattern="Time complexity:") - def to_spark(self, spark: "pyspark.sql.SparkSession") -> "pyspark.sql.DataFrame": - """Convert this datastream into a Spark dataframe. - - Time complexity: O(datastream size / parallelism) - - Returns: - A Spark dataframe created from this datastream. - """ - import raydp - - return raydp.spark.ray_dataset_to_spark_dataframe( - spark, self.schema(), self.get_internal_block_refs() - ) - - @ConsumptionAPI(pattern="Time complexity:") - def to_pandas(self, limit: int = 100000) -> "pandas.DataFrame": - """Convert this datastream into a single Pandas DataFrame. - - This is only supported for datastreams convertible to Arrow or Pandas - records. An error is raised if the number of records exceeds the - provided limit. Note that you can use ``.limit()`` on the datastream - beforehand to truncate the datastream manually. - - Time complexity: O(datastream size) - - Args: - limit: The maximum number of records to return. An error will be - raised if the limit is exceeded. - - Returns: - A Pandas DataFrame created from this datastream, containing a limited - number of records. - """ - count = self.count() - if count > limit: - raise ValueError( - f"the datastream has more than the given limit of {limit} " - f"records: {count}. If you are sure that a DataFrame with " - f"{count} rows will fit in local memory, use " - f"ds.to_pandas(limit={count})." - ) - blocks = self.get_internal_block_refs() - output = DelegatingBlockBuilder() - for block in blocks: - output.add_block(ray.get(block)) - block = output.build() - return _block_to_df(block) - - @ConsumptionAPI(pattern="Time complexity:") - @DeveloperAPI - def to_pandas_refs(self) -> List[ObjectRef["pandas.DataFrame"]]: - """Convert this datastream into a distributed set of Pandas dataframes. 
- - This is only supported for datastreams convertible to Arrow records. - This function induces a copy of the data. For zero-copy access to the - underlying data, consider using ``.to_arrow()`` or - ``.get_internal_block_refs()``. - - Time complexity: O(datastream size / parallelism) - - Returns: - A list of remote Pandas dataframes created from this datastream. - """ - - block_to_df = cached_remote_fn(_block_to_df) - return [block_to_df.remote(block) for block in self.get_internal_block_refs()] - - @DeveloperAPI - def to_numpy_refs( - self, *, column: Optional[str] = None - ) -> List[ObjectRef[np.ndarray]]: - """Convert this datastream into a distributed set of NumPy ndarrays. - - This is only supported for datastreams convertible to NumPy ndarrays. - This function induces a copy of the data. For zero-copy access to the - underlying data, consider using ``.to_arrow()`` or - ``.get_internal_block_refs()``. - - Time complexity: O(datastream size / parallelism) - - Args: - column: The name of the column to convert to numpy, or None to specify the - entire row. If not specified for Arrow or Pandas blocks, each returned - future will represent a dict of column ndarrays. - - Returns: - A list of remote NumPy ndarrays created from this datastream. - """ - block_to_ndarray = cached_remote_fn(_block_to_ndarray) - return [ - block_to_ndarray.remote(block, column=column) - for block in self.get_internal_block_refs() - ] - - @ConsumptionAPI(pattern="Time complexity:") - @DeveloperAPI - def to_arrow_refs(self) -> List[ObjectRef["pyarrow.Table"]]: - """Convert this datastream into a distributed set of Arrow tables. - - This is only supported for datastreams convertible to Arrow records. - This function is zero-copy if the existing data is already in Arrow - format. Otherwise, the data will be converted to Arrow format. - - Time complexity: O(1) unless conversion is required. - - Returns: - A list of remote Arrow tables created from this datastream. 
- """ - import pyarrow as pa - - blocks: List[ObjectRef["pyarrow.Table"]] = self.get_internal_block_refs() - # Schema is safe to call since we have already triggered execution with - # get_internal_block_refs. - schema = self.schema(fetch_if_missing=True) - if isinstance(schema, pa.Schema): - # Zero-copy path. - return blocks - - block_to_arrow = cached_remote_fn(_block_to_arrow) - return [block_to_arrow.remote(block) for block in blocks] - - @ConsumptionAPI(pattern="Args:") - def to_random_access_dataset( - self, - key: str, - num_workers: Optional[int] = None, - ) -> RandomAccessDataset: - """Convert this datastream into a distributed RandomAccessDataset (EXPERIMENTAL). - - RandomAccessDataset partitions the datastream across the cluster by the given - sort key, providing efficient random access to records via binary search. A - number of worker actors are created, each of which has zero-copy access to the - underlying sorted data blocks of the datastream. - - Note that the key must be unique in the datastream. If there are duplicate keys, - an arbitrary value is returned. - - This is only supported for Arrow-format datastreams. - - Args: - key: The key column over which records can be queried. - num_workers: The number of actors to use to serve random access queries. - By default, this is determined by multiplying the number of Ray nodes - in the cluster by four. As a rule of thumb, you can expect each worker - to provide ~3000 records / second via ``get_async()``, and - ~10000 records / second via ``multiget()``. - """ - if num_workers is None: - num_workers = 4 * len(ray.nodes()) - return RandomAccessDataset(self, key, num_workers=num_workers) - - @ConsumptionAPI - def repeat(self, times: Optional[int] = None) -> "DatasetPipeline[T]": - """Convert this into a DatasetPipeline by looping over this datastream. - - Transformations prior to the call to ``repeat()`` are evaluated once. 
- Transformations done on the returned pipeline are evaluated on each - loop of the pipeline over the base datastream. - - Note that every repeat of the datastream is considered an "epoch" for - the purposes of ``DatasetPipeline.iter_epochs()``. - - Examples: - >>> import ray - >>> # Infinite pipeline of numbers [0, 5) - >>> ray.data.range(5, parallelism=1).repeat().take() - [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, ...] - >>> # Can apply transformations to the pipeline. - >>> ray.data.range(5, parallelism=1).repeat().map(lambda x: -x).take() - [0, -1, -2, -3, -4, 0, -1, -2, -3, -4, ...] - >>> # Can shuffle each epoch (datastream) in the pipeline. - >>> ray.data.range(5).repeat().random_shuffle().take() # doctest: +SKIP - [2, 3, 0, 4, 1, 4, 0, 2, 1, 3, ...] - - Args: - times: The number of times to loop over this datastream, or None - to repeat indefinitely. - """ - from ray.data._internal.plan import _rewrite_read_stage - from ray.data.dataset_pipeline import DatasetPipeline - - ctx = DataContext.get_current() - if self._plan.is_read_stage_equivalent() and ctx.optimize_fuse_read_stages: - blocks, _, stages = self._plan._get_source_blocks_and_stages() - blocks.clear() - blocks, outer_stats, stages = _rewrite_read_stage(blocks, stages) - read_stage = stages[0] - else: - blocks = self._plan.execute() - outer_stats = self._plan.stats() - read_stage = None - uuid = self._get_uuid() - outer_stats.datastream_uuid = uuid - - if times is not None and times < 1: - raise ValueError("`times` must be >= 1, got {}".format(times)) - - class Iterator: - def __init__(self, blocks): - self._blocks = blocks - self._i = 0 - - def __next__(self) -> Callable[[], "Datastream[T]"]: - if times and self._i >= times: - raise StopIteration - epoch = self._i - blocks = self._blocks - self._i += 1 - - def gen(): - ds = Datastream( - ExecutionPlan( - blocks, - outer_stats, - datastream_uuid=uuid, - run_by_consumer=True, - ), - epoch, - lazy=False, - ) - ds._set_uuid(uuid) - return ds - - return gen - - 
class Iterable: - def __init__(self, blocks): - self._blocks = blocks - - def __iter__(self): - return Iterator(self._blocks) - - pipe = DatasetPipeline(Iterable(blocks), False, length=times or float("inf")) - if read_stage: - pipe = pipe.foreach_window( - lambda ds, read_stage=read_stage: Datastream( - ds._plan.with_stage(read_stage), ds._epoch, True - ) - ) - return pipe - - def window( - self, - *, - blocks_per_window: Optional[int] = None, - bytes_per_window: Optional[int] = None, - ) -> "DatasetPipeline[T]": - """Convert this into a DatasetPipeline by windowing over data blocks. - - Transformations prior to the call to ``window()`` are evaluated in - bulk on the entire datastream. Transformations done on the returned - pipeline are evaluated incrementally per window of blocks as data is - read from the output of the pipeline. - - Windowing execution allows for output to be read sooner without - waiting for all transformations to fully execute, and can also improve - efficiency if transforms use different resources (e.g., GPUs). - - Without windowing:: - - [preprocessing......] - [inference.......] - [write........] - Time -----------------------------------------------------------> - - With windowing:: - - [prep1] [prep2] [prep3] - [infer1] [infer2] [infer3] - [write1] [write2] [write3] - Time -----------------------------------------------------------> - - Examples: - >>> import ray - >>> # Create an inference pipeline. - >>> ds = ray.data.read_binary_files(dir) # doctest: +SKIP - >>> infer = ... # doctest: +SKIP - >>> pipe = ds.window(blocks_per_window=10).map(infer) # doctest: +SKIP - DatasetPipeline(num_windows=40, num_stages=2) - >>> # The higher the stage parallelism, the shorter the pipeline. - >>> pipe = ds.window(blocks_per_window=20).map(infer) # doctest: +SKIP - DatasetPipeline(num_windows=20, num_stages=2) - >>> # Outputs can be incrementally read from the pipeline. - >>> for item in pipe.iter_rows(): # doctest: +SKIP - ... 
print(item) # doctest: +SKIP - - Args: - blocks_per_window: The window size (parallelism) in blocks. - Increasing window size increases pipeline throughput, but also - increases the latency to initial output, since it decreases the - length of the pipeline. Setting this to infinity effectively - disables pipelining. - bytes_per_window: Specify the window size in bytes instead of blocks. - This will be treated as an upper bound for the window size, but each - window will still include at least one block. This is mutually - exclusive with ``blocks_per_window``. - """ - from ray.data._internal.plan import _rewrite_read_stage - from ray.data.dataset_pipeline import DatasetPipeline - - if blocks_per_window is not None and bytes_per_window is not None: - raise ValueError("Only one windowing scheme can be specified.") - - if blocks_per_window is None: - blocks_per_window = 10 - - ctx = DataContext.get_current() - if self._plan.is_read_stage_equivalent() and ctx.optimize_fuse_read_stages: - blocks, _, stages = self._plan._get_source_blocks_and_stages() - blocks.clear() - blocks, outer_stats, stages = _rewrite_read_stage(blocks, stages) - read_stage = stages[0] - else: - blocks = self._plan.execute() - outer_stats = self._plan.stats() - read_stage = None - - class Iterator: - def __init__(self, splits, epoch): - self._splits = splits.copy() - self._epoch = epoch - - def __next__(self) -> "Datastream[T]": - if not self._splits: - raise StopIteration - - blocks = self._splits.pop(0) - - def gen(): - ds = Datastream( - ExecutionPlan(blocks, outer_stats, run_by_consumer=True), - self._epoch, - lazy=True, - ) - return ds - - return gen - - class Iterable: - def __init__(self, blocks, epoch): - if bytes_per_window: - self._splits = blocks.split_by_bytes(bytes_per_window) - else: - self._splits = blocks.split(split_size=blocks_per_window) - try: - sizes = [s.size_bytes() for s in self._splits] - num_blocks = [s.initial_num_blocks() for s in self._splits] - assert [s > 0 for s in 
sizes], sizes - - def fmt(size_bytes): - if size_bytes > 1024 * 1024 * 1024: - return "{}GiB".format( - round(size_bytes / (1024 * 1024 * 1024), 2) - ) - elif size_bytes > 10 * 1024: - return "{}MiB".format(round(size_bytes / (1024 * 1024), 2)) - else: - return "{}b".format(size_bytes) - - mean_bytes = int(np.mean(sizes)) - logger.info( - "Created DatasetPipeline with {} windows: " - "{} min, {} max, {} mean".format( - len(self._splits), - fmt(min(sizes)), - fmt(max(sizes)), - fmt(mean_bytes), - ) - ) - mean_num_blocks = int(np.mean(num_blocks)) - logger.info( - "Blocks per window: " - "{} min, {} max, {} mean".format( - min(num_blocks), - max(num_blocks), - mean_num_blocks, - ) - ) - # TODO(ekl) we should try automatically choosing the default - # windowing settings to meet these best-practice constraints. - avail_parallelism = _estimate_available_parallelism() - if mean_num_blocks < avail_parallelism: - logger.warning( - f"{WARN_PREFIX} This pipeline's parallelism is limited " - f"by its blocks per window to ~{mean_num_blocks} " - "concurrent tasks per window. To maximize " - "performance, increase the blocks per window to at least " - f"{avail_parallelism}. This may require increasing the " - "base datastream's parallelism and/or adjusting the " - "windowing parameters." - ) - else: - logger.info( - f"{OK_PREFIX} This pipeline's per-window parallelism " - "is high enough to fully utilize the cluster." - ) - obj_store_mem = ray.cluster_resources().get( - "object_store_memory", 0 - ) - safe_mem_bytes = int(obj_store_mem * ESTIMATED_SAFE_MEMORY_FRACTION) - if mean_bytes > safe_mem_bytes: - logger.warning( - f"{WARN_PREFIX} This pipeline's windows are " - f"~{fmt(mean_bytes)} in size each and may not fit in " - "object store memory without spilling. To improve " - "performance, consider reducing the size of each window " - f"to {fmt(safe_mem_bytes)} or less." 
- ) - else: - logger.info( - f"{OK_PREFIX} This pipeline's windows likely fit in " - "object store memory without spilling." - ) - except Exception as e: - logger.info( - "Created DatasetPipeline with {} windows; " - "error getting sizes: {}".format( - len(self._splits), - e, - ) - ) - self._epoch = epoch - - def __iter__(self): - return Iterator(self._splits, self._epoch) - - it = Iterable(blocks, self._epoch) - pipe = DatasetPipeline(it, False, length=len(it._splits)) - if read_stage: - pipe = pipe.foreach_window( - lambda ds, read_stage=read_stage: Datastream( - ds._plan.with_stage(read_stage), ds._epoch, True - ) - ) - return pipe - - @Deprecated(message="Use `Datastream.materialize()` instead.") - def fully_executed(self) -> "MaterializedDatastream[T]": - logger.warning( - "Deprecation warning: use Datastream.materialize() instead of " - "fully_executed()." - ) - self._plan.execute(force_read=True) - return self - - @Deprecated( - message="Check `isinstance(Datastream, MaterializedDatastream)` instead." - ) - def is_fully_executed(self) -> bool: - logger.warning( - "Deprecation warning: Check " - "`isinstance(Datastream, MaterializedDatastream)` " - "instead of using is_fully_executed()." - ) - return self._plan.has_computed_output() - - @ConsumptionAPI(pattern="store memory.", insert_after=True) - def materialize(self) -> "MaterializedDatastream[T]": - """Execute and materialize this datastream into object store memory. - - This can be used to read all blocks into memory. By default, Datastream - doesn't read blocks from the datasource until the first transform. - - Note that this does not mutate the original Datastream. Only the blocks of the - returned MaterializedDatastream class are pinned in memory. - - Returns: - A MaterializedDatastream holding the materialized data blocks. 
- """ - copy = Datastream.copy(self, _deep_copy=True, _as=MaterializedDatastream) - copy._plan.execute(force_read=True) - return copy - - @ConsumptionAPI(pattern="timing information.", insert_after=True) - def stats(self) -> str: - """Returns a string containing execution timing information. - - Note that this does not trigger execution, so if the datastream has not yet - executed, an empty string will be returned. - """ - return self._get_stats_summary().to_string() - - def _get_stats_summary(self) -> DatastreamStatsSummary: - return self._plan.stats_summary() - - @ConsumptionAPI(pattern="Time complexity:") - @DeveloperAPI - def get_internal_block_refs(self) -> List[ObjectRef[Block]]: - """Get a list of references to the underlying blocks of this datastream. - - This function can be used for zero-copy access to the data. It blocks - until the underlying blocks are computed. - - Time complexity: O(1) - - Returns: - A list of references to this datastream's blocks. - """ - blocks = self._plan.execute().get_blocks() - self._synchronize_progress_bar() - return blocks - - @Deprecated( - message="Datastream is lazy by default, so this conversion call is no longer " - "needed and this API will be removed in a future release" - ) - def lazy(self) -> "Datastream[T]": - """Enable lazy evaluation. - - Datastream is lazy by default, so this is only useful for datastreams created - from :func:`ray.data.from_items() `, which is - eager. - - The returned datastream is a lazy datastream, where all subsequent operations - on the stream won't be executed until the datastream is consumed - (e.g. ``.take()``, ``.iter_batches()``, ``.to_torch()``, ``.to_tf()``, etc.) - or execution is manually triggered via ``.materialize()``. 
- """ - ds = Datastream( - self._plan, self._epoch, lazy=True, logical_plan=self._logical_plan - ) - ds._set_uuid(self._get_uuid()) - return ds - - def has_serializable_lineage(self) -> bool: - """Whether this datastream's lineage is able to be serialized for storage and - later deserialized, possibly on a different cluster. - - Only datastreams that are created from data that we know will still exist at - deserialization time, e.g. data external to this Ray cluster such as persistent - cloud object stores, support lineage-based serialization. All of the - ray.data.read_*() APIs support lineage-based serialization. - """ - return self._plan.has_lazy_input() - - @DeveloperAPI - def serialize_lineage(self) -> bytes: - """ - Serialize this datastream's lineage, not the actual data or the existing data - futures, to bytes that can be stored and later deserialized, possibly on a - different cluster. - - Note that this will drop all computed data, and that everything will be - recomputed from scratch after deserialization. - - Use :py:meth:`Datastream.deserialize_lineage` to deserialize the serialized - bytes returned from this method into a Datastream. - - .. note:: - Unioned and zipped datastreams, produced by :py:meth`Datastream.union` and - :py:meth:`Datastream.zip`, are not lineage-serializable. - - Returns: - Serialized bytes containing the lineage of this datastream. - """ - if not self.has_serializable_lineage(): - raise ValueError( - "Lineage-based serialization is not supported for this stream, which " - "means that it cannot be used as a tunable hyperparameter. " - "Lineage-based serialization is explicitly NOT supported for unioned " - "or zipped datastreams (see docstrings for those methods), and is only " - "supported for datastreams created from data that we know will still " - "exist at deserialization time, e.g. external data in persistent cloud " - "object stores or in-memory data from long-lived clusters. 
Concretely, " - "all ray.data.read_*() APIs should support lineage-based " - "serialization, while all of the ray.data.from_*() APIs do not. To " - "allow this stream to be serialized to storage, write the data to an " - "external store (such as AWS S3, GCS, or Azure Blob Storage) using the " - "Datastream.write_*() APIs, and serialize a new datastream reading " - "from the external store using the ray.data.read_*() APIs." - ) - # Copy Datastream and clear the blocks from the execution plan so only the - # Datastream's lineage is serialized. - plan_copy = self._plan.deep_copy(preserve_uuid=True) - ds = Datastream(plan_copy, self._get_epoch(), self._lazy) - ds._plan.clear_block_refs() - ds._set_uuid(self._get_uuid()) - - def _reduce_remote_fn(rf: ray.remote_function.RemoteFunction): - # Custom reducer for Ray remote function handles that allows for - # cross-cluster serialization. - # This manually unsets the last export session and job to force re-exporting - # of the function when the handle is deserialized on a new cluster. - # TODO(Clark): Fix this in core Ray, see issue: - # https://github.com/ray-project/ray/issues/24152. - reconstructor, args, state = rf.__reduce__() - state["_last_export_session_and_job"] = None - return reconstructor, args, state - - context = ray._private.worker.global_worker.get_serialization_context() - try: - context._register_cloudpickle_reducer( - ray.remote_function.RemoteFunction, _reduce_remote_fn - ) - serialized = pickle.dumps(ds) - finally: - context._unregister_cloudpickle_reducer(ray.remote_function.RemoteFunction) - return serialized - - @staticmethod - @DeveloperAPI - def deserialize_lineage(serialized_ds: bytes) -> "Datastream": - """ - Deserialize the provided lineage-serialized Datastream. - - This assumes that the provided serialized bytes were serialized using - :py:meth:`Datastream.serialize_lineage`. - - Args: - serialized_ds: The serialized Datastream that we wish to deserialize. 
- - Returns: - A deserialized ``Datastream`` instance. - """ - return pickle.loads(serialized_ds) - - def _divide(self, block_idx: int) -> ("Datastream[T]", "Datastream[T]"): - block_list = self._plan.execute() - left, right = block_list.divide(block_idx) - l_ds = Datastream( - ExecutionPlan( - left, self._plan.stats(), run_by_consumer=block_list._owned_by_consumer - ), - self._epoch, - self._lazy, - ) - r_ds = Datastream( - ExecutionPlan( - right, self._plan.stats(), run_by_consumer=block_list._owned_by_consumer - ), - self._epoch, - self._lazy, - ) - return l_ds, r_ds - - @ConsumptionAPI(if_more_than_read=True, datasource_metadata="schema") - def default_batch_format(self) -> Type: - """Return this datastream's default batch format. - - The default batch format describes what batches of data look like. To learn more - about batch formats, read - :ref:`writing user-defined functions `. - - Examples: - - If your datastream represents a list of Python objects, then the default batch - format is ``list``. - - >>> import ray - >>> ds = ray.data.range(100) - >>> ds # doctest: +SKIP - Datastream(num_blocks=20, num_rows=100, schema=) - >>> ds.default_batch_format() - - >>> next(ds.iter_batches(batch_size=4)) - [0, 1, 2, 3] - - If your datastream contains a single ``numpy.ndarray`` - column named ``__value__`` (as created by :func:`ray.data.from_numpy`), then - the default batch format is ``np.ndarray``. For more information on tensor - formats, read the :ref:`tensor support guide `. - - >>> ds = ray.data.range_tensor(100) - >>> ds # doctest: +SKIP - Datastream(num_blocks=20, num_rows=100, schema={__value__: numpy.ndarray(shape=(1,), dtype=int64)}) - >>> ds.default_batch_format() - - >>> next(ds.iter_batches(batch_size=4)) - array([[0], - [1], - [2], - [3]]) - - If your datastream represents tabular data and doesn't only consist of a - ``__value__`` tensor column (such as is created by - :meth:`ray.data.from_numpy`), then the default batch format is - ``pd.DataFrame``. 
- - >>> import pandas as pd - >>> df = pd.DataFrame({"foo": ["a", "b"], "bar": [0, 1]}) - >>> ds = ray.data.from_pandas(df) - >>> ds # doctest: +SKIP - Datastream(num_blocks=1, num_rows=2, schema={foo: object, bar: int64}) - >>> ds.default_batch_format() - - >>> next(ds.iter_batches(batch_size=4)) - foo bar - 0 a 0 - 1 b 1 - - .. seealso:: - - :meth:`~Datastream.map_batches` - Call this function to transform batches of data. - - :meth:`~Datastream.iter_batches` - Call this function to iterate over batches of data. - - """ # noqa: E501 - import pandas as pd - import pyarrow as pa - - schema = self.schema() - assert isinstance(schema, (type, PandasBlockSchema, pa.Schema)) - - if isinstance(schema, type): - return list - - if isinstance(schema, (PandasBlockSchema, pa.Schema)): - if schema.names == [TENSOR_COLUMN_NAME]: - return np.ndarray - return pd.DataFrame - - @ConsumptionAPI( - if_more_than_read=True, - datasource_metadata="schema", - pattern="for the first block.", - insert_after=True, - ) - @Deprecated(message="`dataset_format` is deprecated for streaming execution.") - def dataset_format(self) -> BlockFormat: - """The format of the datastream's underlying data blocks. Possible values - are: "arrow", "pandas" and "simple". - - This may block; if the schema is unknown, this will synchronously fetch - the schema for the first block. - """ - context = DataContext.get_current() - if context.use_streaming_executor: - raise DeprecationWarning( - "`dataset_format` is deprecated for streaming execution. To use " - "`dataset_format`, you must explicitly enable bulk execution by " - "setting `use_streaming_executor` to False in the `DataContext`" - ) - - # We need schema to properly validate, so synchronously - # fetch it if necessary. - schema = self.schema(fetch_if_missing=True) - if schema is None: - raise ValueError( - "Datastream is empty or cleared, can't determine the format of " - "the datastream." 
- ) - - try: - import pyarrow as pa - - if isinstance(schema, pa.Schema): - return BlockFormat.ARROW - except ModuleNotFoundError: - pass - from ray.data._internal.pandas_block import PandasBlockSchema - - if isinstance(schema, PandasBlockSchema): - return BlockFormat.PANDAS - return BlockFormat.SIMPLE - - def _aggregate_on( - self, agg_cls: type, on: Optional[Union[KeyFn, List[KeyFn]]], *args, **kwargs - ): - """Helper for aggregating on a particular subset of the datastream. - - This validates the `on` argument, and converts a list of column names - or lambdas to a multi-aggregation. A null `on` results in a - multi-aggregation on all columns for an Arrow Datastream, and a single - aggregation on the entire row for a simple Datastream. - """ - aggs = self._build_multicolumn_aggs(agg_cls, on, *args, **kwargs) - return self.aggregate(*aggs) - - def _build_multicolumn_aggs( - self, - agg_cls: type, - on: Optional[Union[KeyFn, List[KeyFn]]], - ignore_nulls: bool, - *args, - skip_cols: Optional[List[str]] = None, - **kwargs, - ): - """Build set of aggregations for applying a single aggregation to - multiple columns. - """ - # Expand None into an aggregation for each column. - if on is None: - schema = self.schema(fetch_if_missing=True) - if schema is not None and not isinstance(schema, type): - if not skip_cols: - skip_cols = [] - if len(schema.names) > 0: - on = [col for col in schema.names if col not in skip_cols] - - if not isinstance(on, list): - on = [on] - return [agg_cls(on_, *args, ignore_nulls=ignore_nulls, **kwargs) for on_ in on] - - def _aggregate_result(self, result: Union[Tuple, TableRow]) -> U: - if result is not None and len(result) == 1: - if isinstance(result, tuple): - return result[0] - else: - # NOTE (kfstorm): We cannot call `result[0]` directly on - # `PandasRow` because indexing a column with position is not - # supported by pandas. 
- return list(result.values())[0] - else: - return result - - @ensure_notebook_deps( - ["ipywidgets", "8"], - ) - @fallback_if_colab - def _ipython_display_(self): - from ipywidgets import HTML, VBox, Layout - from IPython.display import display - - title = HTML(f"

    {self.__class__.__name__}

    ") - tab = self._tab_repr_() - - if tab: - display(VBox([title, tab], layout=Layout(width="100%"))) - - @ensure_notebook_deps( - ["tabulate", None], - ["ipywidgets", "8"], - ) - def _tab_repr_(self): - from tabulate import tabulate - from ipywidgets import Tab, HTML - - metadata = { - "num_blocks": self._plan.initial_num_blocks(), - "num_rows": self._meta_count(), - } - # Show metadata if available, but don't trigger execution. - schema = self.schema(fetch_if_missing=False) - if schema is None: - schema_repr = Template("rendered_html_common.html.j2").render( - content="
    Unknown schema
    " - ) - elif isinstance(schema, type): - schema_repr = Template("rendered_html_common.html.j2").render( - content=f"
    Data type: {html.escape(str(schema))}
    " - ) - else: - schema_data = {} - for sname, stype in zip(schema.names, schema.types): - schema_data[sname] = getattr(stype, "__name__", str(stype)) - - schema_repr = Template("scrollableTable.html.j2").render( - table=tabulate( - tabular_data=schema_data.items(), - tablefmt="html", - showindex=False, - headers=["Name", "Type"], - ), - max_height="300px", - ) - - children = [] - children.append( - HTML( - Template("scrollableTable.html.j2").render( - table=tabulate( - tabular_data=metadata.items(), - tablefmt="html", - showindex=False, - headers=["Field", "Value"], - ), - max_height="300px", - ) - ) - ) - children.append(HTML(schema_repr)) - return Tab(children, titles=["Metadata", "Schema"]) - - def __repr__(self) -> str: - return self._plan.get_plan_as_string(self.__class__.__name__) - - def __str__(self) -> str: - return repr(self) - - def __bool__(self) -> bool: - # Prevents `__len__` from being called to check if it is None - # see: issue #25152 - return True - - def __len__(self) -> int: - raise AttributeError( - "Use `ds.count()` to compute the length of a distributed Datastream. " - "This may be an expensive operation." - ) - - def __iter__(self): - raise TypeError( - "`Datastream` objects aren't iterable. To iterate records, call " - "`ds.iter_rows()` or `ds.iter_batches()`. For more information, read " - "https://docs.ray.io/en/latest/data/consuming-datasets.html." 
- ) - - def _block_num_rows(self) -> List[int]: - get_num_rows = cached_remote_fn(_get_num_rows) - return ray.get([get_num_rows.remote(b) for b in self.get_internal_block_refs()]) - - def _block_size_bytes(self) -> List[int]: - get_size_bytes = cached_remote_fn(_get_size_bytes) - return ray.get( - [get_size_bytes.remote(b) for b in self.get_internal_block_refs()] - ) - - def _meta_count(self) -> Optional[int]: - return self._plan.meta_count() - - def _get_uuid(self) -> str: - return self._uuid - - def _set_uuid(self, uuid: str) -> None: - self._uuid = uuid - - def _get_epoch(self) -> int: - return self._epoch - - def _set_epoch(self, epoch: int) -> None: - self._epoch = epoch - - def _warn_slow(self): - if ray.util.log_once("datastream_slow_warned"): - logger.warning( - "The `map`, `flat_map`, and `filter` operations are unvectorized and " - "can be very slow. Consider using `.map_batches()` instead." - ) - - def _synchronize_progress_bar(self): - """Flush progress bar output by shutting down the current executor. - - This should be called at the end of all blocking APIs (e.g., `take`), but not - async APIs (e.g., `iter_batches`). - - The streaming executor runs in a separate generator / thread, so it is - possible the shutdown logic runs even after a call to retrieve rows from the - stream has finished. Explicit shutdown avoids this, which can clobber console - output (https://github.com/ray-project/ray/issues/32414). - """ - if self._current_executor: - self._current_executor.shutdown() - self._current_executor = None - - def __getstate__(self): - # Note: excludes _current_executor which is not serializable. 
- return { - "plan": self._plan, - "uuid": self._uuid, - "epoch": self._epoch, - "lazy": self._lazy, - "logical_plan": self._logical_plan, - } - - def __setstate__(self, state): - self._plan = state["plan"] - self._uuid = state["uuid"] - self._epoch = state["epoch"] - self._lazy = state["lazy"] - self._logical_plan = state["logical_plan"] - self._current_executor = None - - -# Backwards compatibility alias. -Dataset = Datastream - - -@PublicAPI -class MaterializedDatastream(Datastream, Generic[T]): - """A Datastream materialized in Ray memory, e.g., via `.materialize()`. - - The blocks of a MaterializedDatastream object are materialized into Ray object store - memory, which means that this class can be shared or iterated over by multiple Ray - tasks without re-executing the underlying computations for producing the stream. - """ - - pass - - -@PublicAPI(stability="beta") -class Schema: - """Datastream schema. - Attributes: - names: List of column names of this Datastream. - base_schema: The underlying Arrow or Pandas schema. - """ - - def __init__(self, base_schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"]): - self.base_schema = base_schema - - @property - def names(self) -> List[str]: - """Lists the columns of this Datastream.""" - return self.base_schema.names - - def __str__(self): - # TODO(ekl) we should canonicalize Pandas vs Pyarrow dtypes, which will be - # possible one we support Python objects in Arrow via an extension type. 
- return f"Schema({dict(zip(self.base_schema.names, self.base_schema.types))})" - - def __repr__(self): - return str(self) - - -def _get_size_bytes(block: Block) -> int: - block = BlockAccessor.for_block(block) - return block.size_bytes() - - -def _block_to_df(block: Block): - block = BlockAccessor.for_block(block) - return block.to_pandas() - - -def _block_to_ndarray(block: Block, column: Optional[str]): - block = BlockAccessor.for_block(block) - return block.to_numpy(column) - - -def _block_to_arrow(block: Block): - block = BlockAccessor.for_block(block) - return block.to_arrow() - - -def _sliding_window(iterable: Iterable, n: int): - """Creates an iterator consisting of n-width sliding windows over - iterable. The sliding windows are constructed lazily such that an - element on the base iterator (iterable) isn't consumed until the - first sliding window containing that element is reached. - - If n > len(iterable), then a single len(iterable) window is - returned. - - Args: - iterable: The iterable on which the sliding window will be - created. - n: The width of the sliding window. - - Returns: - An iterator of n-width windows over iterable. - If n > len(iterable), then a single len(iterable) window is - returned. 
- """ - it = iter(iterable) - window = collections.deque(itertools.islice(it, n), maxlen=n) - if len(window) > 0: - yield tuple(window) - for elem in it: - window.append(elem) - yield tuple(window) - - -def _do_write( - ds: Datasource, - ctx: DataContext, - blocks: List[Block], - meta: List[BlockMetadata], - ray_remote_args: Dict[str, Any], - write_args: Dict[str, Any], -) -> List[ObjectRef[WriteResult]]: - write_args = _unwrap_arrow_serialization_workaround(write_args) - DataContext._set_current(ctx) - return ds.do_write(blocks, meta, ray_remote_args=ray_remote_args, **write_args) diff --git a/python/ray/data/examples/demo_infer.py b/python/ray/data/examples/demo_infer.py index 3815d23d2a62..22b3f8c9aaf7 100644 --- a/python/ray/data/examples/demo_infer.py +++ b/python/ray/data/examples/demo_infer.py @@ -22,7 +22,7 @@ def __call__(self, x): ds = ( ds.window(blocks_per_window=10) .map(preprocess) - .map(Model, compute="actors", num_gpus=1) + .map(Model, compute=ray.data.ActorPoolStrategy(), num_gpus=1) ) for x in ds.iter_rows(): diff --git a/python/ray/data/grouped_dataset.py b/python/ray/data/grouped_data.py similarity index 59% rename from python/ray/data/grouped_dataset.py rename to python/ray/data/grouped_data.py index 23cb30533fc5..0823fd419d79 100644 --- a/python/ray/data/grouped_dataset.py +++ b/python/ray/data/grouped_data.py @@ -1,7 +1,7 @@ -from typing import Any, Callable, Generic, List, Tuple, Union, Optional +from typing import List, Tuple, Union, Optional from ray.data._internal import sort -from ray.data._internal.compute import CallableClass, ComputeStrategy +from ray.data._internal.compute import ComputeStrategy from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.logical.interfaces import LogicalPlan @@ -25,13 +25,11 @@ BlockAccessor, BlockExecStats, BlockMetadata, - KeyFn, KeyType, - T, - U, + UserDefinedFunction, ) from 
ray.data.context import DataContext -from ray.data.datastream import DataBatch, Datastream +from ray.data.dataset import DataBatch, Dataset from ray.util.annotations import PublicAPI @@ -42,7 +40,7 @@ def map( block: Block, output_num_blocks: int, boundaries: List[KeyType], - key: KeyFn, + key: str, aggs: Tuple[AggregateFn], ) -> List[Union[BlockMetadata, Block]]: """Partition the block and combine rows with the same key.""" @@ -66,7 +64,7 @@ def map( @staticmethod def reduce( - key: KeyFn, + key: str, aggs: Tuple[AggregateFn], *mapper_outputs: List[Block], partial_reduce: bool = False, @@ -79,7 +77,7 @@ def reduce( @staticmethod def _prune_unused_columns( block: Block, - key: KeyFn, + key: str, aggs: Tuple[AggregateFn], ) -> Block: """Prune unused columns from block before aggregate.""" @@ -118,66 +116,35 @@ class PushBasedGroupbyOp(_GroupbyOp, PushBasedShufflePlan): @PublicAPI -class GroupedData(Generic[T]): - """Represents a grouped datastream created by calling ``Datastream.groupby()``. +class GroupedData: + """Represents a grouped dataset created by calling ``Dataset.groupby()``. The actual groupby is deferred until an aggregation is applied. """ - def __init__(self, datastream: Datastream[T], key: KeyFn): - """Construct a datastream grouped by key (internal API). + def __init__(self, dataset: Dataset, key: str): + """Construct a dataset grouped by key (internal API). The constructor is not part of the GroupedData API. - Use the ``Datastream.groupby()`` method to construct one. + Use the ``Dataset.groupby()`` method to construct one. 
""" - self._datastream = datastream + self._dataset = dataset self._key = key def __repr__(self) -> str: return ( - f"{self.__class__.__name__}(datastream={self._datastream}, " - f"key={self._key!r})" + f"{self.__class__.__name__}(dataset={self._dataset}, " f"key={self._key!r})" ) - def aggregate(self, *aggs: AggregateFn) -> Datastream[U]: + def aggregate(self, *aggs: AggregateFn) -> Dataset: """Implements an accumulator-based aggregation. - Examples: - - .. testcode:: - - import ray - from ray.data.aggregate import AggregateFn - ds = ray.data.range(100) - grouped_ds = ds.groupby(lambda x: x % 3) - result = grouped_ds.aggregate(AggregateFn( - init=lambda k: [], - accumulate_row=lambda a, r: a + [r], - merge=lambda a1, a2: a1 + a2, - finalize=lambda a: sorted(a) - )) - result.show() - - .. testoutput:: - - (0, [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, \ -51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99]) - (1, [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49, \ -52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97]) - (2, [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, \ -53, 56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98]) - - Args: aggs: Aggregations to do. Returns: - If the input datastream is simple datastream then the output is a simple - datastream of ``(k, v_1, ..., v_n)`` tuples where ``k`` is the groupby - key and ``v_i`` is the result of the ith given aggregation. - If the input datastream is an Arrow datastream then the output is an - Arrow datastream of ``n + 1`` columns where the first column is the - groupby key and the second through ``n + 1`` columns are the + The output is an dataset of ``n + 1`` columns where the first column + is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. If groupby key is ``None`` then the key part of return is omitted. 
""" @@ -188,8 +155,8 @@ def do_agg(blocks, task_ctx: TaskContext, clear_input_blocks: bool, *_): if len(aggs) == 0: raise ValueError("Aggregate requires at least one aggregation") for agg in aggs: - agg._validate(self._datastream.schema(fetch_if_missing=True)) - # Handle empty datastream. + agg._validate(self._dataset.schema(fetch_if_missing=True)) + # Handle empty dataset. if blocks.initial_num_blocks() == 0: return blocks, stage_info @@ -222,7 +189,7 @@ def do_agg(blocks, task_ctx: TaskContext, clear_input_blocks: bool, *_): ctx=task_ctx, ) - plan = self._datastream._plan.with_stage( + plan = self._dataset._plan.with_stage( AllToAllStage( "Aggregate", None, @@ -231,7 +198,7 @@ def do_agg(blocks, task_ctx: TaskContext, clear_input_blocks: bool, *_): ) ) - logical_plan = self._datastream._logical_plan + logical_plan = self._dataset._logical_plan if logical_plan is not None: op = Aggregate( logical_plan.dag, @@ -239,44 +206,42 @@ def do_agg(blocks, task_ctx: TaskContext, clear_input_blocks: bool, *_): aggs=aggs, ) logical_plan = LogicalPlan(op) - return Datastream( + return Dataset( plan, - self._datastream._epoch, - self._datastream._lazy, + self._dataset._epoch, + self._dataset._lazy, logical_plan, ) def _aggregate_on( self, agg_cls: type, - on: Union[KeyFn, List[KeyFn]], + on: Union[str, List[str]], ignore_nulls: bool, *args, **kwargs, ): - """Helper for aggregating on a particular subset of the datastream. + """Helper for aggregating on a particular subset of the dataset. This validates the `on` argument, and converts a list of column names - or lambdas to a multi-aggregation. A null `on` results in a - multi-aggregation on all columns for an Arrow Datastream, and a single - aggregation on the entire row for a simple Datastream. + to a multi-aggregation. A null `on` results in a + multi-aggregation on all columns for an Arrow Dataset, and a single + aggregation on the entire row for a simple Dataset. 
""" - aggs = self._datastream._build_multicolumn_aggs( + aggs = self._dataset._build_multicolumn_aggs( agg_cls, on, ignore_nulls, *args, skip_cols=self._key, **kwargs ) return self.aggregate(*aggs) def map_groups( self, - fn: Union[CallableClass, Callable[[DataBatch], DataBatch]], + fn: UserDefinedFunction[DataBatch, DataBatch], *, compute: Union[str, ComputeStrategy] = None, batch_format: Optional[str] = "default", **ray_remote_args, - ) -> "Datastream[Any]": - # TODO AttributeError: 'GroupedData' object has no attribute 'map_groups' - # in the example below. - """Apply the given function to each group of records of this datastream. + ) -> "Dataset": + """Apply the given function to each group of records of this dataset. While map_groups() is very flexible, note that it comes with downsides: * It may be slower than using more specific methods such as min(), max(). @@ -290,11 +255,6 @@ def map_groups( >>> import ray >>> import pandas as pd >>> import numpy as np - >>> # Get median per group. Note that median is not an associative - >>> # function so cannot be computed with aggregate(). - >>> ds = ray.data.range(100) # doctest: +SKIP - >>> ds.groupby(lambda x: x % 3).map_groups( # doctest: +SKIP - ... lambda x: [np.median(x)]) >>> # Get first value per group. >>> ds = ray.data.from_items([ # doctest: +SKIP ... {"group": 1, "value": 1}, @@ -302,7 +262,7 @@ def map_groups( ... {"group": 2, "value": 3}, ... {"group": 2, "value": 4}]) >>> ds.groupby("group").map_groups( # doctest: +SKIP - ... lambda g: [g["value"][0]]) + ... lambda g: {"result": np.array([g["value"][0]])}) >>> # Return multiple records per group (dataframe in, dataframe out). >>> df = pd.DataFrame( @@ -326,12 +286,10 @@ def map_groups( pool, or ``ray.data.ActorPoolStrategy(min_size=m, max_size=n)`` for an autoscaling actor pool. 
batch_format: Specify ``"default"`` to use the default block format - (promotes tables to Pandas and tensors to NumPy), ``"pandas"`` to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or - ``"numpy"`` to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None - to return the underlying block exactly as is with no additional - formatting. The default is "default". + (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to + select ``pyarrow.Table``, or ``"numpy"`` to select + ``Dict[str, numpy.ndarray]``, or None to return the underlying block + exactly as is with no additional formatting. ray_remote_args: Additional resource requirements to request from ray (e.g., num_gpus=1 to request GPUs for the map tasks). @@ -343,9 +301,9 @@ def map_groups( # Note that sort() will ensure that records of the same key partitioned # into the same block. if self._key is not None: - sorted_ds = self._datastream.sort(self._key) + sorted_ds = self._dataset.sort(self._key) else: - sorted_ds = self._datastream.repartition(1) + sorted_ds = self._dataset.repartition(1) # Returns the group boundaries. def get_key_boundaries(block_accessor: BlockAccessor): @@ -395,38 +353,35 @@ def group_fn(batch): **ray_remote_args, ) - def count(self) -> Datastream[U]: + def count(self) -> Dataset: """Compute count aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).count() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": x % 3, "B": x} for x in range(100)]).groupby( # doctest: +SKIP ... "A").count() # doctest: +SKIP Returns: - A simple datastream of ``(k, v)`` pairs or an Arrow datastream of - ``[k, v]`` columns where ``k`` is the groupby key and ``v`` is the - number of rows with that key. + A dataset of ``[k, v]`` columns where ``k`` is the groupby key and + ``v`` is the number of rows with that key. 
If groupby key is ``None`` then the key part of return is omitted. """ return self.aggregate(Count()) def sum( - self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True - ) -> Datastream[U]: + self, on: Union[str, List[str]] = None, ignore_nulls: bool = True + ) -> Dataset: r"""Compute grouped sum aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).sum() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... (i % 3, i, i**2) # doctest: +SKIP ... for i in range(100)]) \ # doctest: +SKIP ... .groupby(lambda x: x[0] % 3) \ # doctest: +SKIP ... .sum(lambda x: x[2]) # doctest: +SKIP - >>> ray.data.range_table(100).groupby("value").sum() # doctest: +SKIP + >>> ray.data.range(100).groupby("id").sum() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP ... for i in range(100)]) \ # doctest: +SKIP @@ -434,13 +389,7 @@ def sum( ... .sum(["B", "C"]) # doctest: +SKIP Args: - on: The data subset on which to compute the sum. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to take a sum of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to do a column-wise sum of all - columns. + on: a column name or a list of column names to aggregate. ignore_nulls: Whether to ignore null values. If ``True``, null values will be ignored when computing the sum; if ``False``, if a null value is encountered, the output will be null. @@ -450,21 +399,12 @@ def sum( Returns: The sum result. - For a simple datastream, the output is: - - - ``on=None``: a simple datastream of ``(k, sum)`` tuples where ``k`` - is the groupby key and ``sum`` is sum of all rows in that group. 
- - ``on=[callable_1, ..., callable_n]``: a simple datastream of - ``(k, sum_1, ..., sum_n)`` tuples where ``k`` is the groupby key - and ``sum_i`` is sum of the outputs of the ith callable called on - each row in that group. - - For an Arrow datastream, the output is: + For different values of ``on``, the return varies: - - ``on=None``: an Arrow datastream containing a groupby key column, + - ``on=None``: a dataset containing a groupby key column, ``"k"``, and a column-wise sum column for each original column - in the datastream. - - ``on=["col_1", ..., "col_n"]``: an Arrow datastream of ``n + 1`` + in the dataset. + - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` columns where the first column is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. @@ -473,19 +413,13 @@ def sum( return self._aggregate_on(Sum, on, ignore_nulls) def min( - self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True - ) -> Datastream[U]: + self, on: Union[str, List[str]] = None, ignore_nulls: bool = True + ) -> Dataset: """Compute grouped min aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).min() # doctest: +SKIP - >>> ray.data.from_items([ # doctest: +SKIP - ... (i % 3, i, i**2) # doctest: +SKIP - ... for i in range(100)]) \ # doctest: +SKIP - ... .groupby(lambda x: x[0] % 3) \ # doctest: +SKIP - ... .min(lambda x: x[2]) # doctest: +SKIP - >>> ray.data.range_table(100).groupby("value").min() # doctest: +SKIP + >>> ray.data.range(100).groupby("id").min() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP ... for i in range(100)]) \ # doctest: +SKIP @@ -493,13 +427,7 @@ def min( ... .min(["B", "C"]) # doctest: +SKIP Args: - on: The data subset on which to compute the min. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to take a min of all rows. 
- - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to do a column-wise min of all - columns. + on: a column name or a list of column names to aggregate. ignore_nulls: Whether to ignore null values. If ``True``, null values will be ignored when computing the min; if ``False``, if a null value is encountered, the output will be null. @@ -509,21 +437,12 @@ def min( Returns: The min result. - For a simple datastream, the output is: + For different values of ``on``, the return varies: - - ``on=None``: a simple datastream of ``(k, min)`` tuples where ``k`` - is the groupby key and min is min of all rows in that group. - - ``on=[callable_1, ..., callable_n]``: a simple datastream of - ``(k, min_1, ..., min_n)`` tuples where ``k`` is the groupby key - and ``min_i`` is min of the outputs of the ith callable called on - each row in that group. - - For an Arrow datastream, the output is: - - - ``on=None``: an Arrow datastream containing a groupby key column, + - ``on=None``: a dataset containing a groupby key column, ``"k"``, and a column-wise min column for each original column in - the datastream. - - ``on=["col_1", ..., "col_n"]``: an Arrow datastream of ``n + 1`` + the dataset. + - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` columns where the first column is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. @@ -532,19 +451,13 @@ def min( return self._aggregate_on(Min, on, ignore_nulls) def max( - self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True - ) -> Datastream[U]: + self, on: Union[str, List[str]] = None, ignore_nulls: bool = True + ) -> Dataset: """Compute grouped max aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).max() # doctest: +SKIP - >>> ray.data.from_items([ # doctest: +SKIP - ... (i % 3, i, i**2) # doctest: +SKIP - ... for i in range(100)]) \ # doctest: +SKIP - ... 
.groupby(lambda x: x[0] % 3) \ # doctest: +SKIP - ... .max(lambda x: x[2]) # doctest: +SKIP - >>> ray.data.range_table(100).groupby("value").max() # doctest: +SKIP + >>> ray.data.range(100).groupby("id").max() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP ... for i in range(100)]) \ # doctest: +SKIP @@ -552,13 +465,7 @@ def max( ... .max(["B", "C"]) # doctest: +SKIP Args: - on: The data subset on which to compute the max. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to take a max of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to do a column-wise max of all - columns. + on: a column name or a list of column names to aggregate. ignore_nulls: Whether to ignore null values. If ``True``, null values will be ignored when computing the max; if ``False``, if a null value is encountered, the output will be null. @@ -568,21 +475,12 @@ def max( Returns: The max result. - For a simple datastream, the output is: - - - ``on=None``: a simple datastream of ``(k, max)`` tuples where ``k`` - is the groupby key and ``max`` is max of all rows in that group. - - ``on=[callable_1, ..., callable_n]``: a simple datastream of - ``(k, max_1, ..., max_n)`` tuples where ``k`` is the groupby key - and ``max_i`` is max of the outputs of the ith callable called on - each row in that group. - - For an Arrow datastream, the output is: + For different values of ``on``, the return varies: - - ``on=None``: an Arrow datastream containing a groupby key column, + - ``on=None``: a dataset containing a groupby key column, ``"k"``, and a column-wise max column for each original column in - the datastream. - - ``on=["col_1", ..., "col_n"]``: an Arrow datastream of ``n + 1`` + the dataset. 
+ - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` columns where the first column is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. @@ -591,19 +489,13 @@ def max( return self._aggregate_on(Max, on, ignore_nulls) def mean( - self, on: Union[KeyFn, List[KeyFn]] = None, ignore_nulls: bool = True - ) -> Datastream[U]: + self, on: Union[str, List[str]] = None, ignore_nulls: bool = True + ) -> Dataset: """Compute grouped mean aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).mean() # doctest: +SKIP - >>> ray.data.from_items([ # doctest: +SKIP - ... (i % 3, i, i**2) # doctest: +SKIP - ... for i in range(100)]) \ # doctest: +SKIP - ... .groupby(lambda x: x[0] % 3) \ # doctest: +SKIP - ... .mean(lambda x: x[2]) # doctest: +SKIP - >>> ray.data.range_table(100).groupby("value").mean() # doctest: +SKIP + >>> ray.data.le(100).groupby("value").mean() # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP ... for i in range(100)]) \ # doctest: +SKIP @@ -611,13 +503,7 @@ def mean( ... .mean(["B", "C"]) # doctest: +SKIP Args: - on: The data subset on which to compute the mean. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to take a mean of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to do a column-wise mean of all - columns. + on: a column name or a list of column names to aggregate. ignore_nulls: Whether to ignore null values. If ``True``, null values will be ignored when computing the mean; if ``False``, if a null value is encountered, the output will be null. @@ -627,22 +513,12 @@ def mean( Returns: The mean result. 
- For a simple datastream, the output is: + For different values of ``on``, the return varies: - - ``on=None``: a simple datastream of ``(k, mean)`` tuples where ``k`` - is the groupby key and ``mean`` is mean of all rows in that - group. - - ``on=[callable_1, ..., callable_n]``: a simple datastream of - ``(k, mean_1, ..., mean_n)`` tuples where ``k`` is the groupby - key and ``mean_i`` is mean of the outputs of the ith callable - called on each row in that group. - - For an Arrow datastream, the output is: - - - ``on=None``: an Arrow datastream containing a groupby key column, + - ``on=None``: a dataset containing a groupby key column, ``"k"``, and a column-wise mean column for each original column - in the datastream. - - ``on=["col_1", ..., "col_n"]``: an Arrow datastream of ``n + 1`` + in the dataset. + - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` columns where the first column is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. @@ -652,21 +528,15 @@ def mean( def std( self, - on: Union[KeyFn, List[KeyFn]] = None, + on: Union[str, List[str]] = None, ddof: int = 1, ignore_nulls: bool = True, - ) -> Datastream[U]: + ) -> Dataset: """Compute grouped standard deviation aggregation. Examples: >>> import ray - >>> ray.data.range(100).groupby(lambda x: x % 3).std() # doctest: +SKIP - >>> ray.data.from_items([ # doctest: +SKIP - ... (i % 3, i, i**2) # doctest: +SKIP - ... for i in range(100)]) \ # doctest: +SKIP - ... .groupby(lambda x: x[0] % 3) \ # doctest: +SKIP - ... .std(lambda x: x[2]) # doctest: +SKIP - >>> ray.data.range_table(100).groupby("value").std(ddof=0) # doctest: +SKIP + >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP ... 
for i in range(100)]) \ # doctest: +SKIP @@ -682,13 +552,7 @@ def std( https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm Args: - on: The data subset on which to compute the std. - - - For a simple datastream: it can be a callable or a list thereof, - and the default is to take a std of all rows. - - For an Arrow datastream: it can be a column name or a list - thereof, and the default is to do a column-wise std of all - columns. + on: a column name or a list of column names to aggregate. ddof: Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. ignore_nulls: Whether to ignore null values. If ``True``, null @@ -700,21 +564,12 @@ def std( Returns: The standard deviation result. - For a simple datastream, the output is: - - - ``on=None``: a simple datastream of ``(k, std)`` tuples where ``k`` - is the groupby key and ``std`` is std of all rows in that group. - - ``on=[callable_1, ..., callable_n]``: a simple datastream of - ``(k, std_1, ..., std_n)`` tuples where ``k`` is the groupby key - and ``std_i`` is std of the outputs of the ith callable called on - each row in that group. - - For an Arrow datastream, the output is: + For different values of ``on``, the return varies: - - ``on=None``: an Arrow datastream containing a groupby key column, + - ``on=None``: a dataset containing a groupby key column, ``"k"``, and a column-wise std column for each original column in - the datastream. - - ``on=["col_1", ..., "col_n"]``: an Arrow datastream of ``n + 1`` + the dataset. + - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` columns where the first column is the groupby key and the second through ``n + 1`` columns are the results of the aggregations. 
diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py index f7841673cf03..41e18f5d8347 100644 --- a/python/ray/data/iterator.py +++ b/python/ray/data/iterator.py @@ -14,25 +14,23 @@ ) from ray.types import ObjectRef -from ray.data.block import BlockAccessor, Block, BlockMetadata, DataBatch, T +from ray.data.block import BlockAccessor, Block, BlockMetadata, DataBatch from ray.data.context import DataContext -from ray.data.row import TableRow from ray.util.annotations import PublicAPI from ray.data._internal.block_batching import batch_block_refs from ray.data._internal.block_batching.iter_batches import iter_batches -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats from ray.data._internal.util import _is_tensor_schema if TYPE_CHECKING: - import pyarrow import tensorflow as tf import torch from ray.data._internal.torch_iterable_dataset import TorchTensorBatchType - from ray.data.datastream import TensorFlowTensorBatchType + from ray.data.dataset import TensorFlowTensorBatchType, Schema -def _is_tensor_datastream(schema) -> bool: - """Return ``True`` if this is an iterator over a tensor datastream.""" +def _is_tensor_dataset(schema) -> bool: + """Return ``True`` if this is an iterator over a tensor dataset.""" if schema is None or isinstance(schema, type): return False return _is_tensor_schema(schema.names) @@ -40,13 +38,13 @@ def _is_tensor_datastream(schema) -> bool: @PublicAPI(stability="beta") class DataIterator(abc.ABC): - """An iterator for reading items from a :class:`~Datastream` or + """An iterator for reading records from a :class:`~Dataset` or :class:`~DatasetPipeline`. - For Datastreams, each iteration call represents a complete read of all items in the - Datastream. For DatasetPipelines, each iteration call represents one pass (epoch) - over the base Datastream. 
Note that for DatasetPipelines, each pass iterates over - the original Datastream, instead of a window (if ``.window()`` was used). + For Datasets, each iteration call represents a complete read of all items in the + Dataset. For DatasetPipelines, each iteration call represents one pass (epoch) + over the base Dataset. Note that for DatasetPipelines, each pass iterates over + the original Dataset, instead of a window (if ``.window()`` was used). If using Ray AIR, each trainer actor should get its own iterator by calling :meth:`session.get_dataset_shard("train") @@ -56,19 +54,15 @@ class DataIterator(abc.ABC): >>> import ray >>> ds = ray.data.range(5) >>> ds - Datastream(num_blocks=5, num_rows=5, schema=) + Dataset(num_blocks=5, num_rows=5, schema={id: int64}) >>> ds.iterator() - DataIterator(Datastream(num_blocks=5, num_rows=5, schema=)) - >>> ds = ds.repeat(); ds - DatasetPipeline(num_windows=inf, num_stages=2) - >>> ds.iterator() - DataIterator(DatasetPipeline(num_windows=inf, num_stages=2)) + DataIterator(Dataset(num_blocks=5, num_rows=5, schema={id: int64})) .. tip:: For debugging purposes, use :meth:`~ray.air.util.check_ingest.make_local_dataset_iterator` to create a - local `DataIterator` from a :class:`~ray.data.Datastream`, a - :class:`~ray.data.Preprocessor`, and a :class:`~ray.air.DatastreamConfig`. + local `DataIterator` from a :class:`~ray.data.Dataset`, a + :class:`~ray.data.Preprocessor`, and a :class:`~ray.air.DatasetConfig`. """ @abc.abstractmethod @@ -76,7 +70,7 @@ def _to_block_iterator( self, ) -> Tuple[ Iterator[Tuple[ObjectRef[Block], BlockMetadata]], - Optional[DatastreamStats], + Optional[DatasetStats], bool, ]: """Returns the iterator to use for `iter_batches`. @@ -84,7 +78,7 @@ def _to_block_iterator( Returns: A tuple. The first item of the tuple is an iterator over pairs of Block object references and their corresponding metadata. The second item of the - tuple is a DatastreamStats object used for recording stats during iteration. 
+ tuple is a DatasetStats object used for recording stats during iteration. The third item is a boolean indicating if the blocks can be safely cleared after use. """ @@ -103,7 +97,7 @@ def iter_batches( # Deprecated. prefetch_blocks: int = 0, ) -> Iterator[DataBatch]: - """Return a local batched iterator over the datastream. + """Return a local batched iterator over the dataset. Examples: >>> import ray @@ -125,14 +119,11 @@ def iter_batches( as batches (blocks may contain different number of rows). The final batch may include fewer than ``batch_size`` rows if ``drop_last`` is ``False``. Defaults to 256. - batch_format: The format in which to return each batch. - Specify "default" to use the default block format (promoting - tables to Pandas and tensors to NumPy), "pandas" to select - ``pandas.DataFrame``, "pyarrow" to select ``pyarrow.Table``, or "numpy" - to select ``numpy.ndarray`` for tensor datastreams and - ``Dict[str, numpy.ndarray]`` for tabular datastreams, or None to return - the underlying block exactly as is with no additional formatting. - The default is "default". + batch_format: Specify ``"default"`` to use the default block format + (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to + select ``pyarrow.Table``, or ``"numpy"`` to select + ``Dict[str, numpy.ndarray]``, or None to return the underlying block + exactly as is with no additional formatting. drop_last: Whether to drop the last batch if it's incomplete. local_shuffle_buffer_size: If non-None, the data will be randomly shuffled using a local in-memory shuffle buffer, and this value will serve as the @@ -200,17 +191,17 @@ def drop_metadata(block_iterator): if stats: stats.iter_total_s.add(time.perf_counter() - time_start) - def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]]: - """Return a local row iterator over the datastream. 
+ def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Dict[str, Any]]: + """Return a local row iterator over the dataset. - If the datastream is a tabular datastream (Arrow/Pandas blocks), dict-like - mappings :py:class:`~ray.data.row.TableRow` are yielded for each row by the - iterator. If the datastream is not tabular, the raw row is yielded. + If the dataset is a tabular dataset (Arrow/Pandas blocks), dicts + are yielded for each row by the iterator. If the dataset is not tabular, + the raw row is yielded. Examples: >>> import ray - >>> datastream = ray.data.range(10) - >>> next(iter(datastream.iterator().iter_rows())) + >>> dataset = ray.data.range(10) + >>> next(iter(dataset.iterator().iter_rows())) 0 Time complexity: O(1) @@ -220,7 +211,7 @@ def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]] current block during the scan. Returns: - An iterator over rows of the datastream. + An iterator over rows of the dataset. """ iter_batch_args = {"batch_size": None, "batch_format": None} @@ -233,7 +224,7 @@ def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]] for batch in self.iter_batches(**iter_batch_args): batch = BlockAccessor.for_block(BlockAccessor.batch_to_block(batch)) - for row in batch.iter_rows(): + for row in batch.iter_rows(public_row_format=True): yield row @abc.abstractmethod @@ -242,8 +233,8 @@ def stats(self) -> str: raise NotImplementedError @abc.abstractmethod - def schema(self) -> Union[type, "pyarrow.lib.Schema"]: - """Return the schema of the datastream iterated over.""" + def schema(self) -> "Schema": + """Return the schema of the dataset iterated over.""" raise NotImplementedError def iter_torch_batches( @@ -262,9 +253,9 @@ def iter_torch_batches( # Deprecated. prefetch_blocks: int = 0, ) -> Iterator["TorchTensorBatchType"]: - """Return a local batched iterator of Torch Tensors over the datastream. + """Return a local batched iterator of Torch Tensors over the dataset. 
- This iterator will yield single-tensor batches if the underlying datastream + This iterator will yield single-tensor batches if the underlying dataset consists of a single column; otherwise, it will yield a dictionary of column-tensors. If looking for more flexibility in the tensor conversion (e.g. casting dtypes) or the batch format, try using `.iter_batches` directly. @@ -363,15 +354,15 @@ def iter_tf_batches( # Deprecated. prefetch_blocks: int = 0, ) -> Iterator["TensorFlowTensorBatchType"]: - """Return a local batched iterator of TensorFlow Tensors over the datastream. + """Return a local batched iterator of TensorFlow Tensors over the dataset. - This iterator will yield single-tensor batches of the underlying datastream + This iterator will yield single-tensor batches of the underlying dataset consists of a single column; otherwise, it will yield a dictionary of column-tensors. .. tip:: If you don't need the additional flexibility provided by this method, - consider using :meth:`~ray.data.Datastream.to_tf` instead. It's easier + consider using :meth:`~ray.data.Dataset.to_tf` instead. It's easier to use. Examples: @@ -449,9 +440,9 @@ def to_torch( # Deprecated. prefetch_blocks: int = 0, ) -> "torch.utils.data.IterableDataset": - """Return a Torch IterableDataset over this datastream. + """Return a Torch IterableDataset over this dataset. - This is only supported for datastreams convertible to Arrow records. + This is only supported for datasets convertible to Arrow records. It is recommended to use the returned ``IterableDataset`` directly instead of passing it into a torch ``DataLoader``. @@ -481,10 +472,10 @@ def to_torch( If ``unsqueeze_label_tensor=True`` (default), the label tensor will be of shape (N, 1). Otherwise, it will be of shape (N,). 
If ``label_column`` is specified as ``None``, then no column from the - ``Datastream`` will be treated as the label, and the output label tensor + ``Dataset`` will be treated as the label, and the output label tensor will be ``None``. - Note that you probably want to call ``.split()`` on this datastream if + Note that you probably want to call ``.split()`` on this dataset if there are to be multiple Torch workers consuming the data. Time complexity: O(1) @@ -515,8 +506,8 @@ def to_torch( prefetching behavior that uses `prefetch_blocks` by setting `use_legacy_iter_batches` to True in the DataContext. drop_last: Set to True to drop the last incomplete batch, - if the datastream size is not divisible by the batch size. If - False and the size of datastream is not divisible by the batch + if the dataset size is not divisible by the batch size. If + False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Defaults to False. local_shuffle_buffer_size: If non-None, the data will be randomly shuffled using a local in-memory shuffle buffer, and this value will serve as the @@ -637,12 +628,11 @@ def to_tf( # Deprecated. prefetch_blocks: int = 0, ) -> "tf.data.Dataset": - """Return a TF Dataset over this datastream. + """Return a TF Dataset over this dataset. .. warning:: - If your datastream contains ragged tensors, this method errors. To prevent - errors, resize tensors or - :ref:`disable tensor extension casting `. + If your dataset contains ragged tensors, this method errors. To prevent + errors, :ref:`resize your tensors `. Examples: >>> import ray @@ -650,7 +640,7 @@ def to_tf( ... "s3://anonymous@air-example-data/iris.csv" ... 
) >>> it = ds.iterator(); it - DataIterator(Datastream( + DataIterator(Dataset( num_blocks=1, num_rows=150, schema={ @@ -672,7 +662,7 @@ def to_tf( >>> it.to_tf(["sepal length (cm)", "sepal width (cm)"], "target") # doctest: +SKIP <_OptionsDataset element_spec=({'sepal length (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), 'sepal width (cm)': TensorSpec(shape=(None,), dtype=tf.float64, name='sepal width (cm)')}, TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> - If your datastream contains multiple features but your model accepts a single + If your dataset contains multiple features but your model accepts a single tensor as input, combine features with :class:`~ray.data.preprocessors.Concatenator`. @@ -681,7 +671,7 @@ def to_tf( >>> it = preprocessor.transform(ds).iterator() >>> it DataIterator(Concatenator - +- Datastream( + +- Dataset( num_blocks=1, num_rows=150, schema={ @@ -710,8 +700,8 @@ def to_tf( `use_legacy_iter_batches` to True in the DataContext. batch_size: Record batch size. Defaults to 1. drop_last: Set to True to drop the last incomplete batch, - if the datastream size is not divisible by the batch size. If - False and the size of datastream is not divisible by the batch + if the dataset size is not divisible by the batch size. If + False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Defaults to False. local_shuffle_buffer_size: If non-None, the data will be randomly shuffled using a local in-memory shuffle buffer, and this value will serve as the @@ -739,15 +729,15 @@ def to_tf( schema = self.schema() - if _is_tensor_datastream(schema): + if _is_tensor_dataset(schema): raise NotImplementedError( - "`to_tf` doesn't support single-column tensor datastreams. Call the " + "`to_tf` doesn't support single-column tensor datasets. Call the " "more-flexible `iter_batches` instead." 
) if isinstance(schema, type): raise NotImplementedError( - "`to_tf` doesn't support simple datastreams. Call `map_batches` and " + "`to_tf` doesn't support simple datasets. Call `map_batches` and " "convert your data to a tabular format. Alternatively, call the more-" "flexible `iter_batches` in place of `to_tf`." ) @@ -759,7 +749,7 @@ def validate_column(column: str) -> None: raise ValueError( f"You specified '{column}' in `feature_columns` or " f"`label_columns`, but there's no column named '{column}' in the " - f"datastream. Valid column names are: {valid_columns}." + f"dataset. Valid column names are: {valid_columns}." ) def validate_columns(columns: Union[str, List]) -> None: @@ -810,7 +800,7 @@ def generator(): label_type_spec = get_type_spec(schema, columns=label_columns) output_signature = (feature_type_spec, label_type_spec) - datastream = tf.data.Dataset.from_generator( + dataset = tf.data.Dataset.from_generator( generator, output_signature=output_signature ) @@ -818,7 +808,7 @@ def generator(): options.experimental_distribute.auto_shard_policy = ( tf.data.experimental.AutoShardPolicy.OFF ) - return datastream.with_options(options) + return dataset.with_options(options) def iter_epochs(self, max_epoch: int = -1) -> None: raise DeprecationWarning( diff --git a/python/ray/data/preprocessor.py b/python/ray/data/preprocessor.py index 5d03155f9ff4..859898d64001 100644 --- a/python/ray/data/preprocessor.py +++ b/python/ray/data/preprocessor.py @@ -8,7 +8,7 @@ from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI if TYPE_CHECKING: - from ray.data import Datastream, DatasetPipeline + from ray.data import Dataset, DatasetPipeline import pandas as pd import numpy as np from ray.air.data_batch_type import DataBatchType @@ -25,7 +25,7 @@ class PreprocessorNotFittedException(RuntimeError): class Preprocessor(abc.ABC): """Implements an ML preprocessing operation. 
- Preprocessors are stateful objects that can be fitted against a Datastream and used + Preprocessors are stateful objects that can be fitted against a Dataset and used to transform both local data batches and distributed data. For example, a Normalization preprocessor may calculate the mean and stdev of a field during fitting, and uses these attributes to implement its normalization transform. @@ -70,20 +70,20 @@ def fit_status(self) -> "Preprocessor.FitStatus": @Deprecated def transform_stats(self) -> Optional[str]: - """Return Datastream stats for the most recent transform call, if any.""" + """Return Dataset stats for the most recent transform call, if any.""" raise DeprecationWarning( "`preprocessor.transform_stats()` is no longer supported in Ray 2.4. " - "With Datastream now lazy by default, the stats are only populated " - "after execution. Once the datastream transform is executed, the " - "stats can be accessed directly from the transformed datastream " + "With Dataset now lazy by default, the stats are only populated " + "after execution. Once the dataset transform is executed, the " + "stats can be accessed directly from the transformed dataset " "(`ds.stats()`), or can be viewed in the ray-data.log " "file saved in the Ray logs directory " "(defaults to /tmp/ray/session_{SESSION_ID}/logs/)." ) - def fit(self, ds: "Datastream") -> "Preprocessor": - """Fit this Preprocessor to the Datastream. + def fit(self, ds: "Dataset") -> "Preprocessor": + """Fit this Preprocessor to the Dataset. Fitted state attributes will be directly set in the Preprocessor. @@ -91,7 +91,7 @@ def fit(self, ds: "Datastream") -> "Preprocessor": ``preprocessor.fit(A).fit(B)`` is equivalent to ``preprocessor.fit(B)``. Args: - ds: Input datastream. + ds: Input dataset. Returns: Preprocessor: The fitted Preprocessor with state attributes. 
@@ -113,30 +113,30 @@ def fit(self, ds: "Datastream") -> "Preprocessor": return self._fit(ds) - def fit_transform(self, ds: "Datastream") -> "Datastream": - """Fit this Preprocessor to the Datastream and then transform the Datastream. + def fit_transform(self, ds: "Dataset") -> "Dataset": + """Fit this Preprocessor to the Dataset and then transform the Dataset. Calling it more than once will overwrite all previously fitted state: ``preprocessor.fit_transform(A).fit_transform(B)`` is equivalent to ``preprocessor.fit_transform(B)``. Args: - ds: Input Datastream. + ds: Input Dataset. Returns: - ray.data.Datastream: The transformed Datastream. + ray.data.Dataset: The transformed Dataset. """ self.fit(ds) return self.transform(ds) - def transform(self, ds: "Datastream") -> "Datastream": - """Transform the given datastream. + def transform(self, ds: "Dataset") -> "Dataset": + """Transform the given dataset. Args: - ds: Input Datastream. + ds: Input Dataset. Returns: - ray.data.Datastream: The transformed Datastream. + ray.data.Dataset: The transformed Dataset. Raises: PreprocessorNotFittedException: if ``fit`` is not called yet. @@ -196,8 +196,8 @@ def _transform_pipeline(self, pipeline: "DatasetPipeline") -> "DatasetPipeline": ): raise RuntimeError( "Streaming/pipelined ingest only works with " - "Preprocessors that do not need to be fit on the entire datastream. " - "It is not possible to fit on Datastreams " + "Preprocessors that do not need to be fit on the entire dataset. " + "It is not possible to fit on Datasets " "in a streaming fashion." 
) @@ -213,7 +213,7 @@ def _check_is_fitted(self) -> bool: return bool(fitted_vars) @DeveloperAPI - def _fit(self, ds: "Datastream") -> "Preprocessor": + def _fit(self, ds: "Dataset") -> "Preprocessor": """Sub-classes should override this instead of fit().""" raise NotImplementedError() @@ -247,10 +247,10 @@ def _determine_transform_to_use(self) -> BatchFormat: ) def _transform( - self, ds: Union["Datastream", "DatasetPipeline"] - ) -> Union["Datastream", "DatasetPipeline"]: + self, ds: Union["Dataset", "DatasetPipeline"] + ) -> Union["Dataset", "DatasetPipeline"]: # TODO(matt): Expose `batch_size` or similar configurability. - # The default may be too small for some datastreams and too large for others. + # The default may be too small for some datasets and too large for others. transform_type = self._determine_transform_to_use() # Our user-facing batch format should only be pandas or NumPy, other @@ -271,7 +271,7 @@ def _transform( ) def _get_transform_config(self) -> Dict[str, Any]: - """Returns kwargs to be passed to :meth:`ray.data.Datastream.map_batches`. + """Returns kwargs to be passed to :meth:`ray.data.Dataset.map_batches`. This can be implemented by subclassing preprocessors. """ diff --git a/python/ray/data/preprocessors/batch_mapper.py b/python/ray/data/preprocessors/batch_mapper.py index 4aca660cce90..4479bf3e9478 100644 --- a/python/ray/data/preprocessors/batch_mapper.py +++ b/python/ray/data/preprocessors/batch_mapper.py @@ -18,12 +18,12 @@ @PublicAPI(stability="alpha") class BatchMapper(Preprocessor): - """Apply an arbitrary operation to a datastream. + """Apply an arbitrary operation to a dataset. - :class:`BatchMapper` applies a user-defined function to batches of a datastream. A + :class:`BatchMapper` applies a user-defined function to batches of a dataset. A batch is a Pandas ``DataFrame`` that represents a small amount of data. 
By modifying batches instead of individual records, this class can efficiently transform a - datastream with vectorized operations. + dataset with vectorized operations. Use this preprocessor to apply stateless operations that aren't already built-in. @@ -48,25 +48,25 @@ class BatchMapper(Preprocessor): >>> >>> preprocessor = BatchMapper(fn, batch_format="pandas") >>> preprocessor.transform(ds) # doctest: +SKIP - Datastream(num_blocks=1, num_rows=3, schema={X: int64}) + Dataset(num_blocks=1, num_rows=3, schema={X: int64}) >>> >>> def fn_numpy(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: ... return {"X": batch["X"]} >>> preprocessor = BatchMapper(fn_numpy, batch_format="numpy") >>> preprocessor.transform(ds) # doctest: +SKIP - Datastream(num_blocks=1, num_rows=3, schema={X: int64}) + Dataset(num_blocks=1, num_rows=3, schema={X: int64}) Args: fn: The function to apply to data batches. batch_size: The desired number of rows in each data batch provided to ``fn``. - Semantics are the same as in ```datastream.map_batches()``: specifying + Semantics are the same as in ```dataset.map_batches()``: specifying ``None`` wil use the entire underlying blocks as batches (blocks may contain different number of rows) and the actual size of the batch provided to ``fn`` may be smaller than ``batch_size`` if ``batch_size`` doesn't evenly divide the block(s) sent to a given map task. Defaults to 4096, - which is the same default value as ``datastream.map_batches()``. + which is the same default value as ``dataset.map_batches()``. batch_format: The preferred batch format to use in UDF. If not given, - we will infer based on the input datastream data format. + we will infer based on the input dataset data format. 
""" _is_fittable = False diff --git a/python/ray/data/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py index 4673b73b871e..48349813160a 100644 --- a/python/ray/data/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING, Union from ray.air.util.data_batch_conversion import BatchFormat -from ray.data import Datastream, DatasetPipeline +from ray.data import Dataset, DatasetPipeline from ray.data.preprocessor import Preprocessor from ray.util.annotations import PublicAPI @@ -12,7 +12,7 @@ class Chain(Preprocessor): """Combine multiple preprocessors into a single :py:class:`Preprocessor`. - When you call ``fit``, each preprocessor is fit on the datastream produced by the + When you call ``fit``, each preprocessor is fit on the dataset produced by the preceeding preprocessor's ``fit_transform``. Example: @@ -69,22 +69,22 @@ def fit_status(self): def __init__(self, *preprocessors: Preprocessor): self.preprocessors = preprocessors - def _fit(self, ds: Datastream) -> Preprocessor: + def _fit(self, ds: Dataset) -> Preprocessor: for preprocessor in self.preprocessors[:-1]: ds = preprocessor.fit_transform(ds) self.preprocessors[-1].fit(ds) return self - def fit_transform(self, ds: Datastream) -> Datastream: + def fit_transform(self, ds: Dataset) -> Dataset: for preprocessor in self.preprocessors: ds = preprocessor.fit_transform(ds) return ds def _transform( - self, ds: Union[Datastream, DatasetPipeline] - ) -> Union[Datastream, DatasetPipeline]: + self, ds: Union[Dataset, DatasetPipeline] + ) -> Union[Dataset, DatasetPipeline]: for preprocessor in self.preprocessors: - if isinstance(ds, Datastream): + if isinstance(ds, Dataset): ds = preprocessor.transform(ds) elif isinstance(ds, DatasetPipeline): ds = preprocessor._transform_pipeline(ds) diff --git a/python/ray/data/preprocessors/concatenator.py b/python/ray/data/preprocessors/concatenator.py index cc4bbdd38aa5..31d0621cf7fb 100644 --- 
a/python/ray/data/preprocessors/concatenator.py +++ b/python/ray/data/preprocessors/concatenator.py @@ -51,7 +51,7 @@ class Concatenator(Preprocessor): 2 [1.0, 0.9] Sometimes, you might not want to concatenate all of of the columns in your - datastream. In this case, you can exclude columns with the ``exclude`` parameter. + dataset. In this case, you can exclude columns with the ``exclude`` parameter. >>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9], "Y": ["blue", "orange", "blue"]}) >>> ds = ray.data.from_pandas(df) # doctest: +SKIP @@ -88,7 +88,7 @@ class Concatenator(Preprocessor): >>> concatenator = Concatenator(include=["X0", "X1"], dtype=np.float32) >>> concatenator.fit_transform(ds) # doctest: +SKIP - Datastream(num_blocks=1, num_rows=3, schema={Y: object, concat_out: TensorDtype(shape=(2,), dtype=float32)}) + Dataset(num_blocks=1, num_rows=3, schema={Y: object, concat_out: TensorDtype(shape=(2,), dtype=float32)}) Args: output_column_name: The desired name for the new column. @@ -106,7 +106,7 @@ class Concatenator(Preprocessor): Raises: ValueError: if `raise_if_missing` is `True` and a column in `include` or - `exclude` doesn't exist in the datastream. + `exclude` doesn't exist in the dataset. 
""" # noqa: E501 _is_fittable = False diff --git a/python/ray/data/preprocessors/discretizer.py b/python/ray/data/preprocessors/discretizer.py index 0ef294b7a8c1..36e34f805ef1 100644 --- a/python/ray/data/preprocessors/discretizer.py +++ b/python/ray/data/preprocessors/discretizer.py @@ -3,7 +3,7 @@ import pandas as pd import numpy as np -from ray.data import Datastream +from ray.data import Dataset from ray.data.aggregate import Max, Min from ray.data.preprocessor import Preprocessor from ray.util.annotations import PublicAPI @@ -253,7 +253,7 @@ def __init__( self.duplicates = duplicates self.dtypes = dtypes - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: self._validate_on_fit() stats = {} aggregates = [] @@ -267,7 +267,7 @@ def _fit(self, datastream: Datastream) -> Preprocessor: self._fit_uniform_covert_bin_to_aggregate_if_needed(column) ) - aggregate_stats = datastream.aggregate(*aggregates) + aggregate_stats = dataset.aggregate(*aggregates) mins = {} maxes = {} for key, value in aggregate_stats.items(): diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 18f2ce115c7f..0df8885aabf5 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -6,7 +6,7 @@ import pandas as pd import pandas.api.types -from ray.data import Datastream +from ray.data import Dataset from ray.data.preprocessor import Preprocessor from ray.util.annotations import PublicAPI @@ -43,7 +43,7 @@ class OrdinalEncoder(Preprocessor): 2 1 0 3 0 1 - If you transform a value not present in the original datastream, then the value + If you transform a value not present in the original dataset, then the value is encoded as ``float("nan")``. 
>>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]}) @@ -87,9 +87,9 @@ def __init__(self, columns: List[str], *, encode_lists: bool = True): self.columns = columns self.encode_lists = encode_lists - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: self.stats_ = _get_unique_value_indices( - datastream, self.columns, encode_lists=self.encode_lists + dataset, self.columns, encode_lists=self.encode_lists ) return self @@ -135,7 +135,7 @@ class OneHotEncoder(Preprocessor): 1 if the category matches and 0 otherwise. If you encode an infrequent category (see ``max_categories``) or a category - that isn't in the fitted datastream, then the category is encoded as all 0s. + that isn't in the fitted dataset, then the category is encoded as all 0s. Columns must contain hashable objects or lists of hashable objects. @@ -160,7 +160,7 @@ class OneHotEncoder(Preprocessor): 4 1 0 0 5 0 1 0 - If you one-hot encode a value that isn't in the fitted datastream, then the + If you one-hot encode a value that isn't in the fitted dataset, then the value is encoded with zeros. 
>>> df = pd.DataFrame({"color": ["yellow"]}) @@ -206,9 +206,9 @@ def __init__( self.columns = columns self.max_categories = max_categories - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: self.stats_ = _get_unique_value_indices( - datastream, + dataset, self.columns, max_categories=self.max_categories, encode_lists=False, @@ -315,9 +315,9 @@ def __init__( self.columns = columns self.max_categories = max_categories - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: self.stats_ = _get_unique_value_indices( - datastream, + dataset, self.columns, max_categories=self.max_categories, encode_lists=True, @@ -377,7 +377,7 @@ class LabelEncoder(Preprocessor): 2 4.9 3.0 0 3 6.2 3.4 2 - If you transform a label not present in the original datastream, then the new + If you transform a label not present in the original dataset, then the new label is encoded as ``float("nan")``. >>> df = pd.DataFrame({ @@ -403,8 +403,8 @@ class LabelEncoder(Preprocessor): def __init__(self, label_column: str): self.label_column = label_column - def _fit(self, datastream: Datastream) -> Preprocessor: - self.stats_ = _get_unique_value_indices(datastream, [self.label_column]) + def _fit(self, dataset: Dataset) -> Preprocessor: + self.stats_ = _get_unique_value_indices(dataset, [self.label_column]) return self def _transform_pandas(self, df: pd.DataFrame): @@ -431,7 +431,7 @@ class Categorizer(Preprocessor): .. warning:: If you don't specify ``dtypes``, fit this preprocessor before splitting - your datastream into train and test splits. This ensures categories are + your dataset into train and test splits. This ensures categories are consistent across splits. 
Examples: @@ -477,13 +477,13 @@ def __init__( self.columns = columns self.dtypes = dtypes - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: columns_to_get = [ column for column in self.columns if column not in self.dtypes ] if columns_to_get: unique_indices = _get_unique_value_indices( - datastream, columns_to_get, drop_na_values=True, key_format="{0}" + dataset, columns_to_get, drop_na_values=True, key_format="{0}" ) unique_indices = { column: pd.CategoricalDtype(values_indices.keys()) @@ -507,7 +507,7 @@ def __repr__(self): def _get_unique_value_indices( - datastream: Datastream, + dataset: Dataset, columns: List[str], drop_na_values: bool = False, key_format: str = "unique_values({0})", @@ -547,19 +547,19 @@ def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]: result = {} for col in columns: if col in df_columns: - result[col] = get_pd_value_counts_per_column(df[col]) + result[col] = [get_pd_value_counts_per_column(df[col])] else: raise ValueError( f"Column '{col}' does not exist in DataFrame, which has columns: {df_columns}" # noqa: E501 ) - return [result] + return result - value_counts = datastream.map_batches(get_pd_value_counts, batch_format="pandas") + value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") final_counters = {col: Counter() for col in columns} for batch in value_counts.iter_batches(batch_size=None): - for col_value_counts in batch: - for col, value_counts in col_value_counts.items(): - final_counters[col] += value_counts + for col, counters in batch.items(): + for counter in counters: + final_counters[col] += counter # Inspect if there is any NA values. 
for col in columns: diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py index 23c7f232a9c8..50eeb8b89c1e 100644 --- a/python/ray/data/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.api.types import is_categorical_dtype -from ray.data import Datastream +from ray.data import Dataset from ray.data.aggregate import Mean from ray.data.preprocessor import Preprocessor from ray.util.annotations import PublicAPI @@ -106,12 +106,12 @@ def __init__( '`fill_value` must be set when using "constant" strategy.' ) - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: if self.strategy == "mean": aggregates = [Mean(col) for col in self.columns] - self.stats_ = datastream.aggregate(*aggregates) + self.stats_ = dataset.aggregate(*aggregates) elif self.strategy == "most_frequent": - self.stats_ = _get_most_frequent_values(datastream, *self.columns) + self.stats_ = _get_most_frequent_values(dataset, *self.columns) return self @@ -142,19 +142,19 @@ def __repr__(self): def _get_most_frequent_values( - datastream: Datastream, *columns: str + dataset: Dataset, *columns: str ) -> Dict[str, Union[str, Number]]: columns = list(columns) def get_pd_value_counts(df: pd.DataFrame) -> List[Dict[str, Counter]]: - return [{col: Counter(df[col].value_counts().to_dict()) for col in columns}] + return {col: [Counter(df[col].value_counts().to_dict())] for col in columns} - value_counts = datastream.map_batches(get_pd_value_counts, batch_format="pandas") + value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") final_counters = {col: Counter() for col in columns} for batch in value_counts.iter_batches(batch_size=None): - for col_value_counts in batch: - for col, value_counts in col_value_counts.items(): - final_counters[col] += value_counts + for col, counters in batch.items(): + for counter in counters: + 
final_counters[col] += counter return { f"most_frequent({column})": final_counters[column].most_common(1)[0][0] diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index 713542fea8e3..b53415c954cd 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from ray.data import Datastream +from ray.data import Dataset from ray.data.aggregate import Mean, Std, Min, Max, AbsMax from ray.data.preprocessor import Preprocessor from ray.util.annotations import PublicAPI @@ -66,10 +66,10 @@ class StandardScaler(Preprocessor): def __init__(self, columns: List[str]): self.columns = columns - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: mean_aggregates = [Mean(col) for col in self.columns] std_aggregates = [Std(col, ddof=0) for col in self.columns] - self.stats_ = datastream.aggregate(*mean_aggregates, *std_aggregates) + self.stats_ = dataset.aggregate(*mean_aggregates, *std_aggregates) return self def _transform_pandas(self, df: pd.DataFrame): @@ -150,9 +150,9 @@ class MinMaxScaler(Preprocessor): def __init__(self, columns: List[str]): self.columns = columns - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: aggregates = [Agg(col) for Agg in [Min, Max] for col in self.columns] - self.stats_ = datastream.aggregate(*aggregates) + self.stats_ = dataset.aggregate(*aggregates) return self def _transform_pandas(self, df: pd.DataFrame): @@ -230,9 +230,9 @@ class MaxAbsScaler(Preprocessor): def __init__(self, columns: List[str]): self.columns = columns - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: aggregates = [AbsMax(col) for col in self.columns] - self.stats_ = datastream.aggregate(*aggregates) + self.stats_ = dataset.aggregate(*aggregates) return self def 
_transform_pandas(self, df: pd.DataFrame): @@ -315,12 +315,12 @@ def __init__( self.columns = columns self.quantile_range = quantile_range - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: low = self.quantile_range[0] med = 0.50 high = self.quantile_range[1] - num_records = datastream.count() + num_records = dataset.count() max_index = num_records - 1 split_indices = [int(percentile * max_index) for percentile in (low, med, high)] @@ -328,15 +328,15 @@ def _fit(self, datastream: Datastream) -> Preprocessor: # TODO(matt): Handle case where quantile lands between 2 numbers. # The current implementation will simply choose the closest index. - # This will affect the results of small datastreams more than large datastreams. + # This will affect the results of small datasets more than large datasets. for col in self.columns: - filtered_datastream = datastream.map_batches( + filtered_dataset = dataset.map_batches( lambda df: df[[col]], batch_format="pandas" ) - sorted_datastream = filtered_datastream.sort(col) - _, low, med, high = sorted_datastream.split_at_indices(split_indices) + sorted_dataset = filtered_dataset.sort(col) + _, low, med, high = sorted_dataset.split_at_indices(split_indices) - def _get_first_value(ds: Datastream, c: str): + def _get_first_value(ds: Dataset, c: str): return ds.take(1)[0][c] low_val = _get_first_value(low, col) diff --git a/python/ray/data/preprocessors/torch.py b/python/ray/data/preprocessors/torch.py index cdede252be56..642fd1483ca3 100644 --- a/python/ray/data/preprocessors/torch.py +++ b/python/ray/data/preprocessors/torch.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Callable, Dict, List, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Union, Optional, Mapping import numpy as np @@ -18,9 +18,9 @@ class TorchVisionPreprocessor(Preprocessor): Examples: >>> import ray - >>> datastream = 
ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") - >>> datastream # doctest: +ellipsis - Datastream(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(..., 3), dtype=float)}) + >>> dataset = ray.data.read_images("s3://anonymous@air-example-data-2/imagenet-sample-images") + >>> dataset # doctest: +ellipsis + Dataset(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(..., 3), dtype=float)}) Torch models expect inputs of shape :math:`(B, C, H, W)` in the range :math:`[0.0, 1.0]`. To convert images to this format, add ``ToTensor`` to your @@ -33,9 +33,9 @@ class TorchVisionPreprocessor(Preprocessor): ... transforms.Resize((224, 224)), ... ]) >>> preprocessor = TorchVisionPreprocessor(["image"], transform=transform) - >>> datastream = preprocessor.transform(datastream) # doctest: +ellipsis - >>> datastream # doctest: +ellipsis - Datastream(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(3, 224, 224), dtype=float)}) + >>> dataset = preprocessor.transform(dataset) # doctest: +ellipsis + >>> dataset # doctest: +ellipsis + Dataset(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(3, 224, 224), dtype=float)}) For better performance, set ``batched`` to ``True`` and replace ``ToTensor`` with a batch-supporting ``Lambda``. @@ -54,15 +54,17 @@ class TorchVisionPreprocessor(Preprocessor): >>> preprocessor = TorchVisionPreprocessor( ... ["image"], transform=transform, batched=True ... ) - >>> datastream = preprocessor.transform(datastream) # doctest: +ellipsis - >>> datastream # doctest: +ellipsis - Datastream(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(3, 224, 224), dtype=float)}) + >>> dataset = preprocessor.transform(dataset) # doctest: +ellipsis + >>> dataset # doctest: +ellipsis + Dataset(num_blocks=..., num_rows=..., schema={image: numpy.ndarray(shape=(3, 224, 224), dtype=float)}) Args: columns: The columns to apply the TorchVision transform to. 
transform: The TorchVision transform you want to apply. This transform should accept a ``np.ndarray`` or ``torch.Tensor`` as input and return a ``torch.Tensor`` as output. + output_columns: The output name for each input column. If not specified, this + defaults to the same set of columns as the columns. batched: If ``True``, apply ``transform`` to batches of shape :math:`(B, H, W, C)`. Otherwise, apply ``transform`` to individual images. """ # noqa: E501 @@ -73,21 +75,32 @@ def __init__( self, columns: List[str], transform: Callable[[Union["np.ndarray", "torch.Tensor"]], "torch.Tensor"], + output_columns: Optional[List[str]] = None, batched: bool = False, ): + if not output_columns: + output_columns = columns + if len(columns) != len(output_columns): + raise ValueError( + "The length of columns should match the " + f"length of output_columns: {columns} vs {output_columns}." + ) self._columns = columns + self._output_columns = output_columns self._torchvision_transform = transform self._batched = batched def __repr__(self) -> str: return ( - f"{self.__class__.__name__}(columns={self._columns}, " + f"{self.__class__.__name__}(" + f"columns={self._columns}, " + f"output_columns={self._output_columns}, " f"transform={self._torchvision_transform!r})" ) def _transform_numpy( - self, np_data: Union["np.ndarray", Dict[str, "np.ndarray"]] - ) -> Union["np.ndarray", Dict[str, "np.ndarray"]]: + self, data_batch: Dict[str, "np.ndarray"] + ) -> Dict[str, "np.ndarray"]: import torch from ray.air._internal.torch_utils import convert_ndarray_to_torch_tensor @@ -98,15 +111,15 @@ def apply_torchvision_transform(array: np.ndarray) -> np.ndarray: except TypeError: # Transforms like `ToTensor` expect a `np.ndarray` as input. 
output = self._torchvision_transform(array) - - if not isinstance(output, torch.Tensor): + if isinstance(output, torch.Tensor): + output = output.numpy() + if not isinstance(output, np.ndarray): raise ValueError( "`TorchVisionPreprocessor` expected your transform to return a " - "`torch.Tensor`, but your transform returned a " + "`torch.Tensor` or `np.ndarray`, but your transform returned a " f"`{type(output).__name__}` instead." ) - - return output.numpy() + return output def transform_batch(batch: np.ndarray) -> np.ndarray: if self._batched: @@ -115,14 +128,15 @@ def transform_batch(batch: np.ndarray) -> np.ndarray: [apply_torchvision_transform(array) for array in batch] ) - if isinstance(np_data, dict): - outputs = np_data - for column in self._columns: - outputs[column] = transform_batch(np_data[column]) + if isinstance(data_batch, Mapping): + for input_col, output_col in zip(self._columns, self._output_columns): + data_batch[output_col] = transform_batch(data_batch[input_col]) else: - outputs = transform_batch(np_data) + # TODO(ekl) deprecate this code path. Unfortunately, predictors are still + # sending schemaless arrays to preprocessors. + data_batch = transform_batch(data_batch) - return outputs + return data_batch def preferred_batch_format(cls) -> BatchFormat: return BatchFormat.NUMPY diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 6949ade9275b..bcd8b5f14e1f 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -3,7 +3,7 @@ import pandas as pd -from ray.data import Datastream +from ray.data import Dataset from ray.data.preprocessor import Preprocessor from ray.data.preprocessors.utils import simple_split_tokenizer, simple_hash from ray.util.annotations import PublicAPI @@ -201,7 +201,7 @@ class CountVectorizer(Preprocessor): output. If unspecified, the tokenizer uses a function equivalent to ``lambda s: s.split(" ")``. 
max_features: The maximum number of tokens to encode in the transformed - datastream. If specified, only the most frequent tokens are encoded. + dataset. If specified, only the most frequent tokens are encoded. """ # noqa: E501 @@ -217,23 +217,21 @@ def __init__( self.tokenization_fn = tokenization_fn or simple_split_tokenizer self.max_features = max_features - def _fit(self, datastream: Datastream) -> Preprocessor: + def _fit(self, dataset: Dataset) -> Preprocessor: def get_pd_value_counts(df: pd.DataFrame) -> List[Counter]: def get_token_counts(col): token_series = df[col].apply(self.tokenization_fn) tokens = token_series.sum() return Counter(tokens) - return [{col: get_token_counts(col) for col in self.columns}] + return {col: [get_token_counts(col)] for col in self.columns} - value_counts = datastream.map_batches( - get_pd_value_counts, batch_format="pandas" - ) + value_counts = dataset.map_batches(get_pd_value_counts, batch_format="pandas") total_counts = {col: Counter() for col in self.columns} for batch in value_counts.iter_batches(batch_size=None): - for x in batch: - for col, col_value_counts in x.items(): - total_counts[col].update(col_value_counts) + for col, counters in batch.items(): + for counter in counters: + total_counts[col].update(counter) def most_common(counter: Counter, n: int): return Counter(dict(counter.most_common(n))) diff --git a/python/ray/data/random_access_dataset.py b/python/ray/data/random_access_dataset.py index fd190529e9c2..1df2eebc4140 100644 --- a/python/ray/data/random_access_dataset.py +++ b/python/ray/data/random_access_dataset.py @@ -4,11 +4,11 @@ import time from collections import defaultdict import numpy as np -from typing import List, Any, Generic, Optional, TYPE_CHECKING +from typing import List, Any, Optional, TYPE_CHECKING import ray from ray.types import ObjectRef -from ray.data.block import T, BlockAccessor +from ray.data.block import BlockAccessor from ray.data.context import DataContext, 
DEFAULT_SCHEDULING_STRATEGY from ray.data._internal.remote_fn import cached_remote_fn from ray.util.annotations import PublicAPI @@ -19,21 +19,21 @@ pa = None if TYPE_CHECKING: - from ray.data import Datastream + from ray.data import Dataset logger = logging.getLogger(__name__) @PublicAPI(stability="alpha") -class RandomAccessDataset(Generic[T]): - """A class that provides distributed, random access to a Datastream. +class RandomAccessDataset: + """A class that provides distributed, random access to a Dataset. - See: ``Datastream.to_random_access_dataset()``. + See: ``Dataset.to_random_access_dataset()``. """ def __init__( self, - ds: "Datastream[T]", + ds: "Dataset", key: str, num_workers: int, ): @@ -47,7 +47,7 @@ def __init__( raise ValueError("RandomAccessDataset only supports Arrow-format blocks.") start = time.perf_counter() - logger.info("[setup] Indexing datastream by sort key.") + logger.info("[setup] Indexing dataset by sort key.") sorted_ds = ds.sort(key) get_bounds = cached_remote_fn(_get_bounds) blocks = sorted_ds.get_internal_block_refs() @@ -130,7 +130,7 @@ def _compute_block_to_worker_assignments(self): return block_to_workers, worker_to_blocks - def get_async(self, key: Any) -> ObjectRef[Optional[T]]: + def get_async(self, key: Any) -> ObjectRef[Any]: """Asynchronously finds the record for a single key. Args: @@ -144,7 +144,7 @@ def get_async(self, key: Any) -> ObjectRef[Optional[T]]: return ray.put(None) return self._worker_for(block_index).get.remote(block_index, key) - def multiget(self, keys: List[Any]) -> List[Optional[T]]: + def multiget(self, keys: List[Any]) -> List[Optional[Any]]: """Synchronously find the records for a list of keys. 
Args: diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index b24009ed5a98..3cf4649c61fe 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -37,7 +37,7 @@ from ray.data._internal.logical.operators.read_operator import Read from ray.data._internal.plan import ExecutionPlan from ray.data._internal.remote_fn import cached_remote_fn -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats from ray.data._internal.util import ( _lazy_import_pyarrow_dataset, _autodetect_parallelism, @@ -46,10 +46,9 @@ ndarray_to_block, get_table_block_metadata, ) -from ray.data.row import TableRow from ray.data.block import Block, BlockAccessor, BlockExecStats, BlockMetadata from ray.data.context import DEFAULT_SCHEDULING_STRATEGY, WARN_PREFIX, DataContext -from ray.data.datastream import Datastream, MaterializedDatastream +from ray.data.dataset import Dataset, MaterializedDataset from ray.data.datasource import ( BaseFileMetadataProvider, BinaryDatasource, @@ -84,6 +83,7 @@ from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy +from ray._private.auto_init_hook import wrap_auto_init if TYPE_CHECKING: import dask @@ -110,28 +110,28 @@ def from_items( *, parallelism: int = -1, output_arrow_format: bool = False, -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a list of local Python objects. +) -> MaterializedDataset: + """Create a dataset from a list of local Python objects. 
Examples: >>> import ray >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP >>> ds # doctest: +SKIP - MaterializedDatastream(num_blocks=5, num_rows=5, schema=) - >>> ds.take(2) # doctest: +SKIP - [1, 2] + MaterializedDataset(num_blocks=5, num_rows=5, schema={item: int64}) + >>> ds.take_batch(2) # doctest: +SKIP + {"item": array([1, 2])} Args: items: List of local Python objects. - parallelism: The amount of parallelism to use for the datastream. + parallelism: The amount of parallelism to use for the dataset. Parallelism may be limited by the number of items. output_arrow_format: If True, always return data in Arrow format, raising an error if this is not possible. Defaults to False. Returns: - MaterializedDatastream holding the items. + MaterializedDataset holding the items. """ - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: output_arrow_format = True @@ -195,10 +195,10 @@ def from_items( from_items_op = FromItems(items, detected_parallelism) logical_plan = LogicalPlan(from_items_op) - return MaterializedDatastream( + return MaterializedDataset( ExecutionPlan( BlockList(blocks, metadata, owned_by_consumer=False), - DatastreamStats(stages={"FromItems": metadata}, parent=None), + DatasetStats(stages={"FromItems": metadata}, parent=None), run_by_consumer=False, ), 0, @@ -208,26 +208,26 @@ def from_items( @PublicAPI -def range(n: int, *, parallelism: int = -1) -> Datastream[TableRow]: - """Create a datastream from a range of integers [0..n). +def range(n: int, *, parallelism: int = -1) -> Dataset: + """Create a dataset from a range of integers [0..n). 
Examples: >>> import ray >>> ds = ray.data.range(10000) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=10000, schema={id: int64}) + Dataset(num_blocks=200, num_rows=10000, schema={id: int64}) >>> ds.map(lambda x: {"id": x["id"] * 2}).take(4) # doctest: +SKIP [{"id": 0}, {"id": 2}, {"id": 4}, {"id": 6}] Args: n: The upper bound of the range of integers. - parallelism: The amount of parallelism to use for the datastream. + parallelism: The amount of parallelism to use for the dataset. Parallelism may be limited by the number of items. Returns: - Datastream producing the integers. + Dataset producing the integers. """ - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: return read_datasource( RangeDatasource(), @@ -241,34 +241,11 @@ def range(n: int, *, parallelism: int = -1) -> Datastream[TableRow]: ) -@PublicAPI -def range_table(n: int, *, parallelism: int = -1) -> Datastream[TableRow]: - """Create a tabular stream from a range of integers [0..n). - - Examples: - >>> import ray - >>> ds = ray.data.range_table(1000) # doctest: +SKIP - >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=1000, schema={value: int64}) - >>> ds.map(lambda r: {"v2": r["value"] * 2}).take(2) # doctest: +SKIP - [ArrowRow({'v2': 0}), ArrowRow({'v2': 2})] - - This is similar to range(), but uses Arrow tables to hold the integers - in Arrow records. The datastream elements take the form {"value": N}. - - Args: - n: The upper bound of the range of integer records. - parallelism: The amount of parallelism to use for the datastream. - Parallelism may be limited by the number of items. - - Returns: - Datastream producing the integers as Arrow records. 
- """ - ctx = ray.data.DatasetContext.get_current() +@Deprecated +def range_table(n: int, *, parallelism: int = -1) -> Dataset: + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: - raise DeprecationWarning( - "In strict mode, use range() instead of range_table()." - ) + raise DeprecationWarning("In Ray 2.5, use range() instead of range_table().") return read_datasource( RangeDatasource(), parallelism=parallelism, @@ -284,40 +261,37 @@ def range_arrow(*args, **kwargs): @PublicAPI -def range_tensor( - n: int, *, shape: Tuple = (1,), parallelism: int = -1 -) -> Datastream[TableRow]: +def range_tensor(n: int, *, shape: Tuple = (1,), parallelism: int = -1) -> Dataset: """Create a Tensor stream from a range of integers [0..n). Examples: >>> import ray >>> ds = ray.data.range_tensor(1000, shape=(2, 2)) - >>> ds # doctest: +ellipsis - Datastream( - num_blocks=..., - num_rows=1000, - schema={__value__: numpy.ndarray(shape=(2, 2), dtype=int64)} + >>> ds # doctest: +ELLIPSIS + Dataset( + num_blocks=..., + num_rows=1000, + schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)} ) >>> ds.map_batches(lambda arr: arr * 2).take(2) # doctest: +SKIP [array([[0, 0], [0, 0]]), - array([[2, 2], + array([[2, 2], [2, 2]])] This is similar to range_table(), but uses the ArrowTensorArray extension - type. The datastream elements take the form - {"__value__": array(N, shape=shape)}. + type. The dataset elements take the form {"data": array(N, shape=shape)}. Args: n: The upper bound of the range of integer records. shape: The shape of each record. - parallelism: The amount of parallelism to use for the datastream. + parallelism: The amount of parallelism to use for the dataset. Parallelism may be limited by the number of items. Returns: - Datastream producing the integers as Arrow tensor records. + Dataset producing the integers as Arrow tensor records. 
""" - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() return read_datasource( RangeDatasource(), parallelism=parallelism, @@ -329,13 +303,14 @@ def range_tensor( @PublicAPI +@wrap_auto_init def read_datasource( - datasource: Datasource[T], + datasource: Datasource, *, parallelism: int = -1, ray_remote_args: Dict[str, Any] = None, **read_args, -) -> Datastream[T]: +) -> Dataset: """Read a stream from a custom data source. Args: @@ -348,7 +323,7 @@ def read_datasource( ray_remote_args: kwargs passed to ray.remote in the read tasks. Returns: - Datastream that reads data from the datasource. + Dataset that reads data from the datasource. """ ctx = DataContext.get_current() @@ -416,24 +391,24 @@ def read_datasource( if read_tasks and len(read_tasks) < min_safe_parallelism * 0.7: perc = 1 + round((min_safe_parallelism - len(read_tasks)) / len(read_tasks), 1) logger.warning( - f"{WARN_PREFIX} The blocks of this datastream are estimated to be {perc}x " + f"{WARN_PREFIX} The blocks of this dataset are estimated to be {perc}x " "larger than the target block size " f"of {int(ctx.target_max_block_size / 1024 / 1024)} MiB. This may lead to " "out-of-memory errors during processing. Consider reducing the size of " "input files or using `.repartition(n)` to increase the number of " - "datastream blocks." + "dataset blocks." ) elif len(read_tasks) < requested_parallelism and ( len(read_tasks) < ray.available_resources().get("CPU", 1) // 2 ): logger.warning( - f"{WARN_PREFIX} The number of blocks in this datastream " + f"{WARN_PREFIX} The number of blocks in this dataset " f"({len(read_tasks)}) " f"limits its parallelism to {len(read_tasks)} concurrent tasks. " "This is much less than the number " "of available CPU slots in the cluster. Use `.repartition(n)` to " "increase the number of " - "datastream blocks." + "dataset blocks." 
) read_stage_name = f"Read{datasource.get_name()}" @@ -466,7 +441,7 @@ def read_datasource( read_op = Read(datasource, requested_parallelism, ray_remote_args, read_args) logical_plan = LogicalPlan(read_op) - return Datastream( + return Dataset( plan=ExecutionPlan(block_list, block_list.stats(), run_by_consumer=False), epoch=0, lazy=True, @@ -485,11 +460,11 @@ def read_mongo( parallelism: int = -1, ray_remote_args: Dict[str, Any] = None, **mongo_args, -) -> Datastream[TableRow]: - """Create an Arrow datastream from MongoDB. +) -> Dataset: + """Create an Arrow dataset from MongoDB. The data to read from is specified via the ``uri``, ``database`` and ``collection`` - of the MongoDB. The datastream is created from the results of executing + of the MongoDB. The dataset is created from the results of executing ``pipeline`` against the ``collection``. If ``pipeline`` is None, the entire ``collection`` will be read. @@ -518,7 +493,7 @@ def read_mongo( ... ) Args: - uri: The URI of the source MongoDB where the datastream will be + uri: The URI of the source MongoDB where the dataset will be read from. For the URI format, see details in https://www.mongodb.com/docs/manual/reference/connection-string/. database: The name of the database hosted in the MongoDB. This database @@ -526,7 +501,7 @@ def read_mongo( collection: The name of the collection in the database. This collection must exist otherwise ValueError will be raised. pipeline: A MongoDB pipeline, which will be executed on the given collection - with results used to create Datastream. If None, the entire collection will + with results used to create Dataset. If None, the entire collection will be read. schema: The schema used to read the collection. If None, it'll be inferred from the results of pipeline. @@ -538,7 +513,7 @@ def read_mongo( Arrow-formatted results. 
Returns: - Datastream producing Arrow records from the results of executing the pipeline + Dataset producing Arrow records from the results of executing the pipeline on the specified MongoDB collection. """ return read_datasource( @@ -565,8 +540,8 @@ def read_parquet( tensor_column_schema: Optional[Dict[str, Tuple[np.dtype, Tuple[int, ...]]]] = None, meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(), **arrow_parquet_args, -) -> Datastream[TableRow]: - """Create an Arrow datastream from parquet files. +) -> Dataset: + """Create an Arrow dataset from parquet files. Examples: >>> import ray @@ -585,7 +560,7 @@ def read_parquet( ... ("variety", pa.string())] >>> ray.data.read_parquet("example://iris.parquet", ... schema=pa.schema(fields)) - Datastream( + Dataset( num_blocks=1, num_rows=150, schema={ @@ -607,7 +582,7 @@ def read_parquet( https://arrow.apache.org/docs/python/api/filesystems.html#filesystem-implementations. columns: A list of column names to read. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. ray_remote_args: kwargs passed to ray.remote in the read tasks. tensor_column_schema: A dict of column name --> tensor dtype and shape mappings for converting a Parquet column containing serialized @@ -621,7 +596,7 @@ def read_parquet( https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment Returns: - Datastream producing Arrow records read from the specified paths. + Dataset producing Arrow records read from the specified paths. """ arrow_parquet_args = _resolve_parquet_args( tensor_column_schema, @@ -656,7 +631,7 @@ def read_images( mode: Optional[str] = None, include_paths: bool = False, ignore_missing_paths: bool = False, -) -> Datastream[TableRow]: +) -> Dataset: """Read images from the specified paths. 
Examples: @@ -664,13 +639,13 @@ def read_images( >>> path = "s3://anonymous@air-example-data-2/movie-image-small-filesize-1GB" >>> ds = ray.data.read_images(path) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=41979, schema={image: numpy.ndarray(ndim=3, dtype=uint8)}) + Dataset(num_blocks=200, num_rows=41979, schema={image: numpy.ndarray(ndim=3, dtype=uint8)}) If you need image file paths, set ``include_paths=True``. >>> ds = ray.data.read_images(path, include_paths=True) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=41979, schema={image: numpy.ndarray(ndim=3, dtype=uint8), path: string}) + Dataset(num_blocks=200, num_rows=41979, schema={image: numpy.ndarray(ndim=3, dtype=uint8), path: string}) >>> ds.take(1)[0]["path"] # doctest: +SKIP 'air-example-data-2/movie-image-small-filesize-1GB/0.jpg' @@ -693,21 +668,21 @@ def read_images( >>> partitioning = Partitioning("dir", field_names=["class"], base_dir=root) >>> ds = ray.data.read_images(root, size=(224, 224), partitioning=partitioning) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=176, num_rows=94946, schema={image: TensorDtype(shape=(224, 224, 3), dtype=uint8), class: object}) + Dataset(num_blocks=176, num_rows=94946, schema={image: TensorDtype(shape=(224, 224, 3), dtype=uint8), class: object}) Args: paths: A single file/directory path or a list of file/directory paths. A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. ray_remote_args: kwargs passed to ray.remote in the read tasks. arrow_open_file_args: kwargs passed to ``pyarrow.fs.FileSystem.open_input_file``. 
partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this filters out any file paths whose file extension does not match ``*.png``, ``*.jpg``, ``*.jpeg``, ``*.tiff``, ``*.bmp``, or ``*.gif``. partitioning: A :class:`~ray.data.datasource.partitioning.Partitioning` object @@ -724,9 +699,9 @@ def read_images( that are not found. Defaults to False. Returns: - A :class:`~ray.data.Datastream` producing tensors that represent the images at + A :class:`~ray.data.Dataset` producing tensors that represent the images at the specified paths. For information on working with tensors, read the - :ref:`tensor data guide `. + :ref:`tensor data guide `. Raises: ValueError: if ``size`` contains non-positive numbers. @@ -764,8 +739,8 @@ def read_parquet_bulk( ParquetBaseDatasource.file_extension_filter() ), **arrow_parquet_args, -) -> Datastream[TableRow]: - """Create an Arrow datastream from a large number (such as >1K) of parquet files +) -> Dataset: + """Create an Arrow dataset from a large number (such as >1K) of parquet files quickly. By default, ONLY file paths should be provided as input (i.e. no directory paths), @@ -805,7 +780,7 @@ def read_parquet_bulk( filesystem: The filesystem implementation to read from. columns: A list of column names to read. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. ray_remote_args: kwargs passed to ray.remote in the read tasks. arrow_open_file_args: kwargs passed to ``pyarrow.fs.FileSystem.open_input_file``. @@ -820,13 +795,13 @@ def read_parquet_bulk( files. Change to ``DefaultFileMetadataProvider`` or a custom metadata provider if directory expansion and/or file metadata resolution is required. partition_filter: Path-based partition filter, if any. 
Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this filters out any file paths whose file extension does not match "*.parquet*". arrow_parquet_args: Other parquet read options to pass to pyarrow. Returns: - Datastream producing Arrow records read from the specified paths. + Dataset producing Arrow records read from the specified paths. """ arrow_parquet_args = _resolve_parquet_args( tensor_column_schema, @@ -861,8 +836,8 @@ def read_json( partitioning: Partitioning = Partitioning("hive"), ignore_missing_paths: bool = False, **arrow_json_args, -) -> Datastream[TableRow]: - """Create an Arrow datastream from json files. +) -> Dataset: + """Create an Arrow dataset from json files. Examples: >>> import ray @@ -881,8 +856,8 @@ def read_json( from file paths. If your data adheres to a different partitioning scheme, set the ``partitioning`` parameter. - >>> ds = ray.data.read_json("example://year=2022/month=09/sales.json") # doctest: + SKIP - >>> ds.take(1) # doctest: + SKIP + >>> ds = ray.data.read_json("example://year=2022/month=09/sales.json") # doctest: +SKIP + >>> ds.take(1) # doctest: +SKIP [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'} Args: @@ -890,14 +865,14 @@ def read_json( A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. ray_remote_args: kwargs passed to ray.remote in the read tasks. arrow_open_stream_args: kwargs passed to pyarrow.fs.FileSystem.open_input_stream meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. 
Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this filters out any file paths whose file extension does not match "*.json*". arrow_json_args: Other json read options to pass to pyarrow. @@ -908,7 +883,7 @@ def read_json( found. Defaults to False. Returns: - Datastream producing Arrow records read from the specified paths. + Dataset producing Arrow records read from the specified paths. """ # noqa: E501 return read_datasource( JSONDatasource(), @@ -938,8 +913,8 @@ def read_csv( partitioning: Partitioning = Partitioning("hive"), ignore_missing_paths: bool = False, **arrow_csv_args, -) -> Datastream[TableRow]: - r"""Create an Arrow datastream from csv files. +) -> Dataset: + r"""Create an Arrow dataset from csv files. Examples: >>> import ray @@ -976,9 +951,9 @@ def read_csv( from file paths. If your data adheres to a different partitioning scheme, set the ``partitioning`` parameter. - >>> ds = ray.data.read_csv("example://year=2022/month=09/sales.csv") # doctest: + SKIP - >>> ds.take(1) # doctest: + SKIP - [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'} + >>> ds = ray.data.read_csv("example://year=2022/month=09/sales.csv") # doctest: +SKIP + >>> ds.take(1) # doctest: +SKIP + [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}] By default, ``read_csv`` reads all files from file paths. If you want to filter files by file extensions, set the ``partition_filter`` parameter. @@ -994,14 +969,14 @@ def read_csv( A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. ray_remote_args: kwargs passed to ray.remote in the read tasks. 
arrow_open_stream_args: kwargs passed to pyarrow.fs.FileSystem.open_input_stream meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this does not filter out any files. If wishing to filter out all file paths except those whose file extension matches e.g. "*.csv*", a ``FileExtensionFilter("csv")`` can be provided. @@ -1013,7 +988,7 @@ def read_csv( found. Defaults to False. Returns: - Datastream producing Arrow records read from the specified paths. + Dataset producing Arrow records read from the specified paths. """ # noqa: E501 return read_datasource( CSVDatasource(), @@ -1045,8 +1020,8 @@ def read_text( partition_filter: Optional[PathPartitionFilter] = None, partitioning: Partitioning = None, ignore_missing_paths: bool = False, -) -> Datastream[TableRow]: - """Create a datastream from lines stored in text files. +) -> Dataset: + """Create a dataset from lines stored in text files. Examples: >>> import ray @@ -1081,7 +1056,7 @@ def read_text( found. Defaults to False. Returns: - Datastream producing lines of text read from the specified paths. + Dataset producing lines of text read from the specified paths. """ return read_datasource( TextDatasource(), @@ -1113,8 +1088,8 @@ def read_numpy( partitioning: Partitioning = None, ignore_missing_paths: bool = False, **numpy_load_args, -) -> Datastream[TableRow]: - """Create an Arrow datastream from numpy files. +) -> Dataset: + """Create an Arrow dataset from numpy files. Examples: >>> import ray @@ -1133,14 +1108,14 @@ def read_numpy( A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. 
Parallelism may be - limited by the number of files of the datastream. + limited by the number of files of the dataset. arrow_open_stream_args: kwargs passed to pyarrow.fs.FileSystem.open_input_stream numpy_load_args: Other options to pass to np.load. meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this filters out any file paths whose file extension does not match "*.npy*". partitioning: A :class:`~ray.data.datasource.partitioning.Partitioning` object @@ -1149,7 +1124,7 @@ def read_numpy( found. Defaults to False. Returns: - Datastream holding Tensor records read from the specified paths. + Dataset holding Tensor records read from the specified paths. """ return read_datasource( NumpyDatasource(), @@ -1176,8 +1151,8 @@ def read_tfrecords( partition_filter: Optional[PathPartitionFilter] = None, ignore_missing_paths: bool = False, tf_schema: Optional["schema_pb2.Schema"] = None, -) -> Datastream[TableRow]: - """Create a datastream from TFRecord files that contain +) -> Dataset: + """Create a dataset from TFRecord files that contain `tf.train.Example `_ messages. @@ -1203,7 +1178,7 @@ def read_tfrecords( ... writer.write(example.SerializeToString()) This function reads ``tf.train.Example`` messages into a tabular - :class:`~ray.data.Datastream`. + :class:`~ray.data.Dataset`. >>> import ray >>> ds = ray.data.read_tfrecords(path) @@ -1232,7 +1207,7 @@ def read_tfrecords( A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files in the datastream. + limited by the number of files in the dataset. 
arrow_open_stream_args: Key-word arguments passed to ``pyarrow.fs.FileSystem.open_input_stream``. To read a compressed TFRecord file, pass the corresponding compression type (e.g. for ``GZIP`` or ``ZLIB``, use @@ -1240,16 +1215,16 @@ def read_tfrecords( meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this filters out any file paths whose file extension does not match ``"*.tfrecords*"``. ignore_missing_paths: If True, ignores any file paths in ``paths`` that are not found. Defaults to False. tf_schema: Optional TensorFlow Schema which is used to explicitly set the schema - of the underlying Datastream. + of the underlying Dataset. Returns: - A :class:`~ray.data.Datastream` that contains the example features. + A :class:`~ray.data.Dataset` that contains the example features. Raises: ValueError: If a file contains a message that isn't a ``tf.train.Example``. @@ -1281,15 +1256,15 @@ def read_webdataset( filerename: Optional[Union[list, callable]] = None, suffixes: Optional[Union[list, callable]] = None, verbose_open: bool = False, -) -> Datastream[TableRow]: - """Create a datastream from WebDataset files. +) -> Dataset: + """Create a dataset from WebDataset files. Args: paths: A single file/directory path or a list of file/directory paths. A list of paths can contain both files and directories. filesystem: The filesystem implementation to read from. parallelism: The requested parallelism of the read. Parallelism may be - limited by the number of files in the datastream. + limited by the number of files in the dataset. arrow_open_stream_args: Key-word arguments passed to ``pyarrow.fs.FileSystem.open_input_stream``. 
To read a compressed TFRecord file, pass the corresponding compression type (e.g. for ``GZIP`` or ``ZLIB``, use @@ -1297,7 +1272,7 @@ def read_webdataset( meta_provider: File metadata provider. Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. decoder: A function or list of functions to decode the data. fileselect: A callable or list of glob patterns to select files. filerename: A function or list of tuples to rename files prior to grouping. @@ -1305,7 +1280,7 @@ def read_webdataset( verbose_open: Whether to print the file names as they are opened. Returns: - A :class:`~ray.data.Datastream` that contains the example features. + A :class:`~ray.data.Dataset` that contains the example features. Raises: ValueError: If a file contains a message that isn't a ``tf.train.Example``. @@ -1340,8 +1315,8 @@ def read_binary_files( partitioning: Partitioning = None, ignore_missing_paths: bool = False, output_arrow_format: bool = False, -) -> Datastream[TableRow]: - """Create a datastream from binary files of arbitrary contents. +) -> Dataset: + """Create a dataset from binary files of arbitrary contents. Examples: >>> import ray @@ -1355,7 +1330,7 @@ def read_binary_files( Args: paths: A single file path or a list of file paths (or directories). include_paths: Whether to include the full path of the file in the - datastream records. When specified, the stream records will be a + dataset records. When specified, the stream records will be a tuple of the file path and the file contents. filesystem: The filesystem implementation to read from. ray_remote_args: kwargs passed to ray.remote in the read tasks. @@ -1366,7 +1341,7 @@ def read_binary_files( meta_provider: File metadata provider. 
Custom metadata providers may be able to resolve file metadata more quickly and/or accurately. partition_filter: Path-based partition filter, if any. Can be used - with a custom callback to read only selected partitions of a datastream. + with a custom callback to read only selected partitions of a dataset. By default, this does not filter out any files. partitioning: A :class:`~ray.data.datasource.partitioning.Partitioning` object that describes how paths are organized. Defaults to ``None``. @@ -1376,17 +1351,17 @@ def read_binary_files( list format. Defaults to False. Returns: - Datastream producing records read from the specified paths. + Dataset producing records read from the specified paths. """ - ctx = ray.data.DatasetContext.get_current() + ctx = ray.data.DataContext.get_current() if ctx.strict_mode: output_arrow_format = True if not output_arrow_format: logger.warning( - "read_binary_files() returns Datastream in Python list format as of Ray " + "read_binary_files() returns Dataset in Python list format as of Ray " "v2.4. Use read_binary_files(output_arrow_format=True) to return " - "Datastream in Arrow format.", + "Dataset in Arrow format.", ) return read_datasource( @@ -1412,7 +1387,7 @@ def read_sql( *, parallelism: int = -1, ray_remote_args: Optional[Dict[str, Any]] = None, -) -> Datastream[Any]: +) -> Dataset: """Read from a database that provides a `Python DB API2-compliant `_ connector. @@ -1476,7 +1451,7 @@ def create_connection(): ray_remote_args: Keyword arguments passed to :func:`ray.remote` in read tasks. Returns: - A :class:`Datastream` containing the queried data. + A :class:`Dataset` containing the queried data. """ datasource = SQLDatasource(connection_factory) return read_datasource( @@ -1488,14 +1463,14 @@ def create_connection(): @PublicAPI -def from_dask(df: "dask.DataFrame") -> MaterializedDatastream[TableRow]: - """Create a datastream from a Dask DataFrame. 
+def from_dask(df: "dask.DataFrame") -> MaterializedDataset: + """Create a dataset from a Dask DataFrame. Args: df: A Dask DataFrame. Returns: - MaterializedDatastream holding Arrow records read from the DataFrame. + MaterializedDataset holding Arrow records read from the DataFrame. """ import dask @@ -1526,18 +1501,18 @@ def to_ref(df): @PublicAPI -def from_mars(df: "mars.DataFrame") -> MaterializedDatastream[TableRow]: - """Create a datastream from a MARS dataframe. +def from_mars(df: "mars.DataFrame") -> MaterializedDataset: + """Create a dataset from a MARS dataframe. Args: df: A MARS dataframe, which must be executed by MARS-on-Ray. Returns: - MaterializedDatastream holding Arrow records read from the dataframe. + MaterializedDataset holding Arrow records read from the dataframe. """ import mars.dataframe as md - ds: Datastream = md.to_ray_dataset(df) + ds: Dataset = md.to_ray_dataset(df) logical_plan = LogicalPlan(FromMars(ds.dataframe)) ds._logical_plan = logical_plan @@ -1546,14 +1521,14 @@ def from_mars(df: "mars.DataFrame") -> MaterializedDatastream[TableRow]: @PublicAPI -def from_modin(df: "modin.DataFrame") -> MaterializedDatastream[TableRow]: - """Create a datastream from a Modin dataframe. +def from_modin(df: "modin.DataFrame") -> MaterializedDataset: + """Create a dataset from a Modin dataframe. Args: df: A Modin dataframe, which must be using the Ray backend. Returns: - MaterializedDatastream holding Arrow records read from the dataframe. + MaterializedDataset holding Arrow records read from the dataframe. """ from modin.distributed.dataframe.pandas.partitions import unwrap_partitions @@ -1569,14 +1544,14 @@ def from_modin(df: "modin.DataFrame") -> MaterializedDatastream[TableRow]: @PublicAPI def from_pandas( dfs: Union["pandas.DataFrame", List["pandas.DataFrame"]] -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a list of Pandas dataframes. +) -> MaterializedDataset: + """Create a dataset from a list of Pandas dataframes. 
Args: dfs: A Pandas dataframe or a list of Pandas dataframes. Returns: - MaterializedDatastream holding Arrow records read from the dataframes. + MaterializedDataset holding Arrow records read from the dataframes. """ import pandas as pd @@ -1596,8 +1571,8 @@ def from_pandas( @DeveloperAPI def from_pandas_refs( dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]], -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a list of Ray object references to Pandas +) -> MaterializedDataset: + """Create a dataset from a list of Ray object references to Pandas dataframes. Args: @@ -1605,7 +1580,7 @@ def from_pandas_refs( Ray object references to pandas dataframes. Returns: - MaterializedDatastream holding Arrow records read from the dataframes. + MaterializedDataset holding Arrow records read from the dataframes. """ if isinstance(dfs, ray.ObjectRef): dfs = [dfs] @@ -1626,10 +1601,10 @@ def from_pandas_refs( if context.enable_pandas_block: get_metadata = cached_remote_fn(get_table_block_metadata) metadata = ray.get([get_metadata.remote(df) for df in dfs]) - return MaterializedDatastream( + return MaterializedDataset( ExecutionPlan( BlockList(dfs, metadata, owned_by_consumer=False), - DatastreamStats(stages={"FromPandasRefs": metadata}, parent=None), + DatasetStats(stages={"FromPandasRefs": metadata}, parent=None), run_by_consumer=False, ), 0, @@ -1642,10 +1617,10 @@ def from_pandas_refs( res = [df_to_block.remote(df) for df in dfs] blocks, metadata = map(list, zip(*res)) metadata = ray.get(metadata) - return MaterializedDatastream( + return MaterializedDataset( ExecutionPlan( BlockList(blocks, metadata, owned_by_consumer=False), - DatastreamStats(stages={"FromPandasRefs": metadata}, parent=None), + DatasetStats(stages={"FromPandasRefs": metadata}, parent=None), run_by_consumer=False, ), 0, @@ -1655,16 +1630,14 @@ def from_pandas_refs( @PublicAPI -def from_numpy( - ndarrays: Union[np.ndarray, List[np.ndarray]] -) -> 
MaterializedDatastream[TableRow]: - """Create a datastream from a list of NumPy ndarrays. +def from_numpy(ndarrays: Union[np.ndarray, List[np.ndarray]]) -> MaterializedDataset: + """Create a dataset from a list of NumPy ndarrays. Args: ndarrays: A NumPy ndarray or a list of NumPy ndarrays. Returns: - MaterializedDatastream holding the given ndarrays. + MaterializedDataset holding the given ndarrays. """ if isinstance(ndarrays, np.ndarray): ndarrays = [ndarrays] @@ -1675,15 +1648,15 @@ def from_numpy( @DeveloperAPI def from_numpy_refs( ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]], -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a list of NumPy ndarray futures. +) -> MaterializedDataset: + """Create a dataset from a list of NumPy ndarray futures. Args: ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray object references to NumPy ndarrays. Returns: - MaterializedDatastream holding the given ndarrays. + MaterializedDataset holding the given ndarrays. 
""" if isinstance(ndarrays, ray.ObjectRef): ndarrays = [ndarrays] @@ -1702,20 +1675,17 @@ def from_numpy_refs( ctx = DataContext.get_current() ndarray_to_block_remote = cached_remote_fn(ndarray_to_block, num_returns=2) - res = [ - ndarray_to_block_remote.remote(ndarray, strict_mode=ctx.strict_mode) - for ndarray in ndarrays - ] + res = [ndarray_to_block_remote.remote(ndarray, ctx) for ndarray in ndarrays] blocks, metadata = map(list, zip(*res)) metadata = ray.get(metadata) from_numpy_refs_op = FromNumpyRefs(ndarrays) logical_plan = LogicalPlan(from_numpy_refs_op) - return MaterializedDatastream( + return MaterializedDataset( ExecutionPlan( BlockList(blocks, metadata, owned_by_consumer=False), - DatastreamStats(stages={"FromNumpyRefs": metadata}, parent=None), + DatasetStats(stages={"FromNumpyRefs": metadata}, parent=None), run_by_consumer=False, ), 0, @@ -1727,15 +1697,15 @@ def from_numpy_refs( @PublicAPI def from_arrow( tables: Union["pyarrow.Table", bytes, List[Union["pyarrow.Table", bytes]]], -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a list of Arrow tables. +) -> MaterializedDataset: + """Create a dataset from a list of Arrow tables. Args: tables: An Arrow table, or a list of Arrow tables, or its streaming format in bytes. Returns: - MaterializedDatastream holding Arrow records from the tables. + MaterializedDataset holding Arrow records from the tables. """ import pyarrow as pa @@ -1750,15 +1720,15 @@ def from_arrow_refs( ObjectRef[Union["pyarrow.Table", bytes]], List[ObjectRef[Union["pyarrow.Table", bytes]]], ], -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a set of Arrow tables. +) -> MaterializedDataset: + """Create a dataset from a set of Arrow tables. Args: tables: A Ray object reference to Arrow table, or list of Ray object references to Arrow tables, or its streaming format in bytes. Returns: - MaterializedDatastream holding Arrow records from the tables. 
+ MaterializedDataset holding Arrow records from the tables. """ if isinstance(tables, ray.ObjectRef): tables = [tables] @@ -1767,10 +1737,10 @@ def from_arrow_refs( metadata = ray.get([get_metadata.remote(t) for t in tables]) logical_plan = LogicalPlan(FromArrowRefs(tables)) - return MaterializedDatastream( + return MaterializedDataset( ExecutionPlan( BlockList(tables, metadata, owned_by_consumer=False), - DatastreamStats(stages={"FromArrowRefs": metadata}, parent=None), + DatasetStats(stages={"FromArrowRefs": metadata}, parent=None), run_by_consumer=False, ), 0, @@ -1782,18 +1752,18 @@ def from_arrow_refs( @PublicAPI def from_spark( df: "pyspark.sql.DataFrame", *, parallelism: Optional[int] = None -) -> MaterializedDatastream[TableRow]: - """Create a datastream from a Spark dataframe. +) -> MaterializedDataset: + """Create a dataset from a Spark dataframe. Args: spark: A SparkSession, which must be created by RayDP (Spark-on-Ray). df: A Spark dataframe, which must be created by RayDP (Spark-on-Ray). - parallelism: The amount of parallelism to use for the datastream. + parallelism: The amount of parallelism to use for the dataset. If not provided, it will be equal to the number of partitions of the original Spark dataframe. Returns: - MaterializedDatastream holding Arrow records read from the dataframe. + MaterializedDataset holding Arrow records read from the dataframe. """ import raydp @@ -1803,26 +1773,56 @@ def from_spark( @PublicAPI def from_huggingface( dataset: Union["datasets.Dataset", "datasets.DatasetDict"], -) -> Union[ - MaterializedDatastream[TableRow], Dict[str, MaterializedDatastream[TableRow]] -]: - """Create a datastream from a Hugging Face Datasets Dataset. +) -> Union[MaterializedDataset, Dict[str, MaterializedDataset]]: + """Create a dataset from a Hugging Face Datasets Dataset. This function is not parallelized, and is intended to be used with Hugging Face Datasets that are loaded into memory (as opposed to memory-mapped). + Example: + + .. 
doctest:: + :options: +ELLIPSIS + + >>> import ray + >>> import datasets + >>> hf_dataset = datasets.load_dataset("tweet_eval", "emotion") + Downloading ... + >>> ray_ds = ray.data.from_huggingface(hf_dataset) + >>> ray_ds + {'train': MaterializedDataset( + num_blocks=1, + num_rows=3257, + schema={text: string, label: int64} + ), 'test': MaterializedDataset( + num_blocks=1, + num_rows=1421, + schema={text: string, label: int64} + ), 'validation': MaterializedDataset( + num_blocks=1, + num_rows=374, + schema={text: string, label: int64} + )} + >>> ray_ds = ray.data.from_huggingface(hf_dataset["train"]) + >>> ray_ds + MaterializedDataset( + num_blocks=1, + num_rows=3257, + schema={text: string, label: int64} + ) + Args: - dataset: A Hugging Face ``Dataset``, or ``DatasetDict``. - ``IterableDataset`` is not supported. + dataset: A Hugging Face Dataset, or DatasetDict. ``IterableDataset`` is + not supported. Returns: - MaterializedDatastream holding Arrow records from the Hugging Face Dataset, or a - dict of MaterializedDatastream in case ``dataset`` is a ``DatasetDict``. + Dataset holding Arrow records from the Hugging Face Dataset, or a dict of + datasets in case dataset is a DatasetDict. """ import datasets - def convert(ds: "datasets.Dataset") -> Datastream[TableRow]: + def convert(ds: "datasets.Dataset") -> Dataset: ray_ds = from_arrow(ds.data.table) logical_plan = LogicalPlan(FromHuggingFace(ds)) ray_ds._logical_plan = logical_plan @@ -1830,12 +1830,22 @@ def convert(ds: "datasets.Dataset") -> Datastream[TableRow]: return ray_ds if isinstance(dataset, datasets.DatasetDict): + available_keys = list(dataset.keys()) + logger.warning( + "You provided a Huggingface DatasetDict which contains multiple " + "datasets. The output of `from_huggingface` is a dictionary of Ray " + "Datasets. To convert just a single Huggingface Dataset to a " + "Ray Dataset, specify a split. 
For example, " + "`ray.data.from_huggingface(my_dataset_dictionary" + f"['{available_keys[0]}'])`. " + f"Available splits are {available_keys}." + ) return {k: convert(ds) for k, ds in dataset.items()} elif isinstance(dataset, datasets.Dataset): return convert(dataset) else: raise TypeError( - "`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`, " + "`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`. " f"got {type(dataset)}" ) @@ -1843,8 +1853,8 @@ def convert(ds: "datasets.Dataset") -> Datastream[TableRow]: @PublicAPI def from_tf( dataset: "tf.data.Dataset", -) -> MaterializedDatastream: - """Create a datastream from a TensorFlow dataset. +) -> MaterializedDataset: + """Create a dataset from a TensorFlow dataset. This function is inefficient. Use it to read small datasets or prototype. @@ -1863,7 +1873,7 @@ def from_tf( >>> dataset, _ = tfds.load('cifar10', split=["train", "test"]) # doctest: +SKIP >>> ds = ray.data.from_tf(dataset) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=50000, schema={id: binary, image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8), label: int64}) + Dataset(num_blocks=200, num_rows=50000, schema={id: binary, image: numpy.ndarray(shape=(32, 32, 3), dtype=uint8), label: int64}) >>> ds.take(1) # doctest: +SKIP [{'id': b'train_16399', 'image': array([[[143, 96, 70], [141, 96, 72], @@ -1887,7 +1897,7 @@ def from_tf( dataset: A TensorFlow dataset. Returns: - A :class:`MaterializedDatastream` that contains the samples stored in the + A :class:`MaterializedDataset` that contains the samples stored in the TensorFlow dataset. """ # noqa: E501 # FIXME: `as_numpy_iterator` errors if `dataset` contains ragged tensors. @@ -1897,8 +1907,8 @@ def from_tf( @PublicAPI def from_torch( dataset: "torch.utils.data.Dataset", -) -> MaterializedDatastream: - """Create a datastream from a Torch dataset. +) -> MaterializedDataset: + """Create a dataset from a Torch dataset. This function is inefficient. 
Use it to read small datasets or prototype. @@ -1917,15 +1927,15 @@ def from_torch( >>> dataset = datasets.MNIST("data", download=True) # doctest: +SKIP >>> ds = ray.data.from_torch(dataset) # doctest: +SKIP >>> ds # doctest: +SKIP - Datastream(num_blocks=200, num_rows=60000, schema=) + Dataset(num_blocks=200, num_rows=60000, schema={item: object}) >>> ds.take(1) # doctest: +SKIP - [(, 5)] + {"item": (, 5)} Args: dataset: A Torch dataset. Returns: - A :class:`MaterializedDatastream` containing the Torch dataset samples. + A :class:`MaterializedDataset` containing the Torch dataset samples. """ return from_items(list(dataset)) @@ -1942,7 +1952,7 @@ def _get_read_tasks( Args: ds: Datasource to read from. - ctx: Datastream config to use. + ctx: Dataset config to use. cur_pg: The current placement group, if any. parallelism: The user-requested parallelism, or -1 for autodetection. kwargs: Additional kwargs to pass to the reader. diff --git a/python/ray/data/row.py b/python/ray/data/row.py index c25ca4855643..b7c382736f27 100644 --- a/python/ray/data/row.py +++ b/python/ray/data/row.py @@ -1,13 +1,13 @@ from collections.abc import Mapping from typing import Any -from ray.util.annotations import PublicAPI +from ray.util.annotations import Deprecated -@PublicAPI +@Deprecated("TableRow is no longer part of the public Ray Data API.") class TableRow(Mapping): """ - A dict-like row of a tabular ``Datastream``. + A dict-like row of a tabular ``Dataset``. 
This implements the dictionary mapping interface, but provides more efficient access with less data copying than converting Arrow Tables diff --git a/python/ray/data/tests/block_batching/test_util.py b/python/ray/data/tests/block_batching/test_util.py index 67eeffa3e777..036331b0d272 100644 --- a/python/ray/data/tests/block_batching/test_util.py +++ b/python/ray/data/tests/block_batching/test_util.py @@ -1,3 +1,4 @@ +import threading import pytest import time @@ -7,6 +8,7 @@ import ray from ray.data._internal.block_batching.util import ( + Queue, _calculate_ref_hits, make_async_gen, blocks_to_batches, @@ -47,15 +49,15 @@ def test_blocks_to_batches(block_size, drop_last): full_batches = 0 leftover_batches = 0 - datastream_size = block_size * num_blocks + dataset_size = block_size * num_blocks for batch in batch_iter: if len(batch.data) == batch_size: full_batches += 1 - if len(batch.data) == (datastream_size % batch_size): + if len(batch.data) == (dataset_size % batch_size): leftover_batches += 1 assert leftover_batches == 1 - assert full_batches == (datastream_size // batch_size) + assert full_batches == (dataset_size // batch_size) assert [batch.batch_idx for batch in batch_iter] == list(range(len(batch_iter))) @@ -173,6 +175,86 @@ def sleep_udf(item): assert end_time - start_time < 9.5 +def test_make_async_gen_multiple_threads_unfinished(): + """Tests that using multiple threads can overlap compute even more. + Do not finish iteration with break in the middle. + """ + + num_items = 5 + + def gen(base_iterator): + for i in base_iterator: + time.sleep(4) + yield i + + def sleep_udf(item): + time.sleep(5) + return item + + # All 5 items should be fetched concurrently. + iterator = make_async_gen( + base_iterator=iter(range(num_items)), fn=gen, num_workers=5 + ) + + start_time = time.time() + + # Only sleep for first item. + sleep_udf(next(iterator)) + + # All subsequent items should already be prefetched and should be ready. 
+ for i, _ in enumerate(iterator): + if i > 2: + break + end_time = time.time() + + # 4 second for first item, 5 seconds for udf, 0.5 seconds buffer + assert end_time - start_time < 9.5 + + +def test_queue(): + queue = Queue(5) + num_producers = 10 + num_producers_finished = 0 + num_items = 20 + + def execute_computation(): + for item in range(num_items): + if queue.put(item): + # Return early when it's instructed to do so. + break + # Put -1 as indicator of thread being finished. + queue.put(-1) + + # Use separate threads as producers. + threads = [ + threading.Thread(target=execute_computation, daemon=True) + for _ in range(num_producers) + ] + + for thread in threads: + thread.start() + + for i in range(num_producers * num_items): + item = queue.get() + if item == -1: + num_producers_finished += 1 + if i > num_producers * num_items / 2: + num_producers_alive = num_producers - num_producers_finished + # Check there are some alive producers. + assert num_producers_alive > 0, num_producers_alive + # Release the alive producers. + queue.release(num_producers_alive) + # Consume the remaining items in queue. + while queue.qsize() > 0: + queue.get() + break + + # Sleep 5 seconds to allow producer threads to exit. + time.sleep(5) + # Then check the queue is still empty. 
+ assert queue.qsize() == 0 + + def test_calculate_ref_hits(ray_start_regular_shared): refs = [ray.put(0), ray.put(1)] hits, misses, unknowns = _calculate_ref_hits(refs) diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index d5bb43a57b6b..2b0eebdace36 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -36,6 +36,22 @@ def ray_start_10_cpus_shared(request): yield res +@pytest.fixture(scope="module") +def enable_strict_mode(): + ctx = ray.data.DataContext.get_current() + ctx.strict_mode = True + yield + ctx.strict_mode = False + + +@pytest.fixture(scope="module") +def enable_nonstrict_mode(): + ctx = ray.data.DataContext.get_current() + ctx.strict_mode = False + yield + ctx.strict_mode = True + + @pytest.fixture(scope="function") def aws_credentials(): import os @@ -157,15 +173,14 @@ def _get_write_path_for_block( base_path, *, filesystem=None, - datastream_uuid=None, + dataset_uuid=None, block=None, block_index=None, file_format=None, ): num_rows = BlockAccessor.for_block(block).num_rows() suffix = ( - f"{block_index:06}_{num_rows:02}_{datastream_uuid}" - f".test.{file_format}" + f"{block_index:06}_{num_rows:02}_{dataset_uuid}" f".test.{file_format}" ) return posixpath.join(base_path, suffix) @@ -253,7 +268,7 @@ def _assert_base_partitioned_ds( actual_input_files = ds.input_files() assert len(actual_input_files) == num_input_files, actual_input_files - # For Datastreams with long string representations, the format will include + # For Datasets with long string representations, the format will include # whitespace and newline characters, which is difficult to generalize # without implementing the formatting logic again (from # `ExecutionPlan.get_plan_as_string()`). 
Therefore, we remove whitespace @@ -263,12 +278,12 @@ def _remove_whitespace(ds_str): ds_str = ds_str.replace(c, "") return ds_str - assert "Datastream(num_blocks={},num_rows={},schema={})".format( + assert "Dataset(num_blocks={},num_rows={},schema={})".format( num_input_files, num_rows, _remove_whitespace(schema), ) == _remove_whitespace(str(ds)), ds - assert "Datastream(num_blocks={},num_rows={},schema={})".format( + assert "Dataset(num_blocks={},num_rows={},schema={})".format( num_input_files, num_rows, _remove_whitespace(schema), @@ -370,7 +385,7 @@ def enable_streaming_executor(): ctx.use_streaming_executor = use_streaming_executor -# ===== Pandas datastream formats ===== +# ===== Pandas dataset formats ===== @pytest.fixture(scope="function") def ds_pandas_single_column_format(ray_start_regular_shared): in_df = pd.DataFrame({"column_1": [1, 2, 3, 4]}) @@ -389,7 +404,7 @@ def ds_pandas_list_multi_column_format(ray_start_regular_shared): yield ray.data.from_pandas([in_df] * 4) -# ===== Arrow datastream formats ===== +# ===== Arrow dataset formats ===== @pytest.fixture(scope="function") def ds_arrow_single_column_format(ray_start_regular_shared): yield ray.data.from_arrow(pa.table({"column_1": [1, 2, 3, 4]})) @@ -425,7 +440,7 @@ def ds_list_arrow_multi_column_format(ray_start_regular_shared): yield ray.data.from_arrow([pa.table({"column_1": [1], "column_2": [1]})] * 4) -# ===== Numpy datastream formats ===== +# ===== Numpy dataset formats ===== @pytest.fixture(scope="function") def ds_numpy_single_column_tensor_format(ray_start_regular_shared): yield ray.data.from_numpy(np.arange(16).reshape((4, 2, 2))) diff --git a/python/ray/data/tests/mock_server.py b/python/ray/data/tests/mock_server.py index c2a81c2b5546..42dffd7f0690 100644 --- a/python/ray/data/tests/mock_server.py +++ b/python/ray/data/tests/mock_server.py @@ -67,7 +67,7 @@ def stop_process(process): # being unreachable). 
This appears to only be an issue when using the tmp_dir # fixture as the S3 dir path. We should fix this since "session" scope should # reduce a lot of the per-test overhead (2x faster execution for IO methods in -# test_datastream.py). +# test_dataset.py). @pytest.fixture(scope="function") def s3_server(): host = "localhost" diff --git a/rllib/algorithms/appo/tests/tf/__init__.py b/python/ray/data/tests/preprocessors/__init__.py similarity index 100% rename from rllib/algorithms/appo/tests/tf/__init__.py rename to python/ray/data/tests/preprocessors/__init__.py diff --git a/python/ray/data/tests/preprocessors/test_batch_mapper.py b/python/ray/data/tests/preprocessors/test_batch_mapper.py index 3279ff9e835d..915a236c736a 100644 --- a/python/ray/data/tests/preprocessors/test_batch_mapper.py +++ b/python/ray/data/tests/preprocessors/test_batch_mapper.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from typing import Dict import numpy as np import pandas as pd @@ -7,7 +7,6 @@ from pytest_lazyfixture import lazy_fixture import ray -from ray.air.constants import TENSOR_COLUMN_NAME from ray.data.preprocessors import BatchMapper from ray.tests.conftest import * # noqa @@ -78,26 +77,16 @@ def add_and_modify_udf(df: "pd.DataFrame"): def test_batch_mapper_pandas_data_format( ray_start_regular_shared, ds, expected_df, expected_numpy_df ): - """Tests batch mapper functionality for pandas data format. - - Note: - For single column pandas dataframes, we automatically convert it to - single column tensor with column name as `__value__`. 
- """ - def add_and_modify_udf_pandas(df: "pd.DataFrame"): df["column_1"] = df["column_1"] + 1 if "column_2" in df: df["column_2"] *= 2 return df - def add_and_modify_udf_numpy(data: Union[np.ndarray, Dict[str, np.ndarray]]): - if isinstance(data, np.ndarray): - data += 1 - else: - data["column_1"] = data["column_1"] + 1 - if "column_2" in data: - data["column_2"] *= 2 + def add_and_modify_udf_numpy(data: Dict[str, np.ndarray]): + data["column_1"] = data["column_1"] + 1 + if "column_2" in data: + data["column_2"] *= 2 return data # Test map_batches @@ -172,29 +161,6 @@ def check_batch_size(batch): } ), ), - ( - lazy_fixture("ds_arrow_single_column_tensor_format"), - pd.DataFrame( - { - TENSOR_COLUMN_NAME: [ - [[1, 2], [3, 4]], - [[5, 6], [7, 8]], - [[9, 10], [11, 12]], - [[13, 14], [15, 16]], - ] - } - ), - pd.DataFrame( - { - TENSOR_COLUMN_NAME: [ - [[1, 2], [3, 4]], - [[5, 6], [7, 8]], - [[9, 10], [11, 12]], - [[13, 14], [15, 16]], - ] - } - ), - ), ( lazy_fixture("ds_arrow_multi_column_format"), pd.DataFrame( @@ -231,13 +197,10 @@ def add_and_modify_udf_pandas(df: "pd.DataFrame"): df["column_2"] *= 2 return df - def add_and_modify_udf_numpy(data: Union[np.ndarray, Dict[str, np.ndarray]]): - if isinstance(data, np.ndarray): - data = data + 1 - else: - data["column_1"] = data["column_1"] + 1 - if "column_2" in data: - data["column_2"] = data["column_2"] * 2 + def add_and_modify_udf_numpy(data: Dict[str, np.ndarray]): + data["column_1"] = data["column_1"] + 1 + if "column_2" in data: + data["column_2"] = data["column_2"] * 2 return data # Test map_batches @@ -270,7 +233,7 @@ def add_and_modify_udf_numpy(data: Union[np.ndarray, Dict[str, np.ndarray]]): lazy_fixture("ds_numpy_single_column_tensor_format"), pd.DataFrame( { - TENSOR_COLUMN_NAME: [ + "data": [ [[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]], @@ -281,25 +244,18 @@ def add_and_modify_udf_numpy(data: Union[np.ndarray, Dict[str, np.ndarray]]): ), ( 
lazy_fixture("ds_numpy_list_of_ndarray_tensor_format"), - pd.DataFrame({TENSOR_COLUMN_NAME: [[[1, 2], [3, 4]]] * 4}), + pd.DataFrame({"data": [[[1, 2], [3, 4]]] * 4}), ), ], ) def test_batch_mapper_numpy_data_format(ds, expected_df): - """Tests batch mapper functionality for numpy data format. - - Note: - For single column pandas dataframes, we automatically convert it to - single column tensor with column name as `__value__`. - """ - def add_and_modify_udf_pandas(df: "pd.DataFrame"): col_name = list(df.columns)[0] df[col_name] = df[col_name] + 1 return df - def add_and_modify_udf_numpy(data: Union[np.ndarray, Dict[str, np.ndarray]]): - data = data + 1 + def add_and_modify_udf_numpy(data: Dict[str, np.ndarray]): + data["data"] = data["data"] + 1 return data # Test map_batches diff --git a/python/ray/data/tests/preprocessors/test_preprocessors.py b/python/ray/data/tests/preprocessors/test_preprocessors.py index ed8ebf763260..3fa4c8a27d81 100644 --- a/python/ray/data/tests/preprocessors/test_preprocessors.py +++ b/python/ray/data/tests/preprocessors/test_preprocessors.py @@ -129,7 +129,7 @@ def test_fit_twice(mocked_warn): scaler.fit(ds) assert scaler.stats_ == {"min(B)": 1, "max(B)": 5, "min(C)": 1, "max(C)": 1} - ds = ds.map_batches(lambda x: x * 2) + ds = ds.map_batches(lambda x: {k: v * 2 for k, v in x.items()}) # Fit again scaler.fit(ds) # Assert that the fitted state is corresponding to the second ds. 
@@ -182,12 +182,12 @@ def _determine_transform_to_use(self): def test_pipeline_fail(): - ds = ray.data.range_table(5).window(blocks_per_window=1).repeat(1) + ds = ray.data.range(5).window(blocks_per_window=1).repeat(1) class FittablePreprocessor(Preprocessor): _is_fittable = True - def _fit(self, datastream): + def _fit(self, dataset): self.fitted_ = True return self @@ -234,7 +234,7 @@ def test_transform_all_formats(create_dummy_preprocessors, pipeline, dataset_for if pipeline: patcher = patch.object(ray.data.dataset_pipeline.DatasetPipeline, "map_batches") else: - patcher = patch.object(ray.data.datastream.Datastream, "map_batches") + patcher = patch.object(ray.data.dataset.Dataset, "map_batches") with patcher as mock_map_batches: _apply_transform(with_pandas, ds) @@ -263,7 +263,7 @@ def test_transform_all_formats(create_dummy_preprocessors, pipeline, dataset_for def test_numpy_pandas_support_transform_batch_wrong_format(create_dummy_preprocessors): - # Case 1: simple datastream. No support + # Case 1: simple dataset. 
No support ( with_nothing, with_pandas, @@ -290,7 +290,7 @@ def test_numpy_pandas_support_transform_batch_wrong_format(create_dummy_preproce def test_numpy_pandas_support_transform_batch_pandas(create_dummy_preprocessors): - # Case 2: pandas datastream + # Case 2: pandas dataset ( with_nothing, with_pandas, @@ -328,7 +328,7 @@ def test_numpy_pandas_support_transform_batch_pandas(create_dummy_preprocessors) def test_numpy_pandas_support_transform_batch_arrow(create_dummy_preprocessors): - # Case 3: arrow datastream + # Case 3: arrow dataset ( with_nothing, with_pandas, @@ -371,7 +371,7 @@ def test_numpy_pandas_support_transform_batch_arrow(create_dummy_preprocessors): def test_numpy_pandas_support_transform_batch_tensor(create_dummy_preprocessors): - # Case 4: tensor datastream created by from numpy data directly + # Case 4: tensor dataset created by from numpy data directly ( with_nothing, with_pandas, diff --git a/python/ray/data/tests/preprocessors/test_torch.py b/python/ray/data/tests/preprocessors/test_torch.py index 78455bc6753f..cf514f5cc11e 100644 --- a/python/ray/data/tests/preprocessors/test_torch.py +++ b/python/ray/data/tests/preprocessors/test_torch.py @@ -19,9 +19,9 @@ def __repr__(self): preprocessor = TorchVisionPreprocessor( columns=["spam"], transform=StubTransform() ) - assert ( - repr(preprocessor) - == "TorchVisionPreprocessor(columns=['spam'], transform=StubTransform())" + assert repr(preprocessor) == ( + "TorchVisionPreprocessor(columns=['spam'], " + "output_columns=['spam'], transform=StubTransform())" ) @pytest.mark.parametrize( @@ -32,7 +32,7 @@ def __repr__(self): ], ) def test_transform_images(self, transform): - datastream = ray.data.from_items( + dataset = ray.data.from_items( [ {"image": np.zeros((32, 32, 3)), "label": 0}, {"image": np.zeros((32, 32, 3)), "label": 1}, @@ -40,19 +40,19 @@ def test_transform_images(self, transform): ) preprocessor = TorchVisionPreprocessor(columns=["image"], transform=transform) - transformed_datastream 
= preprocessor.transform(datastream) + transformed_dataset = preprocessor.transform(dataset) - assert transformed_datastream.schema().names == ["image", "label"] + assert transformed_dataset.schema().names == ["image", "label"] transformed_images = [ - record["image"] for record in transformed_datastream.take_all() + record["image"] for record in transformed_dataset.take_all() ] assert all(image.shape == (3, 32, 32) for image in transformed_images) assert all(image.dtype == np.double for image in transformed_images) - labels = {record["label"] for record in transformed_datastream.take_all()} + labels = {record["label"] for record in transformed_dataset.take_all()} assert labels == {0, 1} def test_batch_transform_images(self): - datastream = ray.data.from_items( + dataset = ray.data.from_items( [ {"image": np.zeros((32, 32, 3)), "label": 0}, {"image": np.zeros((32, 32, 3)), "label": 1}, @@ -70,19 +70,19 @@ def test_batch_transform_images(self): columns=["image"], transform=transform, batched=True ) - transformed_datastream = preprocessor.transform(datastream) + transformed_dataset = preprocessor.transform(dataset) - assert transformed_datastream.schema().names == ["image", "label"] + assert transformed_dataset.schema().names == ["image", "label"] transformed_images = [ - record["image"] for record in transformed_datastream.take_all() + record["image"] for record in transformed_dataset.take_all() ] assert all(image.shape == (3, 64, 64) for image in transformed_images) assert all(image.dtype == np.double for image in transformed_images) - labels = {record["label"] for record in transformed_datastream.take_all()} + labels = {record["label"] for record in transformed_dataset.take_all()} assert labels == {0, 1} def test_transform_ragged_images(self): - datastream = ray.data.from_items( + dataset = ray.data.from_items( [ {"image": np.zeros((16, 16, 3)), "label": 0}, {"image": np.zeros((32, 32, 3)), "label": 1}, @@ -91,34 +91,32 @@ def test_transform_ragged_images(self): 
transform = transforms.ToTensor() preprocessor = TorchVisionPreprocessor(columns=["image"], transform=transform) - transformed_datastream = preprocessor.transform(datastream) + transformed_dataset = preprocessor.transform(dataset) - assert transformed_datastream.schema().names == ["image", "label"] + assert transformed_dataset.schema().names == ["image", "label"] transformed_images = [ - record["image"] for record in transformed_datastream.take_all() + record["image"] for record in transformed_dataset.take_all() ] assert sorted(image.shape for image in transformed_images) == [ (3, 16, 16), (3, 32, 32), ] assert all(image.dtype == np.double for image in transformed_images) - labels = {record["label"] for record in transformed_datastream.take_all()} + labels = {record["label"] for record in transformed_dataset.take_all()} assert labels == {0, 1} def test_invalid_transform_raises_value_error(self): - datastream = ray.data.from_items( + dataset = ray.data.from_items( [ {"image": np.zeros((32, 32, 3)), "label": 0}, {"image": np.zeros((32, 32, 3)), "label": 1}, ] ) - # `TorchVisionPreprocessor` expects transforms to return `torch.Tensor`s, but - # this `transform` returns a `np.ndarray`. 
- transform = transforms.Lambda(lambda tensor: tensor.numpy()) + transform = transforms.Lambda(lambda tensor: "BLAH BLAH INVALID") preprocessor = TorchVisionPreprocessor(columns=["image"], transform=transform) with pytest.raises(ValueError): - preprocessor.transform(datastream).materialize() + preprocessor.transform(dataset).materialize() if __name__ == "__main__": diff --git a/python/ray/data/tests/test_all_to_all.py b/python/ray/data/tests/test_all_to_all.py index c98021da8b13..625e389f5c9f 100644 --- a/python/ray/data/tests/test_all_to_all.py +++ b/python/ray/data/tests/test_all_to_all.py @@ -9,18 +9,21 @@ import pytest import ray -from ray.data.aggregate import AggregateFn, Count, Max, Mean, Min, Std, Sum +from ray.data.aggregate import AggregateFn, Count, Max, Mean, Min, Std, Sum, Quantile from ray.data.context import DataContext from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import column_udf, named_values, STRICT_MODE from ray.tests.conftest import * # noqa def test_zip(ray_start_regular_shared): ds1 = ray.data.range(5, parallelism=5) - ds2 = ray.data.range(5, parallelism=5).map(lambda x: x + 1) + ds2 = ray.data.range(5, parallelism=5).map(column_udf("id", lambda x: x + 1)) ds = ds1.zip(ds2) - assert ds.schema() == tuple - assert ds.take() == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)] + assert ds.schema().names == ["id", "id_1"] + assert ds.take() == named_values( + ["id", "id_1"], [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)] + ) with pytest.raises(ValueError): ds.zip(ray.data.range(3)).materialize() @@ -34,10 +37,14 @@ def test_zip_different_num_blocks_combinations( ): n = 12 ds1 = ray.data.range(n, parallelism=num_blocks1) - ds2 = ray.data.range(n, parallelism=num_blocks2).map(lambda x: x + 1) + ds2 = ray.data.range(n, parallelism=num_blocks2).map( + column_udf("id", lambda x: x + 1) + ) ds = ds1.zip(ds2) - assert ds.schema() == tuple - assert ds.take() == list(zip(range(n), range(1, n + 1))) + assert ds.schema().names == ["id", "id_1"] + 
assert ds.take() == named_values( + ["id", "id_1"], list(zip(range(n), range(1, n + 1))) + ) @pytest.mark.parametrize( @@ -81,33 +88,31 @@ def test_zip_pandas(ray_start_regular_shared): ds = ds1.zip(ds2) assert ds.count() == 2 assert "{col1: int64, col2: int64, col3: object, col4: object}" in str(ds) - result = [r.as_pydict() for r in ds.take()] + result = list(ds.take()) assert result[0] == {"col1": 1, "col2": 4, "col3": "a", "col4": "d"} ds3 = ray.data.from_pandas(pd.DataFrame({"col2": ["a", "b"], "col4": ["d", "e"]})) ds = ds1.zip(ds3) assert ds.count() == 2 assert "{col1: int64, col2: int64, col2_1: object, col4: object}" in str(ds) - result = [r.as_pydict() for r in ds.take()] + result = list(ds.take()) assert result[0] == {"col1": 1, "col2": 4, "col2_1": "a", "col4": "d"} def test_zip_arrow(ray_start_regular_shared): - ds1 = ray.data.range_table(5).map(lambda r: {"id": r["value"]}) - ds2 = ray.data.range_table(5).map( - lambda r: {"a": r["value"] + 1, "b": r["value"] + 2} - ) + ds1 = ray.data.range(5).map(lambda r: {"id": r["id"]}) + ds2 = ray.data.range(5).map(lambda r: {"a": r["id"] + 1, "b": r["id"] + 2}) ds = ds1.zip(ds2) assert ds.count() == 5 assert "{id: int64, a: int64, b: int64}" in str(ds) - result = [r.as_pydict() for r in ds.take()] + result = list(ds.take()) assert result[0] == {"id": 0, "a": 1, "b": 2} # Test duplicate column names. 
ds = ds1.zip(ds1).zip(ds1) assert ds.count() == 5 assert "{id: int64, id_1: int64, id_2: int64}" in str(ds) - result = [r.as_pydict() for r in ds.take()] + result = list(ds.take()) assert result[0] == {"id": 0, "id_1": 0, "id_2": 0} @@ -115,7 +120,7 @@ def test_zip_preserve_order(ray_start_regular_shared): def foo(x): import time - if x[0] < 5: + if x["item"] < 5: time.sleep(1) return x @@ -125,7 +130,9 @@ def foo(x): ds2 = ray.data.from_items(items, parallelism=num_items) ds2 = ds2.map_batches(foo, batch_size=1) result = ds1.zip(ds2).take_all() - assert result == list(zip(range(num_items), range(num_items))), result + assert result == named_values( + ["item", "item_1"], list(zip(range(num_items), range(num_items))) + ), result def test_empty_shuffle(ray_start_regular_shared): @@ -177,8 +184,6 @@ def test_repartition_noshuffle(ray_start_regular_shared): ds4 = ds.repartition(40, shuffle=False) assert ds4.num_blocks() == 40 - blocks = ray.get(ds4.get_internal_block_refs()) - assert all(isinstance(block, list) for block in blocks), blocks assert ds4.sum() == 190 assert ds4._block_num_rows() == [1] * 20 + [0] * 20 @@ -192,7 +197,7 @@ def test_repartition_noshuffle(ray_start_regular_shared): def test_repartition_shuffle_arrow(ray_start_regular_shared): - ds = ray.data.range_table(20, parallelism=10) + ds = ray.data.range(20, parallelism=10) assert ds.num_blocks() == 10 assert ds.count() == 20 assert ds._block_num_rows() == [2] * 10 @@ -207,52 +212,36 @@ def test_repartition_shuffle_arrow(ray_start_regular_shared): assert ds3.count() == 20 assert ds3._block_num_rows() == [2] * 10 + [0] * 10 - large = ray.data.range_table(10000, parallelism=10) + large = ray.data.range(10000, parallelism=10) large = large.repartition(20, shuffle=True) assert large._block_num_rows() == [500] * 20 -def test_grouped_datastream_repr(ray_start_regular_shared): +def test_grouped_dataset_repr(ray_start_regular_shared): ds = ray.data.from_items([{"key": "spam"}, {"key": "ham"}, {"key": "spam"}]) 
- assert repr(ds.groupby("key")) == f"GroupedData(datastream={ds!r}, key='key')" + assert repr(ds.groupby("key")) == f"GroupedData(dataset={ds!r}, key='key')" def test_groupby_arrow(ray_start_regular_shared, use_push_based_shuffle): - # Test empty datastream. - agg_ds = ( - ray.data.range_table(10) - .filter(lambda r: r["value"] > 10) - .groupby("value") - .count() - ) + # Test empty dataset. + agg_ds = ray.data.range(10).filter(lambda r: r["id"] > 10).groupby("value").count() assert agg_ds.count() == 0 def test_groupby_errors(ray_start_regular_shared): ds = ray.data.range(100) - - ds.groupby(None).count().show() # OK - ds.groupby(lambda x: x % 2).count().show() # OK - with pytest.raises(ValueError): - ds.groupby("foo").count().show() - - ds = ray.data.range_table(100) ds.groupby(None).count().show() # OK with pytest.raises(ValueError): ds.groupby(lambda x: x % 2).count().show() + with pytest.raises(ValueError): + ds.groupby("foo").count().show() def test_agg_errors(ray_start_regular_shared): - ds = ray.data.range(100) from ray.data.aggregate import Max - ds.aggregate(Max()) # OK - ds.aggregate(Max(lambda x: x)) # OK - with pytest.raises(ValueError): - ds.aggregate(Max("foo")) - - ds = ray.data.range_table(100) - ds.aggregate(Max("value")) # OK + ds = ray.data.range(100) + ds.aggregate(Max("id")) # OK with pytest.raises(ValueError): ds.aggregate(Max()) with pytest.raises(ValueError): @@ -287,7 +276,7 @@ def test_groupby_agg_name_conflict(ray_start_regular_shared, num_parts): ), ) assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "foo": 49.5, "foo_2": 49.5}, {"A": 1, "foo": 49.0, "foo_2": 49.0}, {"A": 2, "foo": 50.0, "foo_2": 50.0}, @@ -316,7 +305,7 @@ def _to_pandas(ds): ds = _to_pandas(ds) agg_ds = ds.groupby("A").count() assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert 
list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "count()": 34}, {"A": 1, "count()": 33}, {"A": 2, "count()": 33}, @@ -346,7 +335,7 @@ def _to_pandas(ds): agg_ds = ds.groupby("A").sum("B") assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "sum(B)": 1683}, {"A": 1, "sum(B)": 1617}, {"A": 2, "sum(B)": 1650}, @@ -361,7 +350,7 @@ def _to_pandas(ds): nan_grouped_ds = ds.groupby("A") nan_agg_ds = nan_grouped_ds.sum("B") assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + assert list(nan_agg_ds.sort("A").iter_rows()) == [ {"A": 0, "sum(B)": 1683}, {"A": 1, "sum(B)": 1617}, {"A": 2, "sum(B)": 1650}, @@ -416,11 +405,11 @@ def _to_pandas(ds): ds = _to_pandas(ds) assert ds.sum("A") == 4950 - # Test empty datastream - ds = ray.data.range_table(10) + # Test empty dataset + ds = ray.data.range(10) if ds_format == "pandas": ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).sum("value") is None + assert ds.filter(lambda r: r["id"] > 10).sum("id") is None # Test built-in global sum aggregation with nans nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( @@ -460,7 +449,7 @@ def _to_pandas(ds): agg_ds = ds.groupby("A").min("B") assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "min(B)": 0}, {"A": 1, "min(B)": 1}, {"A": 2, "min(B)": 2}, @@ -475,7 +464,7 @@ def _to_pandas(ds): nan_grouped_ds = ds.groupby("A") nan_agg_ds = nan_grouped_ds.min("B") assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + assert list(nan_agg_ds.sort("A").iter_rows()) == [ {"A": 0, "min(B)": 0}, {"A": 1, "min(B)": 1}, {"A": 2, "min(B)": 2}, @@ -534,7 +523,7 @@ def _to_pandas(ds): agg_ds = ds.groupby("A").max("B") assert agg_ds.count() == 3 - 
assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "max(B)": 99}, {"A": 1, "max(B)": 97}, {"A": 2, "max(B)": 98}, @@ -549,7 +538,7 @@ def _to_pandas(ds): nan_grouped_ds = ds.groupby("A") nan_agg_ds = nan_grouped_ds.max("B") assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + assert list(nan_agg_ds.sort("A").iter_rows()) == [ {"A": 0, "max(B)": 99}, {"A": 1, "max(B)": 97}, {"A": 2, "max(B)": 98}, @@ -608,7 +597,7 @@ def _to_pandas(ds): agg_ds = ds.groupby("A").mean("B") assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "mean(B)": 49.5}, {"A": 1, "mean(B)": 49.0}, {"A": 2, "mean(B)": 50.0}, @@ -623,7 +612,7 @@ def _to_pandas(ds): nan_grouped_ds = ds.groupby("A") nan_agg_ds = nan_grouped_ds.mean("B") assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + assert list(nan_agg_ds.sort("A").iter_rows()) == [ {"A": 0, "mean(B)": 49.5}, {"A": 1, "mean(B)": 49.0}, {"A": 2, "mean(B)": 50.0}, @@ -736,7 +725,7 @@ def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts): ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean(["B", "C"]) ) assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, @@ -746,7 +735,7 @@ def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts): # groupby keys. 
agg_ds = ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean() assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + assert list(agg_ds.sort("A").iter_rows()) == [ {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, @@ -764,9 +753,9 @@ def test_groupby_agg_bad_on(ray_start_regular_shared): xs = list(range(100)) df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs, "C": [2 * x for x in xs]}) # Wrong type. - with pytest.raises(TypeError): + with pytest.raises(Exception): ray.data.from_pandas(df).groupby("A").mean(5).materialize() - with pytest.raises(TypeError): + with pytest.raises(Exception): ray.data.from_pandas(df).groupby("A").mean([5]).materialize() # Empty list. with pytest.raises(ValueError): @@ -776,15 +765,15 @@ def test_groupby_agg_bad_on(ray_start_regular_shared): ray.data.from_pandas(df).groupby("A").mean("D").materialize() with pytest.raises(ValueError): ray.data.from_pandas(df).groupby("A").mean(["B", "D"]).materialize() - # Columns for simple Datastream. + # Columns for simple Dataset. with pytest.raises(ValueError): ray.data.from_items(xs).groupby(lambda x: x % 3 == 0).mean("A").materialize() # Test bad on for global aggregation # Wrong type. - with pytest.raises(TypeError): + with pytest.raises(Exception): ray.data.from_pandas(df).mean(5).materialize() - with pytest.raises(TypeError): + with pytest.raises(Exception): ray.data.from_pandas(df).mean([5]).materialize() # Empty list. with pytest.raises(ValueError): @@ -794,7 +783,7 @@ def test_groupby_agg_bad_on(ray_start_regular_shared): ray.data.from_pandas(df).mean("D").materialize() with pytest.raises(ValueError): ray.data.from_pandas(df).mean(["B", "D"]).materialize() - # Columns for simple Datastream. + # Columns for simple Dataset. 
with pytest.raises(ValueError): ray.data.from_items(xs).mean("A").materialize() @@ -818,13 +807,14 @@ def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts): Max("B"), Mean("B"), Std("B"), + Quantile("B"), ) ) assert agg_ds.count() == 3 agg_df = agg_ds.to_pandas() expected_grouped = df.groupby("A")["B"] np.testing.assert_array_equal(agg_df["count()"].to_numpy(), [34, 33, 33]) - for agg in ["sum", "min", "max", "mean", "std"]: + for agg in ["sum", "min", "max", "mean", "quantile", "std"]: result = agg_df[f"{agg}(B)"].to_numpy() expected = getattr(expected_grouped, agg)().to_numpy() if agg == "std": @@ -843,9 +833,10 @@ def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts): Max("A"), Mean("A"), Std("A"), + Quantile("A"), ) ) - for agg in ["sum", "min", "max", "mean", "std"]: + for agg in ["sum", "min", "max", "mean", "quantile", "std"]: result = result_row[f"{agg}(A)"] expected = getattr(df["A"], agg)() if agg == "std": @@ -854,6 +845,66 @@ def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts): assert result == expected +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_arrow_multi_agg_alias(ray_start_regular_shared, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_arrow_multi_agg with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + agg_ds = ( + ray.data.from_pandas(df) + .repartition(num_parts) + .groupby("A") + .aggregate( + Sum("B", alias_name="sum_b"), + Min("B", alias_name="min_b"), + Max("B", alias_name="max_b"), + Mean("B", alias_name="mean_b"), + Std("B", alias_name="std_b"), + Quantile("B", alias_name="quantile_b"), + ) + ) + + agg_df = agg_ds.to_pandas() + expected_grouped = df.groupby("A")["B"] + for agg in ["sum", "min", "max", "mean", "quantile", "std"]: + result = agg_df[f"{agg}_b"].to_numpy() + print(agg) + print(result) + expected = getattr(expected_grouped, agg)().to_numpy() + 
print(expected) + if agg == "std": + np.testing.assert_array_almost_equal(result, expected) + else: + np.testing.assert_array_equal(result, expected) + # Test built-in global std aggregation + df = pd.DataFrame({"A": xs}) + result_row = ( + ray.data.from_pandas(df) + .repartition(num_parts) + .aggregate( + Sum("A", alias_name="sum_b"), + Min("A", alias_name="min_b"), + Max("A", alias_name="max_b"), + Mean("A", alias_name="mean_b"), + Std("A", alias_name="std_b"), + Quantile("A", alias_name="quantile_b"), + ) + ) + for agg in ["sum", "min", "max", "mean", "quantile", "std"]: + result = result_row[f"{agg}_b"] + print(result) + expected = getattr(df["A"], agg)() + print(expected) + if agg == "std": + assert math.isclose(result, expected) + else: + assert result == expected + + +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") def test_groupby_simple(ray_start_regular_shared): seed = int(time.time()) print(f"Seeding RNG for test_groupby_simple with: {seed}") @@ -905,7 +956,7 @@ def test_groupby_simple(ray_start_regular_shared): ("None", 3), ] - # Test empty datastream. + # Test empty dataset. 
ds = ray.data.from_items([]) agg_ds = ds.groupby(lambda r: r[0]).aggregate( AggregateFn( @@ -921,6 +972,7 @@ def test_groupby_simple(ray_start_regular_shared): assert agg_ds.count() == 0 +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_count(ray_start_regular_shared, num_parts): # Test built-in count aggregation @@ -936,6 +988,7 @@ def test_groupby_simple_count(ray_start_regular_shared, num_parts): assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 34), (1, 33), (2, 33)] +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_sum(ray_start_regular_shared, num_parts): # Test built-in sum aggregation @@ -995,13 +1048,15 @@ def test_groupby_simple_sum(ray_start_regular_shared, num_parts): assert nan_ds.sum() is None -def test_groupby_map_groups_for_empty_datastream(ray_start_regular_shared): +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") +def test_groupby_map_groups_for_empty_dataset(ray_start_regular_shared): ds = ray.data.from_items([]) mapped = ds.groupby(lambda x: x % 3).map_groups(lambda x: [min(x) * min(x)]) assert mapped.count() == 0 assert mapped.take_all() == [] +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") def test_groupby_map_groups_merging_empty_result(ray_start_regular_shared): ds = ray.data.from_items([1, 2, 3]) # This needs to merge empty and non-empty results from different groups. 
@@ -1010,6 +1065,7 @@ def test_groupby_map_groups_merging_empty_result(ray_start_regular_shared): assert mapped.take_all() == [2, 3] +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") def test_groupby_map_groups_merging_invalid_result(ray_start_regular_shared): ds = ray.data.from_items([1, 2, 3]) grouped = ds.groupby(lambda x: x) @@ -1023,12 +1079,15 @@ def test_groupby_map_groups_merging_invalid_result(ray_start_regular_shared): def test_groupby_map_groups_for_none_groupkey(ray_start_regular_shared, num_parts): ds = ray.data.from_items(list(range(100))) mapped = ( - ds.repartition(num_parts).groupby(None).map_groups(lambda x: [min(x) + max(x)]) + ds.repartition(num_parts) + .groupby(None) + .map_groups(lambda x: {"out": np.array([min(x["item"]) + max(x["item"])])}) ) assert mapped.count() == 1 - assert mapped.take_all() == [99] + assert mapped.take_all() == named_values("out", [99]) +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 2, 30]) def test_groupby_map_groups_returning_empty_result(ray_start_regular_shared, num_parts): xs = list(range(100)) @@ -1053,6 +1112,7 @@ def test_groupby_map_groups_perf(ray_start_regular_shared): assert end - start < 60 +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) def test_groupby_map_groups_for_list(ray_start_regular_shared, num_parts): seed = int(time.time()) @@ -1149,12 +1209,14 @@ def test_groupby_map_groups_with_different_types(ray_start_regular_shared): def func(group): # Test output type is Python list, different from input type. 
- return [group["value"][0]] + value = int(group["value"][0]) + return {"out": np.array([value])} ds = ds.groupby("group").map_groups(func) - assert sorted(ds.take()) == [1, 3] + assert sorted([x["out"] for x in ds.take()]) == [1, 3] +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_min(ray_start_regular_shared, num_parts): # Test built-in min aggregation @@ -1206,6 +1268,7 @@ def test_groupby_simple_min(ray_start_regular_shared, num_parts): assert nan_ds.min() is None +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_max(ray_start_regular_shared, num_parts): # Test built-in max aggregation @@ -1257,6 +1320,7 @@ def test_groupby_simple_max(ray_start_regular_shared, num_parts): assert nan_ds.max() is None +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_mean(ray_start_regular_shared, num_parts): # Test built-in mean aggregation @@ -1304,7 +1368,7 @@ def test_groupby_simple_mean(ray_start_regular_shared, num_parts): # Test built-in global mean aggregation assert ray.data.from_items(xs).repartition(num_parts).mean() == 49.5 - # Test empty datastream + # Test empty dataset assert ray.data.range(10).filter(lambda r: r > 10).mean() is None # Test built-in global mean aggregation with nans @@ -1317,6 +1381,7 @@ def test_groupby_simple_mean(ray_start_regular_shared, num_parts): assert nan_ds.mean() is None +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_std(ray_start_regular_shared, num_parts): # Test built-in std aggregation @@ -1403,7 +1468,7 @@ def test_groupby_simple_std(ray_start_regular_shared, num_parts): pd.Series(xs).std(ddof=0), ) - # Test empty datastream + # Test empty dataset assert 
ray.data.from_items([]).std() is None # Test edge cases assert ray.data.from_items([3]).std() == 0 @@ -1418,6 +1483,7 @@ def test_groupby_simple_std(ray_start_regular_shared, num_parts): assert nan_ds.std() is None +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts): # Test built-in mean aggregation @@ -1447,6 +1513,7 @@ def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts): ).mean([lambda x: x[0], lambda x: x[1]]) == (None, None) +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") @pytest.mark.parametrize("num_parts", [1, 30]) def test_groupby_simple_multi_agg(ray_start_regular_shared, num_parts): seed = int(time.time()) @@ -1530,7 +1597,7 @@ def test_random_block_order(ray_start_regular_shared, restore_data_context): ds = ds.randomize_block_order(seed=0) results = ds.take() - expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] + expected = named_values("id", [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11]) assert results == expected # Test LazyBlockList.randomize_block_order. 
@@ -1542,7 +1609,9 @@ def test_random_block_order(ray_start_regular_shared, restore_data_context): lazy_blocklist_ds = ray.data.range(12, parallelism=4) lazy_blocklist_ds = lazy_blocklist_ds.randomize_block_order(seed=0) lazy_blocklist_results = lazy_blocklist_ds.take() - lazy_blocklist_expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] + lazy_blocklist_expected = named_values( + "id", [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] + ) assert lazy_blocklist_results == lazy_blocklist_expected finally: context.optimize_fuse_read_stages = original_optimize_fuse_read_stages @@ -1587,9 +1656,9 @@ def range(n, parallelism=200): assert r1 != r0, (r1, r0) assert r1 != r3, (r1, r3) - r0 = ray.data.range_table(100, parallelism=5).take(999) - r1 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) - r2 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) + r0 = ray.data.range(100, parallelism=5).take(999) + r1 = ray.data.range(100, parallelism=5).random_shuffle(seed=0).take(999) + r2 = ray.data.range(100, parallelism=5).random_shuffle(seed=0).take(999) assert r1 == r2, (r1, r2) assert r1 != r0, (r1, r0) @@ -1604,7 +1673,7 @@ def range(n, parallelism=200): r2 = range(100).random_shuffle().take(999) assert r1 != r2, (r1, r2) - # Test empty datastream. + # Test empty dataset. ds = ray.data.from_items([]) r1 = ds.random_shuffle() assert r1.count() == 0 @@ -1624,6 +1693,7 @@ def test_random_shuffle_check_random(shutdown_only): num_contiguous = 1 prev = -1 for x in part: + x = x["item"] if prev != x: prev = x num_contiguous = 1 @@ -1649,6 +1719,7 @@ def test_random_shuffle_check_random(shutdown_only): num_increasing = 0 prev = -1 for x in part: + x = x["item"] if x >= prev: num_increasing += 1 else: @@ -1670,7 +1741,7 @@ def test_random_shuffle_with_custom_resource(ray_start_cluster): ray.init(cluster.address) - # Run datastream in "bar" nodes. + # Run dataset in "bar" nodes. 
ds = ray.data.read_parquet( "example://parquet_images_mini", parallelism=2, diff --git a/python/ray/data/tests/test_binary.py b/python/ray/data/tests/test_binary.py index b5275f7afdda..2cadf64a2d79 100644 --- a/python/ray/data/tests/test_binary.py +++ b/python/ray/data/tests/test_binary.py @@ -21,6 +21,7 @@ from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa +from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa @@ -41,18 +42,12 @@ def test_read_binary_files_partitioning(ray_start_regular_shared, tmp_path): assert ds.take() == [{"bytes": b"foo", "path": path, "country": "us"}] -@pytest.mark.parametrize("output_arrow_format", [False, True]) -def test_read_binary_files(ray_start_regular_shared, output_arrow_format): +def test_read_binary_files(ray_start_regular_shared): with gen_bin_files(10) as (_, paths): - ds = ray.data.read_binary_files( - paths, parallelism=10, output_arrow_format=output_arrow_format - ) + ds = ray.data.read_binary_files(paths, parallelism=10) for i, item in enumerate(ds.iter_rows()): expected = open(paths[i], "rb").read() - if output_arrow_format: - assert expected == item["bytes"] - else: - assert expected == item + assert expected == item["bytes"] # Test metadata ops. 
assert ds.count() == 10 assert "bytes" in str(ds.schema()), ds @@ -84,28 +79,20 @@ def test_read_binary_files_with_fs(ray_start_regular_shared): ds = ray.data.read_binary_files(paths, filesystem=fs, parallelism=10) for i, item in enumerate(ds.iter_rows()): expected = open(paths[i], "rb").read() - assert expected == item + assert expected == item["bytes"] -@pytest.mark.parametrize("output_arrow_format", [False, True]) -def test_read_binary_files_with_paths(ray_start_regular_shared, output_arrow_format): +def test_read_binary_files_with_paths(ray_start_regular_shared): with gen_bin_files(10) as (_, paths): ds = ray.data.read_binary_files( paths, include_paths=True, parallelism=10, - output_arrow_format=output_arrow_format, ) - if output_arrow_format: - for i, item in enumerate(ds.iter_rows()): - assert paths[i] == item["path"] - expected = open(paths[i], "rb").read() - assert expected == item["bytes"] - else: - for i, (path, item) in enumerate(ds.iter_rows()): - assert path == paths[i] - expected = open(paths[i], "rb").read() - assert expected == item + for i, item in enumerate(ds.iter_rows()): + assert paths[i] == item["path"] + expected = open(paths[i], "rb").read() + assert expected == item["bytes"] # TODO(Clark): Hitting S3 in CI is currently broken due to some AWS @@ -131,7 +118,7 @@ def test_read_binary_snappy(ray_start_regular_shared, tmp_path): path, arrow_open_stream_args=dict(compression="snappy"), ) - assert sorted(ds.take()) == [byte_str] + assert sorted(extract_values("bytes", ds.take())) == [byte_str] def test_read_binary_snappy_inferred(ray_start_regular_shared, tmp_path): @@ -142,7 +129,7 @@ def test_read_binary_snappy_inferred(ray_start_regular_shared, tmp_path): bytes = BytesIO(byte_str) snappy.stream_compress(bytes, f) ds = ray.data.read_binary_files(path) - assert sorted(ds.take()) == [byte_str] + assert sorted(extract_values("bytes", ds.take())) == [byte_str] def test_read_binary_meta_provider( @@ -161,7 +148,7 @@ def 
test_read_binary_meta_provider( arrow_open_stream_args=dict(compression="snappy"), meta_provider=FastFileMetadataProvider(), ) - assert sorted(ds.take()) == [byte_str] + assert sorted(extract_values("bytes", ds.take())) == [byte_str] with pytest.raises(NotImplementedError): ray.data.read_binary_files( @@ -221,10 +208,10 @@ def skip_unpartitioned(kv_dict): ds, count=2, num_rows=2, - schema="", + schema="{bytes: binary}", num_computed=None, sorted_values=[b"1 a\n1 b\n1 c", b"3 e\n3 f\n3 g"], - ds_take_transform_fn=lambda t: t, + ds_take_transform_fn=lambda t: extract_values("bytes", t), ) assert ray.get(kept_file_counter.get.remote()) == 2 assert ray.get(skipped_file_counter.get.remote()) == 1 diff --git a/python/ray/data/tests/test_bulk_executor.py b/python/ray/data/tests/test_bulk_executor.py index 795c74a69eaf..862f8500ef06 100644 --- a/python/ray/data/tests/test_bulk_executor.py +++ b/python/ray/data/tests/test_bulk_executor.py @@ -1,3 +1,4 @@ +import pandas as pd import pytest import time @@ -13,12 +14,13 @@ from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer from ray.data._internal.execution.util import make_ref_bundles from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values, column_udf def make_transform(block_fn): def map_fn(block_iter, ctx): for block in block_iter: - yield block_fn(block) + yield pd.DataFrame(block_fn(block)) return map_fn @@ -27,7 +29,7 @@ def ref_bundles_to_list(bundles: List[RefBundle]) -> List[List[Any]]: output = [] for bundle in bundles: for block, _ in bundle.blocks: - output.append(ray.get(block)) + output.append(list(ray.get(block)["id"])) return output @@ -49,14 +51,17 @@ def test_multi_stage_execution(ray_start_10_cpus_shared, preserve_order): o1 = InputDataBuffer(inputs) def delay_first(block): + block = block["id"] if block[0] == 0: print("Delaying first block to force de-ordering") time.sleep(2) result = [b * -1 for b in block] - return result + return 
{"id": result} o2 = MapOperator.create(make_transform(delay_first), o1) - o3 = MapOperator.create(make_transform(lambda block: [b * 2 for b in block]), o2) + o3 = MapOperator.create( + make_transform(lambda block: {"id": [b * 2 for b in block["id"]]}), o2 + ) def reverse_sort(inputs: List[RefBundle], ctx): reversed_list = inputs[::-1] @@ -79,10 +84,14 @@ def test_basic_stats(ray_start_10_cpus_shared): inputs = make_ref_bundles([[x] for x in range(20)]) o1 = InputDataBuffer(inputs) o2 = MapOperator.create( - make_transform(lambda block: [b * 2 for b in block]), o1, name="Foo" + make_transform(lambda block: {"id": [b * 2 for b in block["id"]]}), + o1, + name="Foo", ) o3 = MapOperator.create( - make_transform(lambda block: [b * 2 for b in block]), o2, name="Bar" + make_transform(lambda block: {"id": [b * 2 for b in block["id"]]}), + o2, + name="Bar", ) it = executor.execute(o3, initial_stats=prev_stats) output = ref_bundles_to_list(it) @@ -99,8 +108,8 @@ def test_basic_stats(ray_start_10_cpus_shared): def test_e2e_bulk_sanity(ray_start_10_cpus_shared): DataContext.get_current().new_execution_backend = True DataContext.get_current().use_streaming_executor = False - result = ray.data.range(5).map(lambda x: x + 1) - assert result.take_all() == [1, 2, 3, 4, 5], result + result = ray.data.range(5).map(column_udf("id", lambda x: x + 1)) + assert extract_values("id", result.take_all()) == [1, 2, 3, 4, 5], result # Checks new executor was enabled. 
assert "obj_store_mem_alloc" in result.stats(), result.stats() @@ -110,9 +119,11 @@ def test_actor_strategy(ray_start_10_cpus_shared): executor = BulkExecutor(ExecutionOptions()) inputs = make_ref_bundles([[x] for x in range(20)]) o1 = InputDataBuffer(inputs) - o2 = MapOperator.create(make_transform(lambda block: [b * -1 for b in block]), o1) + o2 = MapOperator.create( + make_transform(lambda block: {"id": [b * -1 for b in block["id"]]}), o1 + ) o3 = MapOperator.create( - make_transform(lambda block: [b * 2 for b in block]), + make_transform(lambda block: {"id": [b * 2 for b in block["id"]]}), o2, compute_strategy=ActorPoolStrategy(min_size=1, max_size=2), ray_remote_args={"num_cpus": 1}, @@ -129,11 +140,11 @@ def test_new_execution_backend_invocation(ray_start_10_cpus_shared): DataContext.get_current().use_streaming_executor = False # Read-only: will use legacy executor for now. ds = ray.data.range(10) - assert ds.take_all() == list(range(10)) + assert extract_values("id", ds.take_all()) == list(range(10)) # read->randomize_block_order: will use new executor, although it's also # a read-equivalent once fused. 
ds = ray.data.range(10).randomize_block_order() - assert set(ds.take_all()) == set(range(10)) + assert set(extract_values("id", ds.take_all())) == set(range(10)) if __name__ == "__main__": diff --git a/python/ray/data/tests/test_consumption.py b/python/ray/data/tests/test_consumption.py index 28c8ec9470d9..a2e373ab85f3 100644 --- a/python/ray/data/tests/test_consumption.py +++ b/python/ray/data/tests/test_consumption.py @@ -1,5 +1,6 @@ import logging import math +import sys import os import random import time @@ -11,18 +12,16 @@ from unittest.mock import patch import ray -from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.block_builder import BlockBuilder -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger from ray.data._internal.lazy_block_list import LazyBlockList -from ray.data._internal.pandas_block import PandasRow from ray.data.block import BlockAccessor, BlockMetadata from ray.data.context import DataContext -from ray.data.datastream import Dataset, MaterializedDatastream, _sliding_window +from ray.data.dataset import Dataset, MaterializedDataset, _sliding_window from ray.data.datasource.datasource import Datasource, ReadTask from ray.data.datasource.csv_datasource import CSVDatasource -from ray.data.row import TableRow from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import column_udf, extract_values, STRICT_MODE from ray.tests.conftest import * # noqa from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -42,11 +41,13 @@ def test_avoid_placement_group_capture(shutdown_only, pipelined): def run(): ds0 = ray.data.range(5) ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] + assert sorted( + extract_values("id", ds.map(column_udf("id", lambda x: x + 1)).take()) + ) == [1, 2, 3, 4, 5] ds = maybe_pipeline(ds0, pipelined) assert ds.count() == 5 ds = 
maybe_pipeline(ds0, pipelined) - assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] + assert sorted(extract_values("id", ds.iter_rows())) == [0, 1, 2, 3, 4] pg = ray.util.placement_group([{"CPU": 1}]) ray.get( @@ -61,12 +62,12 @@ def run(): def test_dataset_lineage_serialization(shutdown_only): ray.init() ds = ray.data.range(10) - ds = ds.map(lambda x: x + 1) - ds = ds.map(lambda x: x + 1) + ds = ds.map(column_udf("id", lambda x: x + 1)) + ds = ds.map(column_udf("id", lambda x: x + 1)) ds = ds.random_shuffle() epoch = ds._get_epoch() uuid = ds._get_uuid() - plan_uuid = ds._plan._datastream_uuid + plan_uuid = ds._plan._dataset_uuid serialized_ds = ds.serialize_lineage() # Confirm that the original Dataset was properly copied before clearing/mutating. @@ -83,18 +84,18 @@ def test_dataset_lineage_serialization(shutdown_only): # Check Dataset state. assert ds._get_epoch() == epoch assert ds._get_uuid() == uuid - assert ds._plan._datastream_uuid == plan_uuid + assert ds._plan._dataset_uuid == plan_uuid # Check Dataset content. assert ds.count() == 10 - assert sorted(ds.take()) == list(range(2, 12)) + assert sorted(extract_values("id", ds.take())) == list(range(2, 12)) def test_dataset_lineage_serialization_unsupported(shutdown_only): ray.init() # In-memory data sources not supported. ds = ray.data.from_items(list(range(10))) - ds = ds.map(lambda x: x + 1) - ds = ds.map(lambda x: x + 1) + ds = ds.map(column_udf("item", lambda x: x + 1)) + ds = ds.map(column_udf("item", lambda x: x + 1)) with pytest.raises(ValueError): ds.serialize_lineage() @@ -108,8 +109,8 @@ def test_dataset_lineage_serialization_unsupported(shutdown_only): ds2.serialize_lineage() # Post-lazy-read unions not supported. 
- ds = ray.data.range(10).map(lambda x: x + 1) - ds1 = ray.data.range(20).map(lambda x: 2 * x) + ds = ray.data.range(10).map(column_udf("id", lambda x: x + 1)) + ds1 = ray.data.range(20).map(column_udf("id", lambda x: 2 * x)) ds2 = ds.union(ds1) with pytest.raises(ValueError): @@ -122,7 +123,9 @@ def test_dataset_lineage_serialization_unsupported(shutdown_only): serialized_ds = ds2.serialize_lineage() ds3 = Dataset.deserialize_lineage(serialized_ds) - assert set(ds3.take(30)) == set(list(range(10)) + list(range(20))) + assert set(extract_values("id", ds3.take(30))) == set( + list(range(10)) + list(range(20)) + ) # Zips not supported. ds = ray.data.from_items(list(range(10))) @@ -137,23 +140,25 @@ def test_dataset_lineage_serialization_unsupported(shutdown_only): def test_basic(ray_start_regular_shared, pipelined): ds0 = ray.data.range(5) ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] + assert sorted( + extract_values("id", ds.map(column_udf("id", lambda x: x + 1)).take()) + ) == [1, 2, 3, 4, 5] ds = maybe_pipeline(ds0, pipelined) assert ds.count() == 5 ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] + assert sorted(extract_values("id", ds.iter_rows())) == [0, 1, 2, 3, 4] -def test_range_table(ray_start_regular_shared): - ds = ray.data.range_table(10, parallelism=10) +def test_range(ray_start_regular_shared): + ds = ray.data.range(10, parallelism=10) assert ds.num_blocks() == 10 assert ds.count() == 10 - assert ds.take() == [{"value": i} for i in range(10)] + assert ds.take() == [{"id": i} for i in range(10)] - ds = ray.data.range_table(10, parallelism=2) + ds = ray.data.range(10, parallelism=2) assert ds.num_blocks() == 2 assert ds.count() == 10 - assert ds.take() == [{"value": i} for i in range(10)] + assert ds.take() == [{"id": i} for i in range(10)] def test_empty_dataset(ray_start_regular_shared): @@ -163,11 +168,11 @@ def test_empty_dataset(ray_start_regular_shared): 
assert ds.schema() is None ds = ray.data.range(1) - ds = ds.filter(lambda x: x > 1) + ds = ds.filter(lambda x: x["id"] > 1) ds = ds.materialize() assert ( str(ds) - == "MaterializedDatastream(num_blocks=1, num_rows=0, schema=Unknown schema)" + == "MaterializedDataset(num_blocks=1, num_rows=0, schema=Unknown schema)" ) # Test map on empty dataset. @@ -203,10 +208,10 @@ def inc(x): ds = ray.data.range(1) ds = ds.map(inc) assert not ds.is_fully_executed() - assert not isinstance(ds, MaterializedDatastream) + assert not isinstance(ds, MaterializedDataset) ds2 = ds.materialize() assert ds2.is_fully_executed() - assert isinstance(ds2, MaterializedDatastream) + assert isinstance(ds2, MaterializedDataset) assert not ds.is_fully_executed() for _ in range(10): @@ -216,20 +221,17 @@ def inc(x): def test_schema(ray_start_regular_shared): - ds = ray.data.range(10, parallelism=10) - ds2 = ray.data.range_table(10, parallelism=10) + ds2 = ray.data.range(10, parallelism=10) ds3 = ds2.repartition(5) ds3 = ds3.materialize() ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1) ds4 = ds4.materialize() - assert str(ds) == "Datastream(num_blocks=10, num_rows=10, schema=)" - assert str(ds2) == "Datastream(num_blocks=10, num_rows=10, schema={value: int64})" + assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={id: int64})" assert ( - str(ds3) - == "MaterializedDatastream(num_blocks=5, num_rows=10, schema={value: int64})" + str(ds3) == "MaterializedDataset(num_blocks=5, num_rows=10, schema={id: int64})" ) assert ( - str(ds4) == "MaterializedDatastream(num_blocks=1, num_rows=5, " + str(ds4) == "MaterializedDataset(num_blocks=1, num_rows=5, " "schema={a: string, b: double})" ) @@ -239,7 +241,7 @@ def test_schema_lazy(ray_start_regular_shared): # We do not kick off the read task by default. 
assert ds._plan._in_blocks._num_computed() == 0 schema = ds.schema() - assert schema == int + assert schema.names == ["id"] # Fetching the schema does not trigger execution, since # the schema is known beforehand for RangeDatasource. assert ds._plan._in_blocks._num_computed() == 0 @@ -247,6 +249,38 @@ def test_schema_lazy(ray_start_regular_shared): assert ds._plan.execute()._num_computed() == 0 +def test_columns(ray_start_regular_shared): + ds = ray.data.range(1) + assert ds.columns() == ds.schema().names + assert ds.columns() == ["id"] + + ds = ds.map(lambda x: x) + assert ds.columns(fetch_if_missing=False) is None + + +def test_schema_repr(ray_start_regular_shared): + ds = ray.data.from_items([{"text": "spam", "number": 0}]) + # fmt: off + expected_repr = ( + "Column Type\n" + "------ ----\n" + "text string\n" + "number int64" + ) + # fmt:on + assert repr(ds.schema()) == expected_repr + + ds = ray.data.from_items([{"long_column_name": "spam"}]) + # fmt: off + expected_repr = ( + "Column Type\n" + "------ ----\n" + "long_column_name string" + ) + # fmt: on + assert repr(ds.schema()) == expected_repr + + def test_count_lazy(ray_start_regular_shared): ds = ray.data.range(100, parallelism=10) # We do not kick off the read task by default. 
@@ -269,63 +303,62 @@ def check_num_computed(expected): assert ds._plan.execute()._num_computed() == expected check_num_computed(0) - assert ds.take(10) == list(range(10)) + assert extract_values("id", ds.take(10)) == list(range(10)) check_num_computed(2) - assert ds.take(20) == list(range(20)) + assert extract_values("id", ds.take(20)) == list(range(20)) check_num_computed(4) - assert ds.take(30) == list(range(30)) + assert extract_values("id", ds.take(30)) == list(range(30)) check_num_computed(8) - assert ds.take(50) == list(range(50)) + assert extract_values("id", ds.take(50)) == list(range(50)) check_num_computed(16) - assert ds.take(100) == list(range(100)) + assert extract_values("id", ds.take(100)) == list(range(100)) check_num_computed(20) def test_dataset_repr(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10) - assert repr(ds) == "Datastream(num_blocks=10, num_rows=10, schema=)" + assert repr(ds) == "Dataset(num_blocks=10, num_rows=10, schema={id: int64})" ds = ds.map_batches(lambda x: x) assert repr(ds) == ( "MapBatches()\n" - "+- Datastream(num_blocks=10, num_rows=10, schema=)" + "+- Dataset(num_blocks=10, num_rows=10, schema={id: int64})" ) - ds = ds.filter(lambda x: x > 0) + ds = ds.filter(lambda x: x["id"] > 0) assert repr(ds) == ( "Filter\n" "+- MapBatches()\n" - " +- Datastream(num_blocks=10, num_rows=10, schema=)" + " +- Dataset(num_blocks=10, num_rows=10, schema={id: int64})" ) ds = ds.random_shuffle() assert repr(ds) == ( "RandomShuffle\n" "+- Filter\n" " +- MapBatches()\n" - " +- Datastream(num_blocks=10, num_rows=10, schema=)" + " +- Dataset(num_blocks=10, num_rows=10, schema={id: int64})" ) ds = ds.materialize() assert ( - repr(ds) - == "MaterializedDatastream(num_blocks=10, num_rows=9, schema=)" + repr(ds) == "MaterializedDataset(num_blocks=10, num_rows=9, schema={id: int64})" ) ds = ds.map_batches(lambda x: x) assert repr(ds) == ( "MapBatches()\n" - "+- Datastream(num_blocks=10, num_rows=9, schema=)" + "+- 
Dataset(num_blocks=10, num_rows=9, schema={id: int64})" ) ds1, ds2 = ds.split(2) assert ( - repr(ds1) == f"MaterializedDatastream(num_blocks=5, num_rows={ds1.count()}, " - "schema=)" + repr(ds1) == f"MaterializedDataset(num_blocks=5, num_rows={ds1.count()}, " + "schema={id: int64})" ) assert ( - repr(ds2) == f"MaterializedDatastream(num_blocks=5, num_rows={ds2.count()}, " - "schema=)" + repr(ds2) == f"MaterializedDataset(num_blocks=5, num_rows={ds2.count()}, " + "schema={id: int64})" ) ds3 = ds1.union(ds2) - assert repr(ds3) == "Datastream(num_blocks=10, num_rows=9, schema=)" + assert repr(ds3) == "Dataset(num_blocks=10, num_rows=9, schema={id: int64})" ds = ds.zip(ds3) assert repr(ds) == ( - "Zip\n" "+- Datastream(num_blocks=10, num_rows=9, schema=)" + "Zip\n" "+- Dataset(num_blocks=10, num_rows=9, schema={id: int64})" ) def my_dummy_fn(x): @@ -335,7 +368,7 @@ def my_dummy_fn(x): ds = ds.map_batches(my_dummy_fn) assert repr(ds) == ( "MapBatches(my_dummy_fn)\n" - "+- Datastream(num_blocks=10, num_rows=10, schema=)" + "+- Dataset(num_blocks=10, num_rows=10, schema={id: int64})" ) @@ -345,13 +378,13 @@ def test_limit(ray_start_regular_shared, lazy): if not lazy: ds = ds.materialize() for i in range(100): - assert ds.limit(i).take(200) == list(range(i)) + assert extract_values("id", ds.limit(i).take(200)) == list(range(i)) # NOTE: We test outside the power-of-2 range in order to ensure that we're not reading # redundant files due to exponential ramp-up. -@pytest.mark.parametrize("limit,expected", [(10, 1), (20, 2), (30, 3), (60, 6)]) -def test_limit_no_redundant_read(ray_start_regular_shared, limit, expected): +@pytest.mark.parametrize("limit,min_read_tasks", [(10, 1), (20, 2), (30, 3), (60, 6)]) +def test_limit_no_redundant_read(ray_start_regular_shared, limit, min_read_tasks): # Test that dataset truncation eliminates redundant reads. 
@ray.remote class Counter: @@ -374,14 +407,19 @@ def __init__(self): def prepare_read(self, parallelism, n): def range_(i): ray.get(self.counter.increment.remote()) - return [list(range(parallelism * i, parallelism * i + n))] + return [ + pd.DataFrame({"id": range(parallelism * i, parallelism * i + n)}) + ] return [ ReadTask( lambda i=i: range_(i), BlockMetadata( num_rows=n, - size_bytes=None, + size_bytes=sum( + sys.getsizeof(i) + for i in range(parallelism * i, parallelism * i + n) + ), schema=None, input_files=None, exec_stats=None, @@ -392,16 +430,21 @@ def range_(i): source = CountingRangeDatasource() + parallelism = 10 ds = ray.data.read_datasource( source, - parallelism=10, + parallelism=parallelism, n=10, ) ds2 = ds.limit(limit) # Check content. - assert ds2.take(limit) == list(range(limit)) + assert extract_values("id", ds2.take(limit)) == list(range(limit)) # Check number of read tasks launched. - assert ray.get(source.counter.get.remote()) == expected + # min_read_tasks is the minimum number of read tasks needed for the limit. + # We may launch more tasks than this number, in order to to maximize throughput. + # But the actual number of read tasks should be less than the parallelism. 
+ count = ray.get(source.counter.get.remote()) + assert min_read_tasks <= count < parallelism def test_limit_no_num_row_info(ray_start_regular_shared): @@ -411,10 +454,10 @@ class DumbOnesDatasource(Datasource): def prepare_read(self, parallelism, n): return parallelism * [ ReadTask( - lambda: [[1] * n], + lambda: [pd.DataFrame({"id": [1] * n})], BlockMetadata( num_rows=None, - size_bytes=None, + size_bytes=sys.getsizeof(1) * n, schema=None, input_files=None, exec_stats=None, @@ -424,27 +467,25 @@ def prepare_read(self, parallelism, n): ds = ray.data.read_datasource(DumbOnesDatasource(), parallelism=10, n=10) for i in range(1, 100): - assert ds.limit(i).take(100) == [1] * i + assert extract_values("id", ds.limit(i).take(100)) == [1] * i def test_convert_types(ray_start_regular_shared): plain_ds = ray.data.range(1) - arrow_ds = plain_ds.map(lambda x: {"a": x}) + arrow_ds = plain_ds.map(lambda x: {"a": x["id"]}) assert arrow_ds.take() == [{"a": 0}] - assert "ArrowRow" in arrow_ds.map(lambda x: str(type(x))).take()[0] + assert "dict" in str(arrow_ds.map(lambda x: {"out": str(type(x))}).take()[0]) - arrow_ds = ray.data.range_table(1) - assert arrow_ds.map(lambda x: "plain_{}".format(x["value"])).take() == ["plain_0"] - assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == [{"a": [0]}] + arrow_ds = ray.data.range(1) + assert arrow_ds.map(lambda x: {"out": "plain_{}".format(x["id"])}).take() == [ + {"out": "plain_0"} + ] + assert arrow_ds.map(lambda x: {"a": (x["id"],)}).take() == [{"a": [0]}] def test_from_items(ray_start_regular_shared): ds = ray.data.from_items(["hello", "world"]) - assert ds.take() == ["hello", "world"] - assert isinstance(next(ds.iter_batches(batch_format=None)), list) - - with pytest.raises(ValueError): - ds = ray.data.from_items(["hello", "world"], output_arrow_format=True) + assert extract_values("item", ds.take()) == ["hello", "world"] ds = ray.data.from_items([{"hello": "world"}], output_arrow_format=True) assert ds.take() == [{"hello": 
"world"}] @@ -476,23 +517,23 @@ def test_from_items_parallelism_truncated(ray_start_regular_shared): def test_take_batch(ray_start_regular_shared): ds = ray.data.range(10, parallelism=2) - assert ds.take_batch(3) == [0, 1, 2] - assert ds.take_batch(6) == [0, 1, 2, 3, 4, 5] - assert ds.take_batch(100) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + assert ds.take_batch(3)["id"].tolist() == [0, 1, 2] + assert ds.take_batch(6)["id"].tolist() == [0, 1, 2, 3, 4, 5] + assert ds.take_batch(100)["id"].tolist() == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] assert isinstance(ds.take_batch(3, batch_format="pandas"), pd.DataFrame) - assert isinstance(ds.take_batch(3, batch_format="numpy"), np.ndarray) + assert isinstance(ds.take_batch(3, batch_format="numpy"), dict) ds = ray.data.range_tensor(10, parallelism=2) - assert np.all(ds.take_batch(3) == np.array([[0], [1], [2]])) + assert np.all(ds.take_batch(3)["data"] == np.array([[0], [1], [2]])) assert isinstance(ds.take_batch(3, batch_format="pandas"), pd.DataFrame) - assert isinstance(ds.take_batch(3, batch_format="numpy"), np.ndarray) + assert isinstance(ds.take_batch(3, batch_format="numpy"), dict) with pytest.raises(ValueError): ray.data.range(0).take_batch() def test_take_all(ray_start_regular_shared): - assert ray.data.range(5).take_all() == [0, 1, 2, 3, 4] + assert extract_values("id", ray.data.range(5).take_all()) == [0, 1, 2, 3, 4] with pytest.raises(ValueError): assert ray.data.range(5).take_all(4) @@ -522,7 +563,7 @@ def test_iter_rows(ray_start_regular_shared): n = 10 ds = ray.data.range(n) for row, k in zip(ds.iter_rows(), range(n)): - assert row == k + assert row == {"id": k} # Test tabular rows. t1 = pa.Table.from_pydict({"one": [1, 2, 3], "two": [2, 3, 4]}) @@ -544,22 +585,19 @@ def to_pylist(table): # Default ArrowRows. for row, t_row in zip(ds.iter_rows(), to_pylist(t)): - assert isinstance(row, TableRow) - assert isinstance(row, ArrowRow) + assert isinstance(row, dict) assert row == t_row # PandasRows after conversion. 
pandas_ds = ds.map_batches(lambda x: x, batch_format="pandas") df = t.to_pandas() for row, (index, df_row) in zip(pandas_ds.iter_rows(), df.iterrows()): - assert isinstance(row, TableRow) - assert isinstance(row, PandasRow) + assert isinstance(row, dict) assert row == df_row.to_dict() # Prefetch. for row, t_row in zip(ds.iter_rows(prefetch_blocks=1), to_pylist(t)): - assert isinstance(row, TableRow) - assert isinstance(row, ArrowRow) + assert isinstance(row, dict) assert row == t_row @@ -588,14 +626,6 @@ def test_iter_batches_basic(ray_start_regular_shared): assert all(isinstance(col, np.ndarray) for col in batch.values()) pd.testing.assert_frame_equal(pd.DataFrame(batch), df) - # Numpy format (single column). - ds2 = ds.select_columns(["one"]) - for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): - assert isinstance(batch, dict) - assert list(batch.keys()) == ["one"] - assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) - # Test NumPy format on Arrow blocks. ds2 = ds.map_batches(lambda b: b, batch_size=None, batch_format="pyarrow") for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): @@ -604,21 +634,12 @@ def test_iter_batches_basic(ray_start_regular_shared): assert all(isinstance(col, np.ndarray) for col in batch.values()) pd.testing.assert_frame_equal(pd.DataFrame(batch), df) - # Test NumPy format on Arrow blocks (single column). - ds3 = ds2.select_columns(["one"]) - for batch, df in zip(ds3.iter_batches(batch_size=None, batch_format="numpy"), dfs): + # Default format -> numpy. 
+ for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="default"), dfs): assert isinstance(batch, dict) - assert list(batch.keys()) == ["one"] + assert list(batch.keys()) == ["one", "two"] assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) - - # Native format (deprecated). - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="native"), dfs): - assert BlockAccessor.for_block(batch).to_pandas().equals(df) - - # Default format. - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="default"), dfs): - assert BlockAccessor.for_block(batch).to_pandas().equals(df) + pd.testing.assert_frame_equal(pd.DataFrame(batch), df) # Batch size. batch_size = 2 @@ -728,12 +749,15 @@ def test_iter_batches_basic(ray_start_regular_shared): def test_iter_batches_empty_block(ray_start_regular_shared): ds = ray.data.range(1).repartition(10) - assert list(ds.iter_batches(batch_size=None)) == [[0]] - assert list(ds.iter_batches(batch_size=1, local_shuffle_buffer_size=1)) == [[0]] + assert str(list(ds.iter_batches(batch_size=None))) == "[{'id': array([0])}]" + assert ( + str(list(ds.iter_batches(batch_size=1, local_shuffle_buffer_size=1))) + == "[{'id': array([0])}]" + ) @pytest.mark.parametrize("pipelined", [False, True]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas", "simple"]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) def test_iter_batches_local_shuffle(shutdown_only, pipelined, ds_format): # Input validation. # Batch size must be given for local shuffle. 
@@ -745,12 +769,10 @@ def test_iter_batches_local_shuffle(shutdown_only, pipelined, ds_format): ) def range(n, parallelism=200): - if ds_format == "simple": + if ds_format == "arrow": ds = ray.data.range(n, parallelism=parallelism) - elif ds_format == "arrow": - ds = ray.data.range_table(n, parallelism=parallelism) elif ds_format == "pandas": - ds = ray.data.range_table(n, parallelism=parallelism).map_batches( + ds = ray.data.range(n, parallelism=parallelism).map_batches( lambda df: df, batch_size=None, batch_format="pandas" ) if pipelined: @@ -761,16 +783,14 @@ def range(n, parallelism=200): def to_row_dicts(batch): if isinstance(batch, pd.DataFrame): - batch = batch.to_dict(orient="records") - return batch + return batch.to_dict(orient="records") + return [{"id": v} for v in batch["id"]] def unbatch(batches): return [r for batch in batches for r in to_row_dicts(batch)] def sort(r): - if ds_format == "simple": - return sorted(r) - return sorted(r, key=lambda v: v["value"]) + return sorted(r, key=lambda v: v["id"]) base = range(100).take_all() @@ -1098,19 +1118,12 @@ def test_iter_tf_batches_tensor_ds(ray_start_regular_shared, pipelined): for _ in range(num_epochs): iterations = [] for batch in ds.iter_tf_batches(batch_size=2): - iterations.append(batch) + iterations.append(batch["data"]) combined_iterations = np.concatenate(iterations) np.testing.assert_array_equal(arr, combined_iterations) def test_block_builder_for_block(ray_start_regular_shared): - # list - builder = BlockBuilder.for_block(list()) - builder.add_block([1, 2]) - assert builder.build() == [1, 2] - builder.add_block([3, 4]) - assert builder.build() == [1, 2, 3, 4] - # pandas dataframe builder = BlockBuilder.for_block(pd.DataFrame()) b1 = pd.DataFrame({"A": [1], "B": ["a"]}) @@ -1155,10 +1168,10 @@ def _to_pandas(ds): assert ds.min("A") == 0 # Test empty dataset - ds = ray.data.range_table(10) + ds = ray.data.range(10) if ds_format == "pandas": ds = _to_pandas(ds) - assert ds.filter(lambda r: 
r["value"] > 10).min("value") is None + assert ds.filter(lambda r: r["id"] > 10).min("id") is None # Test built-in global min aggregation with nans nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( @@ -1196,10 +1209,10 @@ def _to_pandas(ds): assert ds.max("A") == 99 # Test empty dataset - ds = ray.data.range_table(10) + ds = ray.data.range(10) if ds_format == "pandas": ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).max("value") is None + assert ds.filter(lambda r: r["id"] > 10).max("id") is None # Test built-in global max aggregation with nans nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( @@ -1237,10 +1250,10 @@ def _to_pandas(ds): assert ds.mean("A") == 49.5 # Test empty dataset - ds = ray.data.range_table(10) + ds = ray.data.range(10) if ds_format == "pandas": ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).mean("value") is None + assert ds.filter(lambda r: r["id"] > 10).mean("id") is None # Test built-in global mean aggregation with nans nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( @@ -1313,11 +1326,8 @@ def test_column_name_type_check(ray_start_regular_shared): df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) ds = ray.data.from_pandas(df) expected_str = ( - "MaterializedDatastream(\n" - " num_blocks=1,\n" - " num_rows=10,\n" - " schema={1: float64, a: float64}\n" - ")" + "MaterializedDataset(num_blocks=1, num_rows=10, " + "schema={1: float64, a: float64})" ) assert str(ds) == expected_str, str(ds) df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) @@ -1331,6 +1341,7 @@ def test_len(ray_start_regular_shared): len(ds) +@pytest.mark.skipif(STRICT_MODE, reason="Deprecated in strict mode") def test_simple_block_select(): xs = list(range(100)) block_accessor = BlockAccessor.for_block(xs) @@ -1373,7 +1384,7 @@ def test_unsupported_pyarrow_versions_check(shutdown_only, unsupported_pyarrow_v # 
Test Arrow-native creation APIs. # Test range_table. with pytest.raises(ImportError): - ray.data.range_table(10).take_all() + ray.data.range(10).take_all() # Test from_arrow. with pytest.raises(ImportError): @@ -1405,7 +1416,7 @@ def test_unsupported_pyarrow_versions_check_disabled( # Test Arrow-native creation APIs. # Test range_table. try: - ray.data.range_table(10).take_all() + ray.data.range(10).take_all() except ImportError as e: pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") @@ -1568,7 +1579,7 @@ def test_dataset_retry_exceptions(ray_start_regular, local_path): path1 = os.path.join(local_path, "test1.csv") df1.to_csv(path1, index=False, storage_options={}) ds1 = ray.data.read_datasource(FlakyCSVDatasource(), parallelism=1, paths=path1) - ds1.write_datasource(FlakyCSVDatasource(), path=local_path, datastream_uuid="data") + ds1.write_datasource(FlakyCSVDatasource(), path=local_path, dataset_uuid="data") assert df1.equals( pd.read_csv(os.path.join(local_path, "data_000000.csv"), storage_options={}) ) @@ -1580,9 +1591,9 @@ def flaky_mapper(x): if ray.get(count) == 1: raise ValueError("oops") else: - return ray.get(count) + return {"id": ray.get(count)} - assert sorted(ds1.map(flaky_mapper).take()) == [2, 3, 4] + assert sorted(extract_values("id", ds1.map(flaky_mapper).take())) == [2, 3, 4] with pytest.raises(ValueError): ray.data.read_datasource( @@ -1597,7 +1608,9 @@ def test_datasource(ray_start_regular): source = ray.data.datasource.RandomIntRowDatasource() assert len(ray.data.read_datasource(source, n=10, num_columns=2).take()) == 10 source = ray.data.datasource.RangeDatasource() - assert ray.data.read_datasource(source, n=10).take() == list(range(10)) + assert extract_values( + "value", ray.data.read_datasource(source, n=10).take() + ) == list(range(10)) def test_polars_lazy_import(shutdown_only): @@ -1646,26 +1659,23 @@ def f(should_import_polars): def test_batch_formats(shutdown_only): ds = ray.data.range(100) - assert 
ds.default_batch_format() == list - assert isinstance(next(ds.iter_batches(batch_format=None)), list) - assert isinstance(next(ds.iter_batches(batch_format="default")), list) + assert isinstance(next(ds.iter_batches(batch_format=None)), pa.Table) + assert isinstance(next(ds.iter_batches(batch_format="default")), dict) assert isinstance(next(ds.iter_batches(batch_format="pandas")), pd.DataFrame) assert isinstance(next(ds.iter_batches(batch_format="pyarrow")), pa.Table) - assert isinstance(next(ds.iter_batches(batch_format="numpy")), np.ndarray) + assert isinstance(next(ds.iter_batches(batch_format="numpy")), dict) ds = ray.data.range_tensor(100) - assert ds.default_batch_format() == np.ndarray assert isinstance(next(ds.iter_batches(batch_format=None)), pa.Table) - assert isinstance(next(ds.iter_batches(batch_format="default")), np.ndarray) + assert isinstance(next(ds.iter_batches(batch_format="default")), dict) assert isinstance(next(ds.iter_batches(batch_format="pandas")), pd.DataFrame) assert isinstance(next(ds.iter_batches(batch_format="pyarrow")), pa.Table) - assert isinstance(next(ds.iter_batches(batch_format="numpy")), np.ndarray) + assert isinstance(next(ds.iter_batches(batch_format="numpy")), dict) df = pd.DataFrame({"foo": ["a", "b"], "bar": [0, 1]}) ds = ray.data.from_pandas(df) - assert ds.default_batch_format() == pd.DataFrame assert isinstance(next(ds.iter_batches(batch_format=None)), pd.DataFrame) - assert isinstance(next(ds.iter_batches(batch_format="default")), pd.DataFrame) + assert isinstance(next(ds.iter_batches(batch_format="default")), dict) assert isinstance(next(ds.iter_batches(batch_format="pandas")), pd.DataFrame) assert isinstance(next(ds.iter_batches(batch_format="pyarrow")), pa.Table) assert isinstance(next(ds.iter_batches(batch_format="numpy")), dict) @@ -1686,8 +1696,8 @@ def test_dataset_schema_after_read_stats(ray_start_cluster): def test_dataset_plan_as_string(ray_start_cluster): ds = ray.data.read_parquet("example://iris.parquet") - 
assert ds._plan.get_plan_as_string("Datastream") == ( - "Datastream(\n" + assert ds._plan.get_plan_as_string("Dataset") == ( + "Dataset(\n" " num_blocks=1,\n" " num_rows=150,\n" " schema={\n" @@ -1701,13 +1711,13 @@ def test_dataset_plan_as_string(ray_start_cluster): ) for _ in range(5): ds = ds.map_batches(lambda x: x) - assert ds._plan.get_plan_as_string("Datastream") == ( + assert ds._plan.get_plan_as_string("Dataset") == ( "MapBatches()\n" "+- MapBatches()\n" " +- MapBatches()\n" " +- MapBatches()\n" " +- MapBatches()\n" - " +- Datastream(\n" + " +- Dataset(\n" " num_blocks=1,\n" " num_rows=150,\n" " schema={\n" @@ -1738,7 +1748,7 @@ def test_warning_execute_with_no_cpu(ray_start_cluster): cluster = ray_start_cluster cluster.add_node(num_cpus=0) - logger = DatastreamLogger("ray.data._internal.plan").get_logger() + logger = DatasetLogger("ray.data._internal.plan").get_logger() with patch.object( logger, "warning", @@ -1769,7 +1779,7 @@ def test_nowarning_execute_with_cpu(ray_start_cluster): # Create one node with CPUs to avoid triggering the Dataset warning ray.init(ray_start_cluster.address) - logger = DatastreamLogger("ray.data._internal.plan").get_logger() + logger = DatasetLogger("ray.data._internal.plan").get_logger() with patch.object( logger, "warning", @@ -1782,6 +1792,4 @@ def test_nowarning_execute_with_cpu(ray_start_cluster): if __name__ == "__main__": - import sys - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_context_propagation.py b/python/ray/data/tests/test_context_propagation.py index 3774cb376f82..e532e97b94d8 100644 --- a/python/ray/data/tests/test_context_propagation.py +++ b/python/ray/data/tests/test_context_propagation.py @@ -1,13 +1,45 @@ import pytest +import pandas as pd import ray from ray.tests.conftest import * # noqa from ray.data.block import BlockMetadata from ray.data.context import DataContext from ray.data.datasource import Datasource, ReadTask +from ray.data.tests.util import extract_values 
from ray._private.test_utils import run_string_as_driver +def test_context_saved_when_dataset_created(ray_start_regular_shared): + ctx = DataContext.get_current() + d1 = ray.data.range(10) + d2 = ray.data.range(10) + assert ctx.eager_free + assert d1.context.eager_free + assert d2.context.eager_free + + d1.context.eager_free = False + assert not d1.context.eager_free + assert d2.context.eager_free + assert ctx.eager_free + + @ray.remote(num_cpus=0) + def check(d1, d2): + assert not d1.context.eager_free + assert d2.context.eager_free + + ray.get(check.remote(d1, d2)) + + @ray.remote(num_cpus=0) + def check2(d): + d.take() + + d1.context.execution_options.resource_limits.cpu = 0.1 + with pytest.raises(ValueError): + ray.get(check2.remote(d1)) + ray.get(check2.remote(d2)) + + def test_read(ray_start_regular_shared): class CustomDatasource(Datasource): def prepare_read(self, parallelism: int): @@ -15,30 +47,30 @@ def prepare_read(self, parallelism: int): meta = BlockMetadata( num_rows=1, size_bytes=8, schema=None, input_files=None, exec_stats=None ) - return [ReadTask(lambda: [[value]], meta)] + return [ReadTask(lambda: [pd.DataFrame({"id": [value]})], meta)] context = DataContext.get_current() context.foo = 12345 - assert ray.data.read_datasource(CustomDatasource()).take_all()[0] == 12345 + assert ray.data.read_datasource(CustomDatasource()).take_all()[0]["id"] == 12345 def test_map(ray_start_regular_shared): context = DataContext.get_current() context.foo = 70001 - ds = ray.data.range(1).map(lambda x: DataContext.get_current().foo) - assert ds.take_all()[0] == 70001 + ds = ray.data.range(1).map(lambda x: {"id": DataContext.get_current().foo}) + assert ds.take_all()[0]["id"] == 70001 def test_map_pipeline(ray_start_regular_shared): context = DataContext.get_current() context.foo = 8 pipe = ray.data.range(2).repeat(2) - pipe = pipe.map(lambda x: DataContext.get_current().foo) + pipe = pipe.map(lambda x: {"id": DataContext.get_current().foo}) [a, b] = pipe.split(2) 
@ray.remote def fetch(shard): - return shard.take_all() + return extract_values("id", shard.take_all()) assert ray.get([fetch.remote(a), fetch.remote(b)]) == [[8, 8], [8, 8]] @@ -46,24 +78,26 @@ def fetch(shard): def test_flat_map(ray_start_regular_shared): context = DataContext.get_current() context.foo = 70002 - ds = ray.data.range(1).flat_map(lambda x: [DataContext.get_current().foo]) - assert ds.take_all()[0] == 70002 + ds = ray.data.range(1).flat_map(lambda x: [{"id": DataContext.get_current().foo}]) + assert ds.take_all()[0]["id"] == 70002 def test_map_batches(ray_start_regular_shared): context = DataContext.get_current() context.foo = 70003 - ds = ray.data.range(1).map_batches(lambda x: [DataContext.get_current().foo]) - assert ds.take_all()[0] == 70003 + ds = ray.data.range(1).map_batches( + lambda x: {"id": [DataContext.get_current().foo]} + ) + assert ds.take_all()[0]["id"] == 70003 def test_filter(shutdown_only): context = DataContext.get_current() context.foo = 70004 ds = ray.data.from_items([70004]).filter( - lambda x: x == DataContext.get_current().foo + lambda x: x["item"] == DataContext.get_current().foo ) - assert ds.take_all()[0] == 70004 + assert ds.take_all()[0]["item"] == 70004 def test_context_placement_group(): @@ -88,8 +122,8 @@ def test_context_placement_group(): context.scheduling_strategy = PlacementGroupSchedulingStrategy(placement_group) pipe = ray.data.range(100, parallelism=2) \ .window(blocks_per_window=1) \ - .map(lambda x: x + 1) -assert pipe.take_all() == list(range(1, 101)) + .map(lambda x: {"id": x["id"] + 1}) +assert pipe.take_all() == [{"id": x} for x in range(1, 101)] placement_group_assert_no_leak([placement_group]) ray.shutdown() """ diff --git a/python/ray/data/tests/test_csv.py b/python/ray/data/tests/test_csv.py index 69c571630d92..9c9852a50315 100644 --- a/python/ray/data/tests/test_csv.py +++ b/python/ray/data/tests/test_csv.py @@ -2,7 +2,7 @@ import os import shutil from functools import partial -from 
distutils.version import LooseVersion +from packaging.version import Version import pandas as pd import pyarrow as pa @@ -895,7 +895,7 @@ def test_csv_read_filter_non_csv_file(shutdown_only, tmp_path): @pytest.mark.skipif( - LooseVersion(pa.__version__) < LooseVersion("7.0.0"), + Version(pa.__version__) < Version("7.0.0"), reason="invalid_row_handler was added in pyarrow 7.0.0", ) def test_csv_invalid_file_handler(shutdown_only, tmp_path): diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 3fb95ae8737f..75eaf1d9c24a 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -8,7 +8,6 @@ import ray from ray.data._internal.lazy_block_list import LazyBlockList from ray.data.block import BlockMetadata -from ray.data.context import DataContext from ray.data.datasource import Datasource from ray.data.datasource.csv_datasource import CSVDatasource from ray.data.datasource.datasource import ReadTask, Reader @@ -113,18 +112,7 @@ def test_enable_in_ray_client(ray_start_cluster_enabled): "compute", [ "tasks", - # TODO(Clark): Remove skip for old execution backend once the old execution - # backend is removed. - pytest.param( - "actors", - marks=pytest.mark.skipif( - not DataContext.get_current().new_execution_backend, - reason=( - "Dynamic block splitting for the actor compute strategy is only " - "enabled for the new execution backend." 
- ), - ), - ), + "actors", ], ) def test_dataset( @@ -133,6 +121,10 @@ def test_dataset( target_max_block_size, compute, ): + if compute == "tasks": + compute = ray.data._internal.compute.TaskPoolStrategy() + else: + compute = ray.data.ActorPoolStrategy() ray.shutdown() # We need at least 2 CPUs to run a actorpool streaming ray.init(num_cpus=2) @@ -193,7 +185,7 @@ def test_dataset( assert len(ds.take(5)) == 5 assert len(ds.take_all()) == num_blocks_per_task * num_tasks for batch in ds.iter_batches(batch_size=10): - assert len(batch) == 10 + assert len(batch["one"]) == 10 def test_dataset_pipeline( @@ -217,7 +209,7 @@ def test_dataset_pipeline( dsp = dsp.map_batches(lambda x: x) result_batches = list(ds.iter_batches(batch_size=5)) for batch in result_batches: - assert len(batch) == 5 + assert len(batch["one"]) == 5 assert len(result_batches) == num_blocks_per_task * num_tasks / 5 dsp = ds.window(blocks_per_window=2) diff --git a/python/ray/data/tests/test_ecosystem.py b/python/ray/data/tests/test_ecosystem.py index 34abede73e94..8fb1fa71cf7f 100644 --- a/python/ray/data/tests/test_ecosystem.py +++ b/python/ray/data/tests/test_ecosystem.py @@ -80,7 +80,7 @@ def test_to_dask_tensor_column_cast_pandas(ray_start_regular_shared): ctx.enable_tensor_extension_casting = True in_df = pd.DataFrame({"a": TensorArray(data)}) ds = ray.data.from_pandas(in_df) - dtypes = ds.schema().types + dtypes = ds.schema().base_schema.types assert len(dtypes) == 1 assert isinstance(dtypes[0], TensorDtype) out_df = ds.to_dask().compute() @@ -101,7 +101,7 @@ def test_to_dask_tensor_column_cast_arrow(ray_start_regular_shared): ctx.enable_tensor_extension_casting = True in_table = pa.table({"a": ArrowTensorArray.from_numpy(data)}) ds = ray.data.from_arrow(in_table) - dtype = ds.schema().field(0).type + dtype = ds.schema().base_schema.field(0).type assert isinstance(dtype, ArrowTensorType) out_df = ds.to_dask().compute() assert out_df["a"].dtype.type is np.object_ diff --git 
a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index b00a45569409..c0e575b877e5 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -43,16 +43,18 @@ FlatMap, ) from ray.data._internal.logical.operators.n_ary_operator import Zip -from ray.data._internal.usage import ( +from ray.data._internal.logical.util import ( _recorded_operators, - _recording_lock, + _recorded_operators_lock, _op_name_white_list, ) from ray.data._internal.planner.planner import Planner +from ray.data._internal.stats import DatasetStats from ray.data.aggregate import Count from ray.data.datasource.parquet_datasource import ParquetDatasource from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values, named_values, column_udf from ray.tests.conftest import * # noqa @@ -62,10 +64,10 @@ def _check_usage_record(op_names: List[str], clear_after_check: Optional[bool] = (so that subsequent checks do not use existing records of operator usage).""" for op_name in op_names: assert op_name in _op_name_white_list - with _recording_lock: + with _recorded_operators_lock: assert _recorded_operators.get(op_name, 0) > 0, _recorded_operators if clear_after_check: - with _recording_lock: + with _recorded_operators_lock: _recorded_operators.clear() @@ -95,7 +97,7 @@ def test_from_items_operator(ray_start_regular_shared, enable_optimizer): def test_from_items_e2e(ray_start_regular_shared, enable_optimizer): data = ["Hello", "World"] ds = ray.data.from_items(data) - assert ds.take_all() == data, ds + assert ds.take_all() == named_values("item", data), ds # Check that metadata fetch is included in stats. 
assert "FromItems" in ds.stats() @@ -121,8 +123,8 @@ def test_map_batches_operator(ray_start_regular_shared, enable_optimizer): def test_map_batches_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(5) - ds = ds.map_batches(lambda x: x) - assert ds.take_all() == list(range(5)), ds + ds = ds.map_batches(column_udf("id", lambda x: x)) + assert extract_values("id", ds.take_all()) == list(range(5)), ds _check_usage_record(["ReadRange", "MapBatches"]) @@ -144,8 +146,8 @@ def test_map_rows_operator(ray_start_regular_shared, enable_optimizer): def test_map_rows_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(5) - ds = ds.map(lambda x: x + 1) - assert ds.take_all() == [1, 2, 3, 4, 5], ds + ds = ds.map(column_udf("id", lambda x: x + 1)) + assert extract_values("id", ds.take_all()) == [1, 2, 3, 4, 5], ds _check_usage_record(["ReadRange", "MapRows"]) @@ -167,8 +169,8 @@ def test_filter_operator(ray_start_regular_shared, enable_optimizer): def test_filter_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(5) - ds = ds.filter(fn=lambda x: x % 2 == 0) - assert ds.take_all() == [0, 2, 4], ds + ds = ds.filter(fn=lambda x: x["id"] % 2 == 0) + assert extract_values("id", ds.take_all()) == [0, 2, 4], ds _check_usage_record(["ReadRange", "Filter"]) @@ -190,15 +192,15 @@ def test_flat_map(ray_start_regular_shared, enable_optimizer): def test_flat_map_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(2) - ds = ds.flat_map(fn=lambda x: [x, x]) - assert ds.take_all() == [0, 0, 1, 1], ds + ds = ds.flat_map(fn=lambda x: [{"id": x["id"]}, {"id": x["id"]}]) + assert extract_values("id", ds.take_all()) == [0, 0, 1, 1], ds _check_usage_record(["ReadRange", "FlatMap"]) def test_column_ops_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(2) ds = ds.add_column(fn=lambda df: df.iloc[:, 0], col="new_col") - assert ds.take_all() == [{"value": 0, "new_col": 0}, {"value": 1, "new_col": 1}], ds + assert 
ds.take_all() == [{"id": 0, "new_col": 0}, {"id": 1, "new_col": 1}], ds _check_usage_record(["ReadRange", "MapBatches"]) select_ds = ds.select_columns(cols=["new_col"]) @@ -206,7 +208,7 @@ def test_column_ops_e2e(ray_start_regular_shared, enable_optimizer): _check_usage_record(["ReadRange", "MapBatches"]) ds = ds.drop_columns(cols=["new_col"]) - assert ds.take_all() == [{"value": 0}, {"value": 1}], ds + assert ds.take_all() == [{"id": 0}, {"id": 1}], ds _check_usage_record(["ReadRange", "MapBatches"]) @@ -222,7 +224,7 @@ def ensure_sample_size_close(dataset, sample_percent=0.5): ds = ray.data.range(10, parallelism=2) ensure_sample_size_close(ds) - ds = ray.data.range_table(10, parallelism=2) + ds = ray.data.range(10, parallelism=2) ensure_sample_size_close(ds) ds = ray.data.range_tensor(5, parallelism=2, shape=(2, 2)) @@ -251,18 +253,22 @@ def test_random_shuffle_e2e( ray_start_regular_shared, enable_optimizer, use_push_based_shuffle ): ds = ray.data.range(12, parallelism=4) - r1 = ds.random_shuffle(seed=0).take_all() - r2 = ds.random_shuffle(seed=1024).take_all() + r1 = extract_values("id", ds.random_shuffle(seed=0).take_all()) + r2 = extract_values("id", ds.random_shuffle(seed=1024).take_all()) assert r1 != r2, (r1, r2) assert sorted(r1) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], r1 assert sorted(r2) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], r2 _check_usage_record(["ReadRange", "RandomShuffle"]) -def test_repartition_operator(ray_start_regular_shared, enable_optimizer): +@pytest.mark.parametrize( + "shuffle", + [True, False], +) +def test_repartition_operator(ray_start_regular_shared, enable_optimizer, shuffle): planner = Planner() read_op = Read(ParquetDatasource()) - op = Repartition(read_op, num_outputs=5, shuffle=True) + op = Repartition(read_op, num_outputs=5, shuffle=shuffle) plan = LogicalPlan(op) physical_op = planner.plan(plan).dag @@ -271,25 +277,57 @@ def test_repartition_operator(ray_start_regular_shared, enable_optimizer): assert 
len(physical_op.input_dependencies) == 1 assert isinstance(physical_op.input_dependencies[0], MapOperator) - # Check error is thrown for non-shuffle repartition. - op = Repartition(read_op, num_outputs=5, shuffle=False) - plan = LogicalPlan(op) - with pytest.raises(AssertionError): - planner.plan(plan) - +@pytest.mark.parametrize( + "shuffle", + [True, False], +) def test_repartition_e2e( - ray_start_regular_shared, enable_optimizer, use_push_based_shuffle + ray_start_regular_shared, enable_optimizer, use_push_based_shuffle, shuffle ): - ds = ray.data.range(10000, parallelism=10) - ds1 = ds.repartition(20, shuffle=True) - assert ds1._block_num_rows() == [500] * 20, ds - - # Check error is thrown for non-shuffle repartition. - with pytest.raises(AssertionError): - ds.repartition(20, shuffle=False).take_all() - - _check_usage_record(["ReadRange", "Repartition"]) + def _check_repartition_usage_and_stats(ds): + _check_usage_record(["ReadRange", "Repartition"]) + ds_stats: DatasetStats = ds._plan.stats() + if shuffle: + assert ds_stats.base_name == "DoRead->Repartition" + assert "DoRead->RepartitionMap" in ds_stats.stages + else: + assert ds_stats.base_name == "Repartition" + assert "RepartitionSplit" in ds_stats.stages + assert "RepartitionReduce" in ds_stats.stages + + ds = ray.data.range(10000, parallelism=10).repartition(20, shuffle=shuffle) + assert ds.num_blocks() == 20, ds.num_blocks() + assert ds.sum() == sum(range(10000)) + assert ds._block_num_rows() == [500] * 20, ds._block_num_rows() + _check_repartition_usage_and_stats(ds) + + # Test num_output_blocks > num_rows to trigger empty block handling. 
+ ds = ray.data.range(20, parallelism=10).repartition(40, shuffle=shuffle) + assert ds.num_blocks() == 40, ds.num_blocks() + assert ds.sum() == sum(range(20)) + if shuffle: + assert ds._block_num_rows() == [10] * 2 + [0] * (40 - 2), ds._block_num_rows() + else: + assert ds._block_num_rows() == [1] * 20 + [0] * 20, ds._block_num_rows() + _check_repartition_usage_and_stats(ds) + + # Test case where number of rows does not divide equally into num_output_blocks. + ds = ray.data.range(22).repartition(4, shuffle=shuffle) + assert ds.num_blocks() == 4, ds.num_blocks() + assert ds.sum() == sum(range(22)) + if shuffle: + assert ds._block_num_rows() == [6, 6, 6, 4], ds._block_num_rows() + else: + assert ds._block_num_rows() == [5, 6, 5, 6], ds._block_num_rows() + _check_repartition_usage_and_stats(ds) + + # Test case where we do not split on repartitioning. + ds = ray.data.range(10, parallelism=1).repartition(1, shuffle=shuffle) + assert ds.num_blocks() == 1, ds.num_blocks() + assert ds.sum() == sum(range(10)) + assert ds._block_num_rows() == [10], ds._block_num_rows() + _check_repartition_usage_and_stats(ds) def test_read_map_batches_operator_fusion(ray_start_regular_shared, enable_optimizer): @@ -386,8 +424,8 @@ def test_read_map_batches_operator_fusion_compute_tasks_to_actors( # the former comes before the latter. planner = Planner() read_op = Read(ParquetDatasource()) - op = MapBatches(read_op, lambda x: x, compute="tasks") - op = MapBatches(op, lambda x: x, compute="actors") + op = MapBatches(read_op, lambda x: x) + op = MapBatches(op, lambda x: x, compute=ray.data.ActorPoolStrategy()) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -406,7 +444,7 @@ def test_read_map_batches_operator_fusion_compute_read_to_actors( # Test that reads fuse into an actor-based map operator. 
planner = Planner() read_op = Read(ParquetDatasource()) - op = MapBatches(read_op, lambda x: x, compute="actors") + op = MapBatches(read_op, lambda x: x, compute=ray.data.ActorPoolStrategy()) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -425,8 +463,8 @@ def test_read_map_batches_operator_fusion_incompatible_compute( # Test that map operators are not fused when compute strategies are incompatible. planner = Planner() read_op = Read(ParquetDatasource()) - op = MapBatches(read_op, lambda x: x, compute="actors") - op = MapBatches(op, lambda x: x, compute="tasks") + op = MapBatches(read_op, lambda x: x, compute=ray.data.ActorPoolStrategy()) + op = MapBatches(op, lambda x: x) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -478,8 +516,8 @@ class UDF: def __call__(self, x): return x - op = MapBatches(read_op, UDF, compute="actors") - op = MapBatches(op, UDF, compute="actors") + op = MapBatches(read_op, UDF, compute=ray.data.ActorPoolStrategy()) + op = MapBatches(op, UDF, compute=ray.data.ActorPoolStrategy()) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -507,8 +545,8 @@ class UDF2: def __call__(self, x): return x + 1 - op = MapBatches(read_op, UDF, compute="actors") - op = MapBatches(op, UDF2, compute="actors") + op = MapBatches(read_op, UDF, compute=ray.data.ActorPoolStrategy()) + op = MapBatches(op, UDF2, compute=ray.data.ActorPoolStrategy()) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -539,10 +577,18 @@ def __init__(self, a): def __call__(self, x): return x + self._a - op = MapBatches(read_op, UDF, compute="actors", fn_constructor_args=(1,)) - op = MapBatches(op, UDF, compute="actors", fn_constructor_args=(2,)) 
- op = MapBatches(op, UDF, compute="actors", fn_constructor_kwargs={"a": 1}) - op = MapBatches(op, UDF, compute="actors", fn_constructor_kwargs={"a": 2}) + op = MapBatches( + read_op, UDF, compute=ray.data.ActorPoolStrategy(), fn_constructor_args=(1,) + ) + op = MapBatches( + op, UDF, compute=ray.data.ActorPoolStrategy(), fn_constructor_args=(2,) + ) + op = MapBatches( + op, UDF, compute=ray.data.ActorPoolStrategy(), fn_constructor_kwargs={"a": 1} + ) + op = MapBatches( + op, UDF, compute=ray.data.ActorPoolStrategy(), fn_constructor_kwargs={"a": 2} + ) logical_plan = LogicalPlan(op) physical_plan = planner.plan(logical_plan) physical_plan = PhysicalOptimizer().optimize(physical_plan) @@ -562,13 +608,169 @@ def __call__(self, x): assert isinstance(physical_op.input_dependencies[0], InputDataBuffer) +def test_read_map_batches_operator_fusion_with_randomize_blocks_operator( + ray_start_regular_shared, enable_optimizer +): + # Note: We currently do not fuse MapBatches->RandomizeBlocks. + # This test is to ensure that we don't accidentally fuse them. + # There is also an additional optimization rule, under ReorderRandomizeBlocksRule, + # which collapses RandomizeBlocks operators, so we should not be fusing them + # to begin with. + def fn(batch): + return {"id": [x + 1 for x in batch["id"]]} + + n = 10 + ds = ray.data.range(n) + ds = ds.randomize_block_order() + ds = ds.map_batches(fn, batch_size=None) + assert set(extract_values("id", ds.take_all())) == set(range(1, n + 1)) + assert "RandomizeBlocks" not in ds.stats() + assert "DoRead->MapBatches->RandomizeBlocks" not in ds.stats() + assert "DoRead->MapBatches" in ds.stats() + _check_usage_record(["ReadRange", "MapBatches", "RandomizeBlocks"]) + + +def test_read_map_batches_operator_fusion_with_random_shuffle_operator( + ray_start_regular_shared, enable_optimizer, use_push_based_shuffle +): + # Note: we currently only support fusing MapOperator->AllToAllOperator. 
+ def fn(batch): + return {"id": [x + 1 for x in batch["id"]]} + + n = 10 + ds = ray.data.range(n) + ds = ds.map_batches(fn, batch_size=None) + ds = ds.random_shuffle() + assert set(extract_values("id", ds.take_all())) == set(range(1, n + 1)) + assert "DoRead->MapBatches->RandomShuffle" in ds.stats() + _check_usage_record(["ReadRange", "MapBatches", "RandomShuffle"]) + + ds = ray.data.range(n) + ds = ds.random_shuffle() + ds = ds.map_batches(fn, batch_size=None) + assert set(extract_values("id", ds.take_all())) == set(range(1, n + 1)) + # TODO(Scott): Update below assertion after supporting fusion in + # the other direction (AllToAllOperator->MapOperator) + assert "DoRead->RandomShuffle->MapBatches" not in ds.stats() + assert all(op in ds.stats() for op in ("DoRead", "RandomShuffle", "MapBatches")) + _check_usage_record(["ReadRange", "RandomShuffle", "MapBatches"]) + + # Test fusing multiple `map_batches` with multiple `random_shuffle` operations. + ds = ray.data.range(n) + for _ in range(5): + ds = ds.map_batches(fn, batch_size=None) + ds = ds.random_shuffle() + assert set(extract_values("id", ds.take_all())) == set(range(5, n + 5)) + assert f"DoRead->{'MapBatches->' * 5}RandomShuffle" in ds.stats() + + # For interweaved map_batches and random_shuffle operations, we expect to fuse the + # two pairs of MapBatches->RandomShuffle, but not the resulting + # RandomShuffle operators. 
+ ds = ray.data.range(n) + ds = ds.map_batches(fn, batch_size=None) + ds = ds.random_shuffle() + ds = ds.map_batches(fn, batch_size=None) + ds = ds.random_shuffle() + assert set(extract_values("id", ds.take_all())) == set(range(2, n + 2)) + assert "Stage 1 DoRead->MapBatches->RandomShuffle" in ds.stats() + assert "Stage 2 MapBatches->RandomShuffle" + _check_usage_record(["ReadRange", "RandomShuffle", "MapBatches"]) + + +@pytest.mark.parametrize("shuffle", (True, False)) +def test_read_map_batches_operator_fusion_with_repartition_operator( + ray_start_regular_shared, enable_optimizer, shuffle, use_push_based_shuffle +): + def fn(batch): + return {"id": [x + 1 for x in batch["id"]]} + + n = 10 + ds = ray.data.range(n) + ds = ds.map_batches(fn, batch_size=None) + ds = ds.repartition(2, shuffle=shuffle) + assert set(extract_values("id", ds.take_all())) == set(range(1, n + 1)) + + # Operator fusion is only supported for shuffle repartition. + if shuffle: + assert "DoRead->MapBatches->Repartition" in ds.stats() + else: + assert "DoRead->MapBatches->Repartition" not in ds.stats() + assert "DoRead->MapBatches" in ds.stats() + assert "Repartition" in ds.stats() + _check_usage_record(["ReadRange", "MapBatches", "Repartition"]) + + +def test_read_map_batches_operator_fusion_with_sort_operator( + ray_start_regular_shared, enable_optimizer +): + # Note: We currently do not fuse MapBatches->Sort. + # This test is to ensure that we don't accidentally fuse them, until + # we implement it later. + def fn(batch): + return {"id": [x + 1 for x in batch["id"]]} + + n = 10 + ds = ray.data.range(n) + ds = ds.map_batches(fn, batch_size=None) + ds = ds.sort("id") + assert extract_values("id", ds.take_all()) == list(range(1, n + 1)) + # TODO(Scott): update the below assertions after we support fusion. 
+ assert "DoRead->MapBatches->Sort" not in ds.stats() + assert "DoRead->MapBatches" in ds.stats() + assert "Sort" in ds.stats() + _check_usage_record(["ReadRange", "MapBatches", "Sort"]) + + +def test_read_map_batches_operator_fusion_with_aggregate_operator( + ray_start_regular_shared, enable_optimizer +): + from ray.data.aggregate import AggregateFn + + # Note: We currently do not fuse MapBatches->Aggregate. + # This test is to ensure that we don't accidentally fuse them, until + # we implement it later. + def fn(batch): + return {"id": [x % 2 for x in batch["id"]]} + + n = 100 + grouped_ds = ray.data.range(n).map_batches(fn, batch_size=None).groupby("id") + agg_ds = grouped_ds.aggregate( + AggregateFn( + init=lambda k: [0, 0], + accumulate_row=lambda a, r: [a[0] + r["id"], a[1] + 1], + merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]], + finalize=lambda a: a[0] / a[1], + name="foo", + ), + ) + agg_ds.take_all() == [{"id": 0, "foo": 0.0}, {"id": 1, "foo": 1.0}] + # TODO(Scott): update the below assertions after we support fusion. 
+ assert "DoRead->MapBatches->Aggregate" not in agg_ds.stats() + assert "DoRead->MapBatches" in agg_ds.stats() + assert "Aggregate" in agg_ds.stats() + _check_usage_record(["ReadRange", "MapBatches", "Aggregate"]) + + def test_read_map_chain_operator_fusion_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(10, parallelism=2) - ds = ds.filter(lambda x: x % 2 == 0) - ds = ds.map(lambda x: x + 1) - ds = ds.map_batches(lambda batch: [2 * x for x in batch], batch_size=None) - ds = ds.flat_map(lambda x: [-x, x]) - assert ds.take_all() == [-2, 2, -6, 6, -10, 10, -14, 14, -18, 18] + ds = ds.filter(lambda x: x["id"] % 2 == 0) + ds = ds.map(column_udf("id", lambda x: x + 1)) + ds = ds.map_batches( + lambda batch: {"id": [2 * x for x in batch["id"]]}, batch_size=None + ) + ds = ds.flat_map(lambda x: [{"id": -x["id"]}, {"id": x["id"]}]) + assert extract_values("id", ds.take_all()) == [ + -2, + 2, + -6, + 6, + -10, + 10, + -14, + 14, + -18, + 18, + ] name = "DoRead->Filter->MapRows->MapBatches->FlatMap:" assert name in ds.stats() _check_usage_record(["ReadRange", "Filter", "MapRows", "MapBatches", "FlatMap"]) @@ -620,8 +822,8 @@ def test_sort_e2e( ): ds = ray.data.range(100, parallelism=4) ds = ds.random_shuffle() - ds = ds.sort() - assert ds.take_all() == list(range(100)) + ds = ds.sort("id") + assert extract_values("id", ds.take_all()) == list(range(100)) _check_usage_record(["ReadRange", "RandomShuffle", "Sort"]) # TODO: write_XXX and from_XXX are not supported yet in new execution plan. 
@@ -648,13 +850,11 @@ def test_sort_validate_keys( enable_optimizer, ): ds = ray.data.range(10) - assert ds.sort().take_all() == list(range(10)) + assert extract_values("id", ds.sort("id").take_all()) == list(range(10)) invalid_col_name = "invalid_column" with pytest.raises( - ValueError, - match=f"String key '{invalid_col_name}' requires datastream format to be " - "'arrow' or 'pandas', was 'simple'", + ValueError, match=f"The column '{invalid_col_name}' does not exist" ): ds.sort(invalid_col_name).take_all() @@ -679,16 +879,6 @@ def test_sort_validate_keys( ): ds_named.sort(invalid_col_name).take_all() - def dummy_sort_fn(x): - return x - - with pytest.raises( - ValueError, - match=f"Callable key '{dummy_sort_fn}' requires datastream format to be " - "'simple'", - ): - ds_named.sort(dummy_sort_fn).take_all() - def test_aggregate_operator(ray_start_regular_shared, enable_optimizer): planner = Planner() @@ -712,11 +902,11 @@ def test_aggregate_e2e( enable_optimizer, use_push_based_shuffle, ): - ds = ray.data.range_table(100, parallelism=4) - ds = ds.groupby("value").count() + ds = ray.data.range(100, parallelism=4) + ds = ds.groupby("id").count() assert ds.count() == 100 - for idx, row in enumerate(ds.sort("value").iter_rows()): - assert row.as_pydict() == {"value": idx, "count()": 1} + for idx, row in enumerate(ds.sort("id").iter_rows()): + assert row == {"id": idx, "count()": 1} _check_usage_record(["ReadRange", "Aggregate"]) @@ -725,14 +915,9 @@ def test_aggregate_validate_keys( enable_optimizer, ): ds = ray.data.range(10) - # Test case with key=None, i.e. grouped into a single group. 
- assert ds.groupby(key=None).count().take_all() == [(10,)] - invalid_col_name = "invalid_column" with pytest.raises( - ValueError, - match=f"String key '{invalid_col_name}' requires datastream format to be " - "'arrow' or 'pandas', was 'simple'", + ValueError, match=f"The column '{invalid_col_name}' does not exist" ): ds.groupby(invalid_col_name).count() @@ -764,16 +949,6 @@ def test_aggregate_validate_keys( ): ds_named.groupby(invalid_col_name).count() - def dummy_sort_fn(x): - return x - - with pytest.raises( - ValueError, - match=f"Callable key '{dummy_sort_fn}' requires datastream format to be " - "'simple'", - ): - ds_named.groupby(dummy_sort_fn).count() - def test_zip_operator(ray_start_regular_shared, enable_optimizer): planner = Planner() @@ -797,9 +972,11 @@ def test_zip_operator(ray_start_regular_shared, enable_optimizer): def test_zip_e2e(ray_start_regular_shared, enable_optimizer, num_blocks1, num_blocks2): n = 12 ds1 = ray.data.range(n, parallelism=num_blocks1) - ds2 = ray.data.range(n, parallelism=num_blocks2).map(lambda x: x + 1) + ds2 = ray.data.range(n, parallelism=num_blocks2).map( + column_udf("id", lambda x: x + 1) + ) ds = ds1.zip(ds2) - assert ds.take() == list(zip(range(n), range(1, n + 1))) + assert ds.take() == named_values(["id", "id_1"], zip(range(n), range(1, n + 1))) _check_usage_record(["ReadRange", "Zip"]) @@ -978,7 +1155,7 @@ def test_from_numpy_refs_e2e(ray_start_regular_shared, enable_optimizer): arr2 = np.expand_dims(np.arange(4, 8), axis=1) ds = ray.data.from_numpy_refs([ray.put(arr1), ray.put(arr2)]) - values = np.stack(ds.take(8)) + values = np.stack(extract_values("data", ds.take(8))) np.testing.assert_array_equal(values, np.concatenate((arr1, arr2))) # Check that conversion task is included in stats. 
assert "FromNumpyRefs" in ds.stats() @@ -987,7 +1164,7 @@ def test_from_numpy_refs_e2e(ray_start_regular_shared, enable_optimizer): # Test chaining multiple operations ds2 = ds.map_batches(lambda x: x) - values = np.stack(ds2.take(8)) + values = np.stack(extract_values("data", ds2.take(8))) np.testing.assert_array_equal(values, np.concatenate((arr1, arr2))) assert "MapBatches" in ds2.stats() assert "FromNumpyRefs" in ds2.stats() @@ -996,7 +1173,7 @@ def test_from_numpy_refs_e2e(ray_start_regular_shared, enable_optimizer): # Test from single NumPy ndarray. ds = ray.data.from_numpy_refs(ray.put(arr1)) - values = np.stack(ds.take(4)) + values = np.stack(extract_values("data", ds.take(4))) np.testing.assert_array_equal(values, arr1) # Check that conversion task is included in stats. assert "FromNumpyRefs" in ds.stats() @@ -1132,7 +1309,7 @@ def test_from_tf_e2e(ray_start_regular_shared, enable_optimizer): ray_dataset = ray.data.from_tf(tf_dataset) - actual_data = ray_dataset.take_all() + actual_data = extract_values("item", ray_dataset.take_all()) expected_data = list(tf_dataset) assert len(actual_data) == len(expected_data) for (expected_features, expected_label), (actual_features, actual_label) in zip( @@ -1172,7 +1349,7 @@ def test_from_torch_e2e(ray_start_regular_shared, enable_optimizer, tmp_path): expected_data = list(torch_dataset) actual_data = list(ray_dataset.take_all()) - assert actual_data == expected_data + assert extract_values("item", actual_data) == expected_data # Check that metadata fetch is included in stats. 
assert "FromItems" in ray_dataset.stats() @@ -1185,7 +1362,7 @@ def test_blocks_to_input_buffer_op_name( ray_start_regular_shared, enable_streaming_executor, ): - ds: ray.data.Datastream = ray.data.range(10) + ds: ray.data.Dataset = ray.data.range(10) blocks, _, _ = ds._plan._optimize() assert hasattr(blocks, "_tasks"), blocks physical_op = _blocks_to_input_buffer(blocks, owns_blocks=False) @@ -1202,7 +1379,7 @@ def test_execute_to_legacy_block_list( assert ds._plan._snapshot_stats is None for i, row in enumerate(ds.iter_rows()): - assert row == i + assert row["id"] == i assert ds._plan._snapshot_stats is not None assert "DoRead" in ds._plan._snapshot_stats.stages @@ -1231,12 +1408,13 @@ def test_streaming_executor( ): ds = ray.data.range(100, parallelism=4) ds = ds.map_batches(lambda x: x) - ds = ds.filter(lambda x: x > 0) + ds = ds.filter(lambda x: x["id"] > 0) ds = ds.random_shuffle() ds = ds.map_batches(lambda x: x) result = [] for batch in ds.iter_batches(batch_size=3): + batch = batch["id"] assert len(batch) == 3, batch result.extend(batch) assert sorted(result) == list(range(1, 100)), result diff --git a/python/ray/data/tests/test_executor_resource_management.py b/python/ray/data/tests/test_executor_resource_management.py index 4a26cb58e084..d00b28e5b5c3 100644 --- a/python/ray/data/tests/test_executor_resource_management.py +++ b/python/ray/data/tests/test_executor_resource_management.py @@ -13,7 +13,7 @@ from ray.data.tests.conftest import * # noqa -SMALL_STR = "hello" * 12 +SMALL_STR = "hello" * 120 def test_resource_utils(ray_start_10_cpus_shared): @@ -97,7 +97,7 @@ def test_task_pool_resource_reporting(ray_start_10_cpus_shared): usage = op.current_resource_usage() assert usage.cpu == 2, usage assert usage.gpu == 0, usage - assert usage.object_store_memory == pytest.approx(128, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(1280, rel=0.5), usage def test_task_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared): @@ -119,20 
+119,20 @@ def test_task_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared): assert usage.cpu == 0, usage assert usage.gpu == 0, usage # Queued bundles (in bundler) still count against object storage usage. - assert usage.object_store_memory == pytest.approx(80, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(800, rel=0.5), usage op.add_input(input_op.get_next(), 0) usage = op.current_resource_usage() # No tasks submitted yet due to bundling. assert usage.cpu == 0, usage assert usage.gpu == 0, usage # Queued bundles (in bundler) still count against object storage usage. - assert usage.object_store_memory == pytest.approx(160, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(1600, rel=0.5), usage op.add_input(input_op.get_next(), 0) usage = op.current_resource_usage() # Task has now been submitted since we've met the minimum bundle size. assert usage.cpu == 1, usage assert usage.gpu == 0, usage - assert usage.object_store_memory == pytest.approx(240, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(2400, rel=0.5), usage def test_actor_pool_resource_reporting(ray_start_10_cpus_shared): @@ -163,13 +163,13 @@ def test_actor_pool_resource_reporting(ray_start_10_cpus_shared): assert usage.cpu == 2, usage assert usage.gpu == 0, usage # Queued bundles still count against object store usage. - assert usage.object_store_memory == pytest.approx((i + 1) * 80, rel=0.5), usage + assert usage.object_store_memory == pytest.approx((i + 1) * 800, rel=0.5), usage # Pool is still idle while waiting for actors to start. usage = op.current_resource_usage() assert usage.cpu == 2, usage assert usage.gpu == 0, usage # Queued bundles still count against object store usage. - assert usage.object_store_memory == pytest.approx(320, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(3200, rel=0.5), usage # Wait for actors to start. 
work_refs = op.get_work_refs() @@ -189,7 +189,7 @@ def test_actor_pool_resource_reporting(ray_start_10_cpus_shared): assert usage.cpu == 2, usage assert usage.gpu == 0, usage # Now that tasks have been submitted, object store memory is accounted for. - assert usage.object_store_memory == pytest.approx(256, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(2560, rel=0.5), usage # Indicate that no more inputs will arrive. op.inputs_done() @@ -206,7 +206,7 @@ def test_actor_pool_resource_reporting(ray_start_10_cpus_shared): usage = op.current_resource_usage() assert usage.cpu == 0, usage assert usage.gpu == 0, usage - assert usage.object_store_memory == pytest.approx(550, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(5500, rel=0.5), usage # Consume task outputs. while op.has_next(): @@ -248,13 +248,13 @@ def test_actor_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared): assert usage.cpu == 2, usage assert usage.gpu == 0, usage # Queued bundles still count against object store usage. - assert usage.object_store_memory == pytest.approx((i + 1) * 80, rel=0.5), usage + assert usage.object_store_memory == pytest.approx((i + 1) * 800, rel=0.5), usage # Pool is still idle while waiting for actors to start. usage = op.current_resource_usage() assert usage.cpu == 2, usage assert usage.gpu == 0, usage # Queued bundles still count against object store usage. - assert usage.object_store_memory == pytest.approx(320, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(3200, rel=0.5), usage # Wait for actors to start. 
work_refs = op.get_work_refs() @@ -273,7 +273,7 @@ def test_actor_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared): usage = op.current_resource_usage() assert usage.cpu == 2, usage assert usage.gpu == 0, usage - assert usage.object_store_memory == pytest.approx(320, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(3200, rel=0.5), usage # Indicate that no more inputs will arrive. op.inputs_done() @@ -290,7 +290,7 @@ def test_actor_pool_resource_reporting_with_bundling(ray_start_10_cpus_shared): usage = op.current_resource_usage() assert usage.cpu == 0, usage assert usage.gpu == 0, usage - assert usage.object_store_memory == pytest.approx(550, rel=0.5), usage + assert usage.object_store_memory == pytest.approx(5500, rel=0.5), usage # Consume task outputs. while op.has_next(): diff --git a/python/ray/data/tests/test_formats.py b/python/ray/data/tests/test_formats.py index 5658e5a425f5..5790eff6a0a4 100644 --- a/python/ray/data/tests/test_formats.py +++ b/python/ray/data/tests/test_formats.py @@ -1,5 +1,5 @@ import os -from typing import List, Union +from typing import List import pandas as pd import pyarrow as pa @@ -13,7 +13,6 @@ import ray from ray._private.test_utils import wait_for_condition -from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.execution.interfaces import TaskContext from ray.data.block import Block, BlockAccessor from ray.data.datasource import ( @@ -24,6 +23,7 @@ from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa +from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa from ray.types import ObjectRef from typing import Iterable @@ -82,17 +82,7 @@ def test_from_arrow_refs(ray_start_regular_shared): def test_to_arrow_refs(ray_start_regular_shared): n = 5 - - # Zero-copy. 
- df = pd.DataFrame({"value": list(range(n))}) - ds = ray.data.range_table(n) - dfds = pd.concat( - [t.to_pandas() for t in ray.get(ds.to_arrow_refs())], ignore_index=True - ) - assert df.equals(dfds) - - # Conversion. - df = pd.DataFrame({"value": list(range(n))}) + df = pd.DataFrame({"id": list(range(n))}) ds = ray.data.range(n) dfds = pd.concat( [t.to_pandas() for t in ray.get(ds.to_arrow_refs())], ignore_index=True @@ -105,7 +95,7 @@ def test_get_internal_block_refs(ray_start_regular_shared): assert len(blocks) == 10 out = [] for b in ray.get(blocks): - out.extend(list(BlockAccessor.for_block(b).iter_rows())) + out.extend(extract_values("id", BlockAccessor.for_block(b).iter_rows(True))) out = sorted(out) assert out == list(range(10)), out @@ -203,7 +193,7 @@ def test_from_tf(ray_start_regular_shared): ray_dataset = ray.data.from_tf(tf_dataset) - actual_data = ray_dataset.take_all() + actual_data = extract_values("item", ray_dataset.take_all()) expected_data = list(tf_dataset) assert len(actual_data) == len(expected_data) for (expected_features, expected_label), (actual_features, actual_label) in zip( @@ -219,11 +209,11 @@ def test_from_torch(shutdown_only, tmp_path): ray_dataset = ray.data.from_torch(torch_dataset) - actual_data = list(ray_dataset.take_all()) + actual_data = extract_values("item", list(ray_dataset.take_all())) assert actual_data == expected_data -class NodeLoggerOutputDatasource(Datasource[Union[ArrowRow, int]]): +class NodeLoggerOutputDatasource(Datasource): """A writable datasource that logs node IDs of write tasks, for testing.""" def __init__(self): @@ -329,13 +319,8 @@ def test_read_s3_file_error(shutdown_only, s3_path): # tests should only be carefully reordered to retain this invariant! 
-def test_get_read_tasks(ray_start_cluster): - ray.shutdown() - cluster = ray_start_cluster - cluster.add_node(num_cpus=4) - cluster.add_node(num_cpus=4) - cluster.wait_for_nodes() - ray.init(cluster.address) +def test_get_read_tasks(shutdown_only): + ray.init() head_node_id = ray.get_runtime_context().get_node_id() @@ -344,11 +329,9 @@ def test_get_read_tasks(ray_start_cluster): # Verify `_get_read_tasks` being executed on same node (head node). def verify_get_read_tasks(): - from ray.experimental.state.api import list_tasks + from ray.util.state import list_tasks - task_states = list_tasks( - address=cluster.address, filters=[("name", "=", "_get_read_tasks")] - ) + task_states = list_tasks(filters=[("name", "=", "_get_read_tasks")]) # Verify only one task being executed on same node. assert len(task_states) == 1 assert task_states[0]["name"] == "_get_read_tasks" diff --git a/python/ray/data/tests/test_image.py b/python/ray/data/tests/test_image.py index 4b0c9e265f82..17a3f911de7f 100644 --- a/python/ray/data/tests/test_image.py +++ b/python/ray/data/tests/test_image.py @@ -162,7 +162,7 @@ def test_e2e_prediction(self, shutdown_only): transform = transforms.ToTensor() def preprocess(batch: Dict[str, np.ndarray]): - return np.stack([transform(image) for image in batch["image"]]) + return {"out": np.stack([transform(image) for image in batch["image"]])} dataset = dataset.map_batches(preprocess, batch_format="numpy") diff --git a/python/ray/data/tests/test_iterator.py b/python/ray/data/tests/test_iterator.py index e6b57154c0fd..0f7f9e02edc2 100644 --- a/python/ray/data/tests/test_iterator.py +++ b/python/ray/data/tests/test_iterator.py @@ -29,7 +29,8 @@ def test_basic_dataset(ray_start_regular_shared): for _ in range(2): result = [] for batch in it.iter_batches(): - result += batch + batch = batch["id"] + result += batch.tolist() assert result == list(range(100)) # TODO(swang): This check currently fails nondeterministically because @@ -44,6 +45,7 @@ def 
test_basic_dataset_iter_rows(ray_start_regular_shared): for _ in range(2): result = [] for row in it.iter_rows(): + row = row["id"] result.append(row) assert result == list(range(100)) @@ -59,6 +61,7 @@ def test_basic_dataset_pipeline(ray_start_regular_shared): for _ in range(2): result = [] for batch in it.iter_batches(): + batch = batch["id"].tolist() result += batch assert result == list(range(100)) @@ -71,6 +74,7 @@ def test_basic_dataset_pipeline_iter_rows(ray_start_regular_shared): for _ in range(2): result = [] for row in it.iter_rows(): + row = row["id"] result.append(row) assert result == list(range(100)) @@ -78,9 +82,9 @@ def test_basic_dataset_pipeline_iter_rows(ray_start_regular_shared): def test_tf_conversion(ray_start_regular_shared): - ds = ray.data.range_table(5) + ds = ray.data.range(5) it = ds.iterator() - tf_dataset = it.to_tf("value", "value") + tf_dataset = it.to_tf("id", "id") for i, row in enumerate(tf_dataset): assert all(row[0] == i) assert all(row[1] == i) @@ -89,30 +93,30 @@ def test_tf_conversion(ray_start_regular_shared): def test_tf_e2e(ray_start_regular_shared): - ds = ray.data.range_table(5) + ds = ray.data.range(5) it = ds.iterator() model = build_model() - model.fit(it.to_tf("value", "value"), epochs=3) + model.fit(it.to_tf("id", "id"), epochs=3) def test_tf_e2e_pipeline(ray_start_regular_shared): - ds = ray.data.range_table(5).repeat(2) + ds = ray.data.range(5).repeat(2) it = ds.iterator() model = build_model() - model.fit(it.to_tf("value", "value"), epochs=2) + model.fit(it.to_tf("id", "id"), epochs=2) - ds = ray.data.range_table(5).repeat(2) + ds = ray.data.range(5).repeat(2) it = ds.iterator() model = build_model() # 3 epochs fails since we only repeated twice. 
with pytest.raises(Exception, match=r"generator raised StopIteration"): - model.fit(it.to_tf("value", "value"), epochs=3) + model.fit(it.to_tf("id", "id"), epochs=3) def test_tf_conversion_pipeline(ray_start_regular_shared): - ds = ray.data.range_table(5).repeat(2) + ds = ray.data.range(5).repeat(2) it = ds.iterator() - tf_dataset = it.to_tf("value", "value") + tf_dataset = it.to_tf("id", "id") for i, row in enumerate(tf_dataset): assert all(row[0] == i) assert all(row[1] == i) @@ -120,7 +124,7 @@ def test_tf_conversion_pipeline(ray_start_regular_shared): assert isinstance(row[1], tf.Tensor) # Repeated twice. - tf_dataset = it.to_tf("value", "value") + tf_dataset = it.to_tf("id", "id") for i, row in enumerate(tf_dataset): assert all(row[0] == i) assert all(row[1] == i) @@ -129,32 +133,32 @@ def test_tf_conversion_pipeline(ray_start_regular_shared): # Fails on third try. with pytest.raises(Exception, match=r"generator raised StopIteration"): - tf_dataset = it.to_tf("value", "value") + tf_dataset = it.to_tf("id", "id") for _ in tf_dataset: pass def test_torch_conversion(ray_start_regular_shared): - ds = ray.data.range_table(5) + ds = ray.data.range(5) it = ds.iterator() for batch in it.iter_torch_batches(): - assert isinstance(batch["value"], torch.Tensor) - assert batch["value"].tolist() == list(range(5)) + assert isinstance(batch["id"], torch.Tensor) + assert batch["id"].tolist() == list(range(5)) def test_torch_conversion_pipeline(ray_start_regular_shared): - ds = ray.data.range_table(5).repeat(2) + ds = ray.data.range(5).repeat(2) it = ds.iterator() # First epoch. for batch in it.iter_torch_batches(): - assert isinstance(batch["value"], torch.Tensor) - assert batch["value"].tolist() == list(range(5)) + assert isinstance(batch["id"], torch.Tensor) + assert batch["id"].tolist() == list(range(5)) # Second epoch. 
for batch in it.iter_torch_batches(): - assert isinstance(batch["value"], torch.Tensor) - assert batch["value"].tolist() == list(range(5)) + assert isinstance(batch["id"], torch.Tensor) + assert batch["id"].tolist() == list(range(5)) # Fails on third iteration. with pytest.raises(Exception, match=r"generator raised StopIteration"): @@ -164,9 +168,9 @@ def test_torch_conversion_pipeline(ray_start_regular_shared): def test_torch_conversion_collate_fn(ray_start_regular_shared): def collate_fn(batch: Dict[str, np.ndarray]): - return torch.as_tensor(batch["value"] + 5) + return torch.as_tensor(batch["id"] + 5) - ds = ray.data.range_table(5) + ds = ray.data.range(5) it = ds.iterator() for batch in it.iter_torch_batches(collate_fn=collate_fn): assert isinstance(batch, torch.Tensor) diff --git a/python/ray/data/tests/test_logger.py b/python/ray/data/tests/test_logger.py index e077159d56fa..c15da873250d 100644 --- a/python/ray/data/tests/test_logger.py +++ b/python/ray/data/tests/test_logger.py @@ -8,18 +8,18 @@ from datetime import datetime import ray -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.dataset_logger import DatasetLogger -def test_datastream_logger(shutdown_only): +def test_dataset_logger(shutdown_only): ray.init() log_name, msg = "test_name", "test_message_1234" - logger = DatastreamLogger(log_name) + logger = DatasetLogger(log_name) logger.get_logger().info(msg) # Read from log file, and parse each component of emitted log row session_dir = ray._private.worker._global_node.get_session_dir_path() - log_file_path = os.path.join(session_dir, DatastreamLogger.DEFAULT_DATASET_LOG_PATH) + log_file_path = os.path.join(session_dir, DatasetLogger.DEFAULT_DATASET_LOG_PATH) with open(log_file_path, "r") as f: raw_logged_msg = f.read() ( diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py index c71b4d973cd1..4a7c2ca9cc0e 100644 --- a/python/ray/data/tests/test_map.py +++ 
b/python/ray/data/tests/test_map.py @@ -6,6 +6,7 @@ import time from typing import Iterator +import numpy as np import pandas as pd import pyarrow as pa import pyarrow.parquet as pq @@ -16,6 +17,7 @@ from ray.data.block import BlockAccessor from ray.data.context import DataContext from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values, column_udf from ray.tests.conftest import * # noqa @@ -32,44 +34,60 @@ def test_basic_actors(shutdown_only, pipelined): n = 5 ds = ray.data.range(n) ds = maybe_pipeline(ds, pipelined) - assert sorted(ds.map(lambda x: x + 1, compute="actors").take()) == list( - range(1, n + 1) - ) + assert sorted( + extract_values( + "id", + ds.map( + column_udf("id", lambda x: x + 1), compute=ray.data.ActorPoolStrategy() + ).take(), + ) + ) == list(range(1, n + 1)) # Should still work even if num actors > num cpus. ds = ray.data.range(n) ds = maybe_pipeline(ds, pipelined) assert sorted( - ds.map(lambda x: x + 1, compute=ray.data.ActorPoolStrategy(size=4)).take() + extract_values( + "id", + ds.map( + column_udf("id", lambda x: x + 1), + compute=ray.data.ActorPoolStrategy(size=4), + ).take(), + ) ) == list(range(1, n + 1)) # Test setting custom max inflight tasks. ds = ray.data.range(10, parallelism=5) ds = maybe_pipeline(ds, pipelined) assert sorted( - ds.map( - lambda x: x + 1, - compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=3), - ).take() + extract_values( + "id", + ds.map( + column_udf("id", lambda x: x + 1), + compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=3), + ).take(), + ) ) == list(range(1, 11)) # Test invalid max tasks inflight arg. with pytest.raises(ValueError): ray.data.range(10).map( - lambda x: x, + column_udf("id", lambda x: x), compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=0), ) # Test min no more than max check. 
with pytest.raises(ValueError): ray.data.range(10).map( - lambda x: x, compute=ray.data.ActorPoolStrategy(min_size=8, max_size=4) + column_udf("id", lambda x: x), + compute=ray.data.ActorPoolStrategy(min_size=8, max_size=4), ) # Test conflicting args. with pytest.raises(ValueError): ray.data.range(10).map( - lambda x: x, compute=ray.data.ActorPoolStrategy(min_size=8, size=4) + column_udf("id", lambda x: x), + compute=ray.data.ActorPoolStrategy(min_size=8, size=4), ) @@ -84,7 +102,7 @@ def __init__(self): def __call__(self, x): r = self.num_reuses self.num_reuses += 1 - return r + return {"id": np.array([r])} # Need to specify compute explicitly. with pytest.raises(ValueError): @@ -92,7 +110,7 @@ def __call__(self, x): # Need to specify actor compute strategy. with pytest.raises(ValueError): - ds.map(StatefulFn, compute="tasks").take() + ds.map(StatefulFn).take() # Need to specify compute explicitly. with pytest.raises(ValueError): @@ -100,7 +118,7 @@ def __call__(self, x): # Need to specify actor compute strategy. with pytest.raises(ValueError): - ds.flat_map(StatefulFn, compute="tasks") + ds.flat_map(StatefulFn) # Need to specify compute explicitly. with pytest.raises(ValueError): @@ -108,11 +126,11 @@ def __call__(self, x): # Need to specify actor compute strategy. 
with pytest.raises(ValueError): - ds.filter(StatefulFn, compute="tasks") + ds.filter(StatefulFn) # map - actor_reuse = ds.map(StatefulFn, compute="actors").take() - assert sorted(actor_reuse) == list(range(10)), actor_reuse + actor_reuse = ds.map(StatefulFn, compute=ray.data.ActorPoolStrategy()).take() + assert sorted(extract_values("id", actor_reuse)) == list(range(10)), actor_reuse class StatefulFn: def __init__(self): @@ -121,14 +139,30 @@ def __init__(self): def __call__(self, x): r = self.num_reuses self.num_reuses += 1 - return [r] + return [{"id": r}] # flat map - actor_reuse = ds.flat_map(StatefulFn, compute="actors").take() + actor_reuse = extract_values( + "id", ds.flat_map(StatefulFn, compute=ray.data.ActorPoolStrategy()).take() + ) assert sorted(actor_reuse) == list(range(10)), actor_reuse + class StatefulFn: + def __init__(self): + self.num_reuses = 0 + + def __call__(self, x): + r = self.num_reuses + self.num_reuses += 1 + return {"id": np.array([r])} + # map batches - actor_reuse = ds.map_batches(StatefulFn, batch_size=1, compute="actors").take() + actor_reuse = extract_values( + "id", + ds.map_batches( + StatefulFn, batch_size=1, compute=ray.data.ActorPoolStrategy() + ).take(), + ) assert sorted(actor_reuse) == list(range(10)), actor_reuse class StatefulFn: @@ -141,7 +175,7 @@ def __call__(self, x): return r > 0 # filter - actor_reuse = ds.filter(StatefulFn, compute="actors").take() + actor_reuse = ds.filter(StatefulFn, compute=ray.data.ActorPoolStrategy()).take() assert len(actor_reuse) == 9, actor_reuse @@ -154,11 +188,14 @@ class StatefulFn: def __call__(self, x): thread_id = threading.get_ident() assert threading.current_thread() is not threading.main_thread() - return [thread_id] + return {"tid": np.array([thread_id])} - thread_ids = ds.map_batches( - StatefulFn, compute="actors", max_concurrency=2 - ).take_all() + thread_ids = extract_values( + "tid", + ds.map_batches( + StatefulFn, compute=ray.data.ActorPoolStrategy(), max_concurrency=2 + 
).take_all(), + ) # Make sure user's UDF is not running concurrently. assert len(set(thread_ids)) == 1 @@ -167,7 +204,9 @@ def __call__(self, x): raise ValueError with pytest.raises(ValueError): - ds.map_batches(ErrorFn, compute="actors", max_concurrency=2).take_all() + ds.map_batches( + ErrorFn, compute=ray.data.ActorPoolStrategy(), max_concurrency=2 + ).take_all() def test_transform_failure(shutdown_only): @@ -186,25 +225,32 @@ def mapper(x): def test_flat_map_generator(ray_start_regular_shared): ds = ray.data.range(3) - def map_generator(item: int) -> Iterator[int]: + def map_generator(item: dict) -> Iterator[int]: for _ in range(2): - yield item + 1 + yield {"id": item["id"] + 1} - assert sorted(ds.flat_map(map_generator).take()) == [1, 1, 2, 2, 3, 3] + assert sorted(extract_values("id", ds.flat_map(map_generator).take())) == [ + 1, + 1, + 2, + 2, + 3, + 3, + ] def test_add_column(ray_start_regular_shared): ds = ray.data.range(5).add_column("foo", lambda x: 1) - assert ds.take(1) == [{"value": 0, "foo": 1}] + assert ds.take(1) == [{"id": 0, "foo": 1}] - ds = ray.data.range_table(5).add_column("foo", lambda x: x["value"] + 1) - assert ds.take(1) == [{"value": 0, "foo": 1}] + ds = ray.data.range(5).add_column("foo", lambda x: x["id"] + 1) + assert ds.take(1) == [{"id": 0, "foo": 1}] - ds = ray.data.range_table(5).add_column("value", lambda x: x["value"] + 1) - assert ds.take(2) == [{"value": 1}, {"value": 2}] + ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1) + assert ds.take(2) == [{"id": 1}, {"id": 2}] with pytest.raises(ValueError): - ds = ray.data.range(5).add_column("value", 0) + ds = ray.data.range(5).add_column("id", 0) def test_drop_columns(ray_start_regular_shared, tmp_path): @@ -254,11 +300,6 @@ def test_select_columns(ray_start_regular_shared): with pytest.raises(KeyError): each_ds.select_columns(cols=["col1", "col2", "dummy_col"]).materialize() - # Test simple - ds3 = ray.data.range(10) - with pytest.raises(ValueError): - 
ds3.select_columns(cols=[]).materialize() - def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_data_context): ctx = DataContext.get_current() @@ -267,7 +308,9 @@ def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_data_cont # Test input validation ds = ray.data.range(5) with pytest.raises(ValueError): - ds.map_batches(lambda x: x + 1, batch_format="pyarrow", batch_size=-1).take() + ds.map_batches( + column_udf("id", lambda x: x + 1), batch_format="pyarrow", batch_size=-1 + ).take() # Set up. df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) @@ -301,21 +344,23 @@ def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_data_cont # The pandas column is "value", and it originally has rows from 0~299. # After the map batch, it should have 1~300. row = ds_list[i] - assert row["value"] == i + 1 + assert row["id"] == i + 1 assert ds.count() == 300 # Test the lambda returns different types than the batch_format # pandas => list block ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1) - ds_list = ds2.take() + ds2 = ds.map_batches(lambda df: {"id": np.array([1])}, batch_size=1) + ds_list = extract_values("id", ds2.take()) assert ds_list == [1, 1, 1] assert ds.count() == 3 # pyarrow => list block ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1, batch_format="pyarrow") - ds_list = ds2.take() + ds2 = ds.map_batches( + lambda df: {"id": np.array([1])}, batch_size=1, batch_format="pyarrow" + ) + ds_list = extract_values("id", ds2.take()) assert ds_list == [1, 1, 1] assert ds.count() == 3 @@ -351,14 +396,13 @@ def __call__(self, df): with pytest.raises(ValueError): # CallableClass not supported for task compute strategy. - ds.map_batches(Foo, compute="tasks") + ds.map_batches(Foo) with pytest.raises(ValueError): # fn_constructor_args and fn_constructor_kwargs only supported for actor # compute strategy. 
ds.map_batches( lambda x: x, - compute="tasks", fn_constructor_args=(1,), fn_constructor_kwargs={"a": 1}, ) @@ -368,7 +412,7 @@ def __call__(self, df): # class UDFs. ds.map_batches( lambda x: x, - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=(1,), fn_constructor_kwargs={"a": 1}, ) @@ -450,7 +494,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=(put(1),), ) ds_list = ds2.take() @@ -473,7 +517,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_kwargs={"b": put(2)}, ) ds_list = ds2.take() @@ -498,7 +542,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=(put(1),), fn_constructor_kwargs={"b": put(2)}, ) @@ -518,7 +562,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -526,7 +570,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -547,7 +591,7 @@ def __call__(self, x): lambda df, a, b=None: b * df + a, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_args=(put(1),), fn_kwargs={"b": put(2)}, ) @@ -555,7 +599,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -602,7 +646,9 @@ def test_map_batches_actors_preserves_order(shutdown_only): ray.init(num_cpus=2) # 
Test that actor compute model preserves block order. ds = ray.data.range(10, parallelism=5) - assert ds.map_batches(lambda x: x, compute="actors").take() == list(range(10)) + assert extract_values( + "id", ds.map_batches(lambda x: x, compute=ray.data.ActorPoolStrategy()).take() + ) == list(range(10)) @pytest.mark.parametrize( @@ -622,16 +668,16 @@ def test_map_batches_batch_mutation( # Test that batch mutation works without encountering a read-only error (e.g. if the # batch is a zero-copy view on data in the object store). def mutate(df): - df["value"] += 1 + df["id"] += 1 return df - ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) + ds = ray.data.range(num_rows, parallelism=num_blocks).repartition(num_blocks) # Convert to Pandas blocks. ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) # Apply UDF that mutates the batches. ds = ds.map_batches(mutate, batch_size=batch_size) - assert [row["value"] for row in ds.iter_rows()] == list(range(1, num_rows + 1)) + assert [row["id"] for row in ds.iter_rows()] == list(range(1, num_rows + 1)) @pytest.mark.parametrize( @@ -649,10 +695,10 @@ def test_map_batches_batch_zero_copy( def mutate(df): # Check that batch is read-only. assert not df.values.flags.writeable - df["value"] += 1 + df["id"] += 1 return df - ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) + ds = ray.data.range(num_rows, parallelism=num_blocks).repartition(num_blocks) # Convert to Pandas blocks. ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) ds = ds.materialize() @@ -660,7 +706,9 @@ def mutate(df): # Apply UDF that mutates the batches, which should fail since the batch is # read-only. 
with pytest.raises(ValueError, match="tried to mutate a zero-copy read-only batch"): - ds = ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True) + ds = ds.map_batches( + mutate, batch_format="pandas", batch_size=batch_size, zero_copy_batch=True + ) ds.materialize() @@ -755,13 +803,13 @@ def test_map_batches_block_bundling_skewed_auto( def test_map_with_mismatched_columns(ray_start_regular_shared): def bad_fn(row): - if row > 5: + if row["id"] > 5: return {"a": "hello1"} else: return {"b": "hello1"} def good_fn(row): - if row > 5: + if row["id"] > 5: return {"a": "hello1", "b": "hello2"} else: return {"b": "hello2", "a": "hello1"} @@ -786,14 +834,14 @@ def test_map_batches_combine_empty_blocks(ray_start_regular_shared): xs = [x % 3 for x in list(range(100))] # ds1 has 1 block which contains 100 rows. - ds1 = ray.data.from_items(xs).repartition(1).sort().map_batches(lambda x: x) + ds1 = ray.data.from_items(xs).repartition(1).sort("item").map_batches(lambda x: x) assert ds1._block_num_rows() == [100] # ds2 has 30 blocks, but only 3 of them are non-empty ds2 = ( ray.data.from_items(xs) .repartition(30) - .sort() + .sort("item") .map_batches(lambda x: x, batch_size=1) ) assert len(ds2._block_num_rows()) == 3 @@ -816,7 +864,7 @@ def ensure_sample_size_close(dataset, sample_percent=0.5): ds = ray.data.range(10, parallelism=2) ensure_sample_size_close(ds) - ds = ray.data.range_table(10, parallelism=2) + ds = ray.data.range(10, parallelism=2) ensure_sample_size_close(ds) ds = ray.data.range_tensor(5, parallelism=2, shape=(2, 2)) diff --git a/python/ray/data/tests/test_mars.py b/python/ray/data/tests/test_mars.py index 6dc1f00693e5..d563f1fd337d 100644 --- a/python/ray/data/tests/test_mars.py +++ b/python/ray/data/tests/test_mars.py @@ -45,10 +45,6 @@ def test_mars(ray_start_regular): pdf2, ) - # Test simple datasets - with pytest.raises(NotImplementedError): - ray.data.range(10).to_mars() - cluster.stop() @@ -99,10 +95,6 @@ def 
test_from_mars_e2e(ray_start_regular, enable_optimizer): assert ds3._plan._logical_plan.dag.name == "FromArrowRefs" _check_usage_record(["FromArrowRefs"]) - # Test simple datasets - with pytest.raises(NotImplementedError): - ray.data.range(10).to_mars() - cluster.stop() diff --git a/python/ray/data/tests/test_mongo.py b/python/ray/data/tests/test_mongo.py index 276f6933f3c6..c52e43f4043d 100644 --- a/python/ray/data/tests/test_mongo.py +++ b/python/ray/data/tests/test_mongo.py @@ -78,7 +78,7 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=5,\n" " schema={float_field: double, int_field: int32}\n" @@ -96,7 +96,7 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -115,7 +115,7 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [2, 1] assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=3,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -131,7 +131,7 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): collection=foo_collection, ) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=5,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -148,7 +148,7 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): parallelism=1000, ) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=5,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -211,7 +211,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ).materialize() assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - 
"MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=2,\n" " num_rows=5,\n" " schema={float_field: double, int_field: int32}\n" @@ -230,7 +230,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ).materialize() assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=2,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -247,7 +247,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): collection=foo_collection, ).materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=5,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -265,7 +265,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): collection=foo_collection, ) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=5,\n" " num_rows=5,\n" " schema={_id: fixed_size_binary[12], float_field: double, " @@ -285,7 +285,7 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [2, 1] assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=3,\n" " schema={_id: fixed_size_binary[12], float_field: double, " diff --git a/python/ray/data/tests/test_nonstrict_mode.py b/python/ray/data/tests/test_nonstrict_mode.py new file mode 100644 index 000000000000..40742940badf --- /dev/null +++ b/python/ray/data/tests/test_nonstrict_mode.py @@ -0,0 +1,200 @@ +import numpy as np +import pandas as pd +from collections import UserDict +import pytest + +import ray +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa + + +def test_nonstrict_read_schemas(ray_start_10_cpus_shared, enable_nonstrict_mode): + ds = ray.data.range(1) + assert ds.take()[0] == 0 + + ds = ray.data.range_table(1) + assert ds.take()[0] == {"value": 0} + + ds = ray.data.range_tensor(1) + assert ds.take()[0] 
== np.array([0]) + + ds = ray.data.from_items([1]) + assert ds.take()[0] == 1 + + ds = ray.data.from_items([object()]) + assert isinstance(ds.take()[0], object) + + ds = ray.data.read_numpy("example://mnist_subset.npy") + assert isinstance(ds.take()[0], np.ndarray) + + ds = ray.data.from_numpy(np.ones((100, 10))) + assert isinstance(ds.take()[0], np.ndarray) + + ds = ray.data.from_numpy_refs(ray.put(np.ones((100, 10)))) + assert isinstance(ds.take()[0], np.ndarray) + + ds = ray.data.read_binary_files("example://image-datasets/simple") + assert isinstance(ds.take()[0], bytes) + + ds = ray.data.read_images("example://image-datasets/simple") + assert "image" in ds.take()[0] + + ds = ray.data.read_text("example://sms_spam_collection_subset.txt") + assert "text" in ds.take()[0] + + +def test_nonstrict_map_output(ray_start_10_cpus_shared, enable_nonstrict_mode): + ds = ray.data.range(1) + + ds.map(lambda x: 0, max_retries=0).materialize() + ds.map(lambda x: {"id": 0}).materialize() + ds.map(lambda x: UserDict({"id": 0})).materialize() + + ds.map_batches(lambda x: np.array([0]), max_retries=0).materialize() + ds.map_batches(lambda x: {"id": np.array([0])}).materialize() + ds.map_batches(lambda x: UserDict({"id": np.array([0])})).materialize() + + ds.map(lambda x: np.ones(10), max_retries=0).materialize() + ds.map(lambda x: {"x": np.ones(10)}).materialize() + ds.map(lambda x: UserDict({"x": np.ones(10)})).materialize() + + ds.map_batches(lambda x: np.ones(10), max_retries=0).materialize() + ds.map_batches(lambda x: {"x": np.ones(10)}).materialize() + ds.map_batches(lambda x: UserDict({"x": np.ones(10)})).materialize() + + # Not allowed in normal mode either. 
+ with pytest.raises(ValueError): + ds.map_batches(lambda x: object(), max_retries=0).materialize() + with pytest.raises(ValueError): + ds.map_batches(lambda x: {"x": object()}, max_retries=0).materialize() + ds.map_batches(lambda x: {"x": np.array([object()])}).materialize() + ds.map_batches(lambda x: UserDict({"x": np.array([object()])})).materialize() + + ds.map(lambda x: object(), max_retries=0).materialize() + ds.map(lambda x: {"x": object()}).materialize() + ds.map(lambda x: UserDict({"x": object()})).materialize() + + +def test_nonstrict_convert_map_output(ray_start_10_cpus_shared, enable_nonstrict_mode): + ds = ray.data.range(1).map_batches(lambda x: {"id": [0, 1, 2, 3]}).materialize() + assert ds.take_batch()["id"].tolist() == [0, 1, 2, 3] + + with pytest.raises(ValueError): + # Strings not converted into array. + ray.data.range(1).map_batches( + lambda x: {"id": "string"}, max_retries=0 + ).materialize() + + class UserObj: + def __eq__(self, other): + return isinstance(other, UserObj) + + ds = ( + ray.data.range(1) + .map_batches(lambda x: {"id": [0, 1, 2, UserObj()]}) + .materialize() + ) + assert ds.take_batch()["id"].tolist() == [0, 1, 2, UserObj()] + + +def test_nonstrict_default_batch_format( + ray_start_10_cpus_shared, enable_nonstrict_mode +): + ds = ray.data.range_table(1) + + @ray.remote + class Queue: + def __init__(self): + self.item = None + + def put(self, item): + old = self.item + self.item = item + return old + + q = Queue.remote() + + assert isinstance(next(ds.iter_batches()), pd.DataFrame) + assert isinstance(ds.take_batch(), pd.DataFrame) + + def f(x): + ray.get(q.put.remote(x)) + return x + + ds.map_batches(f).materialize() + batch = ray.get(q.put.remote(None)) + assert isinstance(batch, pd.DataFrame), batch + + +def test_nonstrict_tensor_support(ray_start_10_cpus_shared, enable_nonstrict_mode): + ds = ray.data.from_items([np.ones(10), np.ones(10)]) + assert np.array_equal(ds.take()[0], np.ones(10)) + + ds = ds.map(lambda x: x * 2) + 
assert np.array_equal(ds.take()[0], 2 * np.ones(10)) + + ds = ds.map_batches(lambda x: x * 2) + assert np.array_equal(ds.take()[0], 4 * np.ones(10)) + + +def test_nonstrict_value_repr(ray_start_10_cpus_shared, enable_nonstrict_mode): + ds = ray.data.from_items([{"__value__": np.ones(10)}]) + + ds = ds.map_batches(lambda x: {"__value__": x * 2}) + ds = ds.map(lambda x: {"__value__": x * 2}) + assert np.array_equal(ds.take()[0], 4 * np.ones(10)) + assert np.array_equal(ds.take_batch()[0], 4 * np.ones(10)) + + +def test_nonstrict_compute(ray_start_10_cpus_shared, enable_nonstrict_mode): + ray.data.range(10).map(lambda x: x, compute="actors").show() + ray.data.range(10).map(lambda x: x, compute=ray.data.ActorPoolStrategy(1, 1)).show() + ray.data.range(10).map(lambda x: x, compute="tasks").show() + + +def test_nonstrict_schema(ray_start_10_cpus_shared, enable_nonstrict_mode): + import pyarrow + from ray.data._internal.pandas_block import PandasBlockSchema + + ds = ray.data.from_items([{"x": 2}]) + schema = ds.schema() + assert isinstance(schema, pyarrow.lib.Schema) + + ds = ray.data.from_items([{"x": 2, "y": [1, 2]}]) + schema = ds.schema() + assert isinstance(schema, pyarrow.lib.Schema) + + ds = ray.data.from_items([{"x": 2, "y": object(), "z": [1, 2]}]) + schema = ds.schema() + assert isinstance(schema, type) + + ds = ray.data.from_numpy(np.ones((100, 10))) + schema = ds.schema() + assert isinstance(schema, pyarrow.lib.Schema) + + schema = ds.map_batches(lambda x: x, batch_format="pandas").schema() + assert isinstance(schema, PandasBlockSchema) + + +def test_nouse_raw_dicts(ray_start_10_cpus_shared, enable_nonstrict_mode): + assert type(ray.data.range_table(10).take(1)[0].as_pydict()) is dict + assert type(ray.data.from_items([{"x": 1}]).take(1)[0].as_pydict()) is dict + + def checker(x): + assert type(x.as_pydict()) is dict + return x + + ray.data.range_table(10).map(checker).show() + + +def test_nonstrict_require_batch_size_for_gpu(enable_nonstrict_mode): + 
ray.shutdown() + ray.init(num_cpus=4, num_gpus=1) + ds = ray.data.range(1) + ds.map_batches(lambda x: x, num_gpus=1) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_numpy.py b/python/ray/data/tests/test_numpy.py index 9ef31dd916c7..4c9565ba5234 100644 --- a/python/ray/data/tests/test_numpy.py +++ b/python/ray/data/tests/test_numpy.py @@ -20,6 +20,7 @@ from ray.data.tests.conftest import * # noqa from ray.data.tests.mock_http_server import * # noqa +from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa @@ -43,7 +44,7 @@ def test_from_numpy(ray_start_regular_shared, from_ref): ds = ray.data.from_numpy_refs([ray.put(arr) for arr in arrs]) else: ds = ray.data.from_numpy(arrs) - values = np.stack(ds.take(8)) + values = np.stack(extract_values("data", ds.take(8))) np.testing.assert_array_equal(values, np.concatenate((arr1, arr2))) # Check that conversion task is included in stats. assert "FromNumpyRefs" in ds.stats() @@ -53,7 +54,7 @@ def test_from_numpy(ray_start_regular_shared, from_ref): ds = ray.data.from_numpy_refs(ray.put(arr1)) else: ds = ray.data.from_numpy(arr1) - values = np.stack(ds.take(4)) + values = np.stack(extract_values("data", ds.take(4))) np.testing.assert_array_equal(values, arr1) # Check that conversion task is included in stats. 
assert "FromNumpyRefs" in ds.stats() @@ -62,7 +63,7 @@ def test_from_numpy(ray_start_regular_shared, from_ref): def test_from_numpy_variable_shaped(ray_start_regular_shared): arr = np.array([np.ones((2, 2)), np.ones((3, 3))], dtype=object) ds = ray.data.from_numpy(arr) - values = np.array(ds.take(2), dtype=object) + values = np.array(extract_values("data", ds.take(2)), dtype=object) def recursive_to_list(a): if not isinstance(a, (list, np.ndarray)): @@ -75,19 +76,14 @@ def recursive_to_list(a): def test_to_numpy_refs(ray_start_regular_shared): - # Simple Dataset - ds = ray.data.range(10) - arr = np.concatenate(ray.get(ds.to_numpy_refs())) - np.testing.assert_equal(arr, np.arange(0, 10)) - # Tensor Dataset ds = ray.data.range_tensor(10, parallelism=2) - arr = np.concatenate(ray.get(ds.to_numpy_refs())) + arr = np.concatenate(extract_values("data", ray.get(ds.to_numpy_refs()))) np.testing.assert_equal(arr, np.expand_dims(np.arange(0, 10), 1)) # Table Dataset - ds = ray.data.range_table(10) - arr = np.concatenate([t["value"] for t in ray.get(ds.to_numpy_refs())]) + ds = ray.data.range(10) + arr = np.concatenate([t["id"] for t in ray.get(ds.to_numpy_refs())]) np.testing.assert_equal(arr, np.arange(0, 10)) # Test multi-column Arrow dataset. 
@@ -119,16 +115,18 @@ def test_to_numpy_refs(ray_start_regular_shared): ) def test_numpy_roundtrip(ray_start_regular_shared, fs, data_path): ds = ray.data.range_tensor(10, parallelism=2) - ds.write_numpy(data_path, filesystem=fs) + ds.write_numpy(data_path, filesystem=fs, column="data") ds = ray.data.read_numpy(data_path, filesystem=fs) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=?,\n" - " schema={__value__: numpy.ndarray(shape=(1,), dtype=int64)}\n" + " schema={data: numpy.ndarray(shape=(1,), dtype=int64)}\n" ")" ) - np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) + np.testing.assert_equal( + extract_values("data", ds.take(2)), [np.array([0]), np.array([1])] + ) def test_numpy_read(ray_start_regular_shared, tmp_path): @@ -137,13 +135,15 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): np.save(os.path.join(path, "test.npy"), np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=1,\n" " num_rows=10,\n" - " schema={__value__: numpy.ndarray(shape=(1,), dtype=int64)}\n" + " schema={data: numpy.ndarray(shape=(1,), dtype=int64)}\n" ")" ) - np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) + np.testing.assert_equal( + extract_values("data", ds.take(2)), [np.array([0]), np.array([1])] + ) # Add a file with a non-matching file extension. This file should be ignored. 
with open(os.path.join(path, "foo.txt"), "w") as f: @@ -153,13 +153,13 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): assert ds.num_blocks() == 1 assert ds.count() == 10 assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=1,\n" " num_rows=10,\n" - " schema={__value__: numpy.ndarray(shape=(1,), dtype=int64)}\n" + " schema={data: numpy.ndarray(shape=(1,), dtype=int64)}\n" ")" ) - assert [v.item() for v in ds.take(2)] == [0, 1] + assert [v["data"].item() for v in ds.take(2)] == [0, 1] @pytest.mark.parametrize("ignore_missing_paths", [True, False]) @@ -191,13 +191,15 @@ def test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): np.save(path, np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path, meta_provider=FastFileMetadataProvider()) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=1,\n" " num_rows=10,\n" - " schema={__value__: numpy.ndarray(shape=(1,), dtype=int64)}\n" + " schema={data: numpy.ndarray(shape=(1,), dtype=int64)}\n" ")" ) - np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) + np.testing.assert_equal( + extract_values("data", ds.take(2)), [np.array([0]), np.array([1])] + ) with pytest.raises(NotImplementedError): ray.data.read_binary_files( @@ -252,9 +254,9 @@ def skip_unpartitioned(kv_dict): val_str = "".join(f"array({v}, dtype=int8), " for v in vals)[:-2] assert_base_partitioned_ds( ds, - schema="{__value__: numpy.ndarray(shape=(2,), dtype=int8)}", + schema="{data: numpy.ndarray(shape=(2,), dtype=int8)}", sorted_values=f"[[{val_str}]]", - ds_take_transform_fn=lambda taken: [taken], + ds_take_transform_fn=lambda taken: [extract_values("data", taken)], sorted_values_transform_fn=lambda sorted_values: str(sorted_values), ) assert ray.get(kept_file_counter.get.remote()) == 2 @@ -274,7 +276,7 @@ def skip_unpartitioned(kv_dict): def test_numpy_write(ray_start_regular_shared, fs, data_path, endpoint_url): ds = ray.data.range_tensor(10, parallelism=2) ds._set_uuid("data") - 
ds.write_numpy(data_path, filesystem=fs) + ds.write_numpy(data_path, filesystem=fs, column="data") file_path1 = os.path.join(data_path, "data_000000.npy") file_path2 = os.path.join(data_path, "data_000001.npy") if endpoint_url is None: @@ -291,7 +293,7 @@ def test_numpy_write(ray_start_regular_shared, fs, data_path, endpoint_url): assert len(arr2) == 5 assert arr1.sum() == 10 assert arr2.sum() == 35 - np.testing.assert_equal(ds.take(1), [np.array([0])]) + np.testing.assert_equal(extract_values("data", ds.take(1)), [np.array([0])]) @pytest.mark.parametrize( @@ -312,7 +314,10 @@ def test_numpy_write_block_path_provider( ds = ray.data.range_tensor(10, parallelism=2) ds._set_uuid("data") ds.write_numpy( - data_path, filesystem=fs, block_path_provider=test_block_write_path_provider + data_path, + filesystem=fs, + block_path_provider=test_block_write_path_provider, + column="data", ) file_path1 = os.path.join(data_path, "000000_05_data.test.npy") file_path2 = os.path.join(data_path, "000001_05_data.test.npy") @@ -330,7 +335,7 @@ def test_numpy_write_block_path_provider( assert len(arr2) == 5 assert arr1.sum() == 10 assert arr2.sum() == 35 - np.testing.assert_equal(ds.take(1), [np.array([0])]) + np.testing.assert_equal(extract_values("data", ds.take(1)), [np.array([0])]) if __name__ == "__main__": diff --git a/python/ray/data/tests/test_numpy_support.py b/python/ray/data/tests/test_numpy_support.py new file mode 100644 index 000000000000..fcbfd8388de6 --- /dev/null +++ b/python/ray/data/tests/test_numpy_support.py @@ -0,0 +1,141 @@ +import numpy as np +import torch +import pytest + +import ray +from ray.air.util.tensor_extensions.utils import create_ragged_ndarray +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa + + +class UserObj: + def __eq__(self, other): + return isinstance(other, UserObj) + + +def do_map_batches(data): + ds = ray.data.range(1) + ds = ds.map_batches(lambda x: {"output": data}) + return ds.take_batch()["output"] + 
+ +def assert_structure_equals(a, b): + assert type(a) == type(b), (type(a), type(b)) + assert type(a[0]) == type(b[0]), (type(a[0]), type(b[0])) # noqa: E721 + assert a.dtype == b.dtype + assert a.shape == b.shape + for i in range(len(a)): + assert np.array_equiv(a[i], b[i]), (i, a, b) + + +def test_list_of_scalars(ray_start_regular_shared): + data = [1, 2, 3] + output = do_map_batches(data) + assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) + + +def test_list_of_numpy_scalars(ray_start_regular_shared): + data = [np.int64(1), np.int64(2), np.int64(3)] + output = do_map_batches(data) + assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) + + +def test_list_of_objects(ray_start_regular_shared): + data = [1, 2, 3, UserObj()] + output = do_map_batches(data) + assert_structure_equals(output, np.array([1, 2, 3, UserObj()])) + + +def test_array_like(ray_start_regular_shared): + data = torch.Tensor([1, 2, 3]) + output = do_map_batches(data) + assert_structure_equals(output, np.array([1.0, 2.0, 3.0], dtype=np.float32)) + + +def test_list_of_arrays(ray_start_regular_shared): + data = [np.array([1, 2, 3]), np.array([4, 5, 6])] + output = do_map_batches(data) + assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)) + + +def test_list_of_array_like(ray_start_regular_shared): + data = [torch.Tensor([1, 2, 3]), torch.Tensor([4, 5, 6])] + output = do_map_batches(data) + assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)) + + +def test_ragged_array_like(ray_start_regular_shared): + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] + output = do_map_batches(data) + assert_structure_equals( + output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + ) + + +def test_ragged_lists(ray_start_regular_shared): + data = [[1, 2, 3], [1, 2]] + output = do_map_batches(data) + assert_structure_equals( + output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + ) + 
+ +def test_scalar_numpy(ray_start_regular_shared): + data = np.int64(1) + ds = ray.data.range(2) + ds = ds.map(lambda x: {"output": data}) + output = ds.take_batch()["output"] + assert_structure_equals(output, np.array([1, 1], dtype=np.int64)) + + +def test_scalar_arrays(ray_start_regular_shared): + data = np.array([1, 2, 3]) + ds = ray.data.range(2) + ds = ds.map(lambda x: {"output": data}) + output = ds.take_batch()["output"] + assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.int64)) + + +def test_scalar_array_like(ray_start_regular_shared): + data = torch.Tensor([1, 2, 3]) + ds = ray.data.range(2) + ds = ds.map(lambda x: {"output": data}) + output = ds.take_batch()["output"] + assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)) + + +def test_scalar_ragged_arrays(ray_start_regular_shared): + data = [np.array([1, 2, 3]), np.array([1, 2])] + ds = ray.data.range(2) + ds = ds.map(lambda x: {"output": data[x["id"]]}) + output = ds.take_batch()["output"] + assert_structure_equals( + output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + ) + + +def test_scalar_ragged_array_like(ray_start_regular_shared): + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] + ds = ray.data.range(2) + ds = ds.map(lambda x: {"output": data[x["id"]]}) + output = ds.take_batch()["output"] + assert_structure_equals( + output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + ) + + +# https://github.com/ray-project/ray/issues/35340 +def test_complex_ragged_arrays(ray_start_regular_shared): + data = [[{"a": 1}, {"a": 2}, {"a": 3}], [{"b": 1}]] + output = do_map_batches(data) + assert_structure_equals(output, create_ragged_ndarray(data)) + + data = ["hi", 1, None, [[[[]]]], {"a": [[{"b": 2, "c": UserObj()}]]}, UserObj()] + output = do_map_batches(data) + assert_structure_equals(output, create_ragged_ndarray(data)) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", 
__file__])) diff --git a/python/ray/data/tests/test_object_gc.py b/python/ray/data/tests/test_object_gc.py index 825602539f69..724fd9caef01 100644 --- a/python/ray/data/tests/test_object_gc.py +++ b/python/ray/data/tests/test_object_gc.py @@ -44,7 +44,7 @@ def check_to_tf_no_spill(ctx, pipe): max_epoch = 10 for p in pipe.iter_epochs(max_epoch): for _ in p.to_tf( - feature_columns="__value__", label_columns="label", batch_size=None + feature_columns="data", label_columns="label", batch_size=None ): pass meminfo = memory_summary(ctx.address_info["address"], stats_only=True) diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py index 18b1ecb461c4..f69fc830668d 100644 --- a/python/ray/data/tests/test_operators.py +++ b/python/ray/data/tests/test_operators.py @@ -1,9 +1,11 @@ import collections +import pandas as pd import random import pytest import numpy as np from typing import List, Iterable, Any import time +from unittest.mock import MagicMock import ray from ray.data.block import Block @@ -14,6 +16,7 @@ ExecutionOptions, ) from ray.data._internal.execution.operators.all_to_all_operator import AllToAllOperator +from ray.data._internal.execution.operators.limit_operator import LimitOperator from ray.data._internal.execution.operators.map_operator import ( MapOperator, _BlockRefBundler, @@ -33,12 +36,12 @@ def _get_blocks(bundle: RefBundle, output_list: List[Block]): for block, _ in bundle.blocks: - output_list.append(ray.get(block)) + output_list.append(list(ray.get(block)["id"])) def _mul2_transform(block_iter: Iterable[Block], ctx) -> Iterable[Block]: for block in block_iter: - yield [b * 2 for b in block] + yield pd.DataFrame({"id": [b * 2 for b in block["id"]]}) def _take_outputs(op: PhysicalOperator) -> List[Any]: @@ -234,7 +237,7 @@ def test_split_operator(ray_start_regular_shared, equal, chunk_size): ref = op.get_next() assert ref.owns_blocks, ref for block, _ in ref.blocks: - 
output_splits[ref.output_split_idx].extend(ray.get(block)) + output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) op.inputs_done() if equal: for i in range(3): @@ -267,7 +270,7 @@ def test_split_operator_random(ray_start_regular_shared, equal, random_seed): ref = op.get_next() assert ref.owns_blocks, ref for block, _ in ref.blocks: - output_splits[ref.output_split_idx].extend(ray.get(block)) + output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) if equal: actual = [len(output_splits[i]) for i in range(3)] expected = [num_inputs // 3] * 3 @@ -281,13 +284,16 @@ def test_split_operator_locality_hints(ray_start_regular_shared): op = OutputSplitter(input_op, 2, equal=False, locality_hints=["node1", "node2"]) def get_fake_loc(item): + assert isinstance(item, int), item if item in [0, 1, 4, 5, 8]: return "node1" else: return "node2" def get_bundle_loc(bundle): - return get_fake_loc(ray.get(bundle.blocks[0][0])[0]) + block = ray.get(bundle.blocks[0][0]) + fval = list(block["id"])[0] + return get_fake_loc(fval) op._get_location = get_bundle_loc @@ -301,7 +307,7 @@ def get_bundle_loc(bundle): ref = op.get_next() assert ref.owns_blocks, ref for block, _ in ref.blocks: - output_splits[ref.output_split_idx].extend(ray.get(block)) + output_splits[ref.output_split_idx].extend(list(ray.get(block)["id"])) total = 0 for i in range(2): @@ -584,10 +590,59 @@ def test_map_operator_pool_delegation(compute, expected): assert isinstance(op, expected) +def test_limit_operator(ray_start_regular_shared): + """Test basic functionalities of LimitOperator.""" + num_refs = 3 + num_rows_per_block = 3 + total_rows = num_refs * num_rows_per_block + # Test limits with different values, from 0 to more than input size. 
+ limits = list(range(0, total_rows + 2)) + for limit in limits: + refs = make_ref_bundles([[i] * num_rows_per_block for i in range(num_refs)]) + input_op = InputDataBuffer(refs) + limit_op = LimitOperator(limit, input_op) + limit_op.inputs_done = MagicMock(wraps=limit_op.inputs_done) + if limit == 0: + # If the limit is 0, the operator should be completed immediately. + assert limit_op.completed() + assert limit_op._limit_reached() + else: + # The number of output bundles is unknown until + # inputs are completed. + assert limit_op.num_outputs_total() is None, limit + cur_rows = 0 + loop_count = 0 + while input_op.has_next() and not limit_op._limit_reached(): + loop_count += 1 + assert not limit_op.completed(), limit + assert limit_op.need_more_inputs(), limit + limit_op.add_input(input_op.get_next(), 0) + while limit_op.has_next(): + # Drain the outputs. So the limit operator + # will be completed when the limit is reached. + limit_op.get_next() + cur_rows += num_rows_per_block + if cur_rows >= limit: + assert limit_op.inputs_done.call_count == 1, limit + assert limit_op.completed(), limit + assert limit_op._limit_reached(), limit + assert not limit_op.need_more_inputs(), limit + else: + assert limit_op.inputs_done.call_count == 0, limit + assert not limit_op.completed(), limit + assert not limit_op._limit_reached(), limit + assert limit_op.need_more_inputs(), limit + limit_op.inputs_done() + # After inputs done, the number of output bundles + # should be the same as the number of `add_input`s. 
+ assert limit_op.num_outputs_total() == loop_count, limit + assert limit_op.completed(), limit + + def _get_bundles(bundle: RefBundle): output = [] for block, _ in bundle.blocks: - output.extend(ray.get(block)) + output.extend(list(ray.get(block)["id"])) return output @@ -672,7 +727,7 @@ def test_block_ref_bundler_uniform( i for bundle in out_bundles for block, _ in bundle.blocks - for i in ray.get(block) + for i in list(ray.get(block)["id"]) ] assert flat_out == list(range(n)) diff --git a/python/ray/data/tests/test_optimize.py b/python/ray/data/tests/test_optimize.py index b4f4dccfd55f..dd8643d433d9 100644 --- a/python/ray/data/tests/test_optimize.py +++ b/python/ray/data/tests/test_optimize.py @@ -14,6 +14,7 @@ from ray.data.context import DataContext from ray.data.datasource import Datasource, ReadTask from ray.data.datasource.csv_datasource import CSVDatasource +from ray.data.tests.util import column_udf, extract_values from ray.tests.conftest import * # noqa @@ -67,7 +68,7 @@ def dummy_map(x): def test_memory_sanity(shutdown_only): info = ray.init(num_cpus=1, object_store_memory=500e6) ds = ray.data.range(10) - ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) + ds = ds.map(lambda x: {"data": np.ones(100 * 1024 * 1024, dtype=np.uint8)}) ds.materialize() meminfo = memory_summary(info.address_info["address"], stats_only=True) @@ -142,7 +143,7 @@ def inc(x): # TODO(Clark): Remove this sleep once we have fixed memory pressure handling. time.sleep(2) - return x + 1 + return {"id": x["id"] + 1} num_rounds = 10 for _ in range(num_rounds): @@ -167,9 +168,9 @@ def test_memory_release_lazy(shutdown_only): # Should get fused into single stage. 
ds = ds.lazy() - ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) - ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) - ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) + ds = ds.map(lambda x: {"data": np.ones(100 * 1024 * 1024, dtype=np.uint8)}) + ds = ds.map(lambda x: {"data": np.ones(100 * 1024 * 1024, dtype=np.uint8)}) + ds = ds.map(lambda x: {"data": np.ones(100 * 1024 * 1024, dtype=np.uint8)}) ds.materialize() meminfo = memory_summary(info.address_info["address"], stats_only=True) assert "Spilled" not in meminfo, meminfo @@ -187,7 +188,7 @@ def test_memory_release_lazy_shuffle(shutdown_only): # Should get fused into single stage. ds = ds.lazy() - ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8)) + ds = ds.map(lambda x: {"data": np.ones(100 * 1024 * 1024, dtype=np.uint8)}) ds.random_shuffle().materialize() meminfo = memory_summary(info.address_info["address"], stats_only=True) assert "Spilled" not in meminfo, meminfo @@ -206,7 +207,6 @@ def test_lazy_fanout(shutdown_only, local_path): def inc(row): map_counter.increment.remote() - row = row.as_pydict() row["one"] += 1 return row @@ -245,7 +245,7 @@ def inc(row): def inc(x): map_counter.increment.remote() - return x + 1 + return {"item": x["item"] + 1} # The source data shouldn't be cleared since it's non-lazy. ds = ray.data.from_items(list(range(10))) @@ -254,8 +254,8 @@ def inc(x): ds2 = ds1.map(inc) ds3 = ds1.map(inc) # Test content. - assert ds2.materialize().take() == list(range(2, 12)) - assert ds3.materialize().take() == list(range(2, 12)) + assert extract_values("item", ds2.materialize().take()) == list(range(2, 12)) + assert extract_values("item", ds3.materialize().take()) == list(range(2, 12)) # Test that first map is executed twice. assert ray.get(map_counter.get.remote()) == 2 * 10 + 10 + 10 @@ -268,8 +268,8 @@ def inc(x): ds1 = ds.map(inc) ds2 = ds.map(inc) # Test content. 
- assert ds1.materialize().take() == list(range(2, 12)) - assert ds2.materialize().take() == list(range(2, 12)) + assert extract_values("item", ds1.materialize().take()) == list(range(2, 12)) + assert extract_values("item", ds2.materialize().take()) == list(range(2, 12)) # Test that first map is executed twice, because ds1.materialize() # clears up the previous snapshot blocks, and ds2.materialize() # has to re-execute ds.map(inc) again. @@ -278,7 +278,7 @@ def inc(x): def test_spread_hint_inherit(ray_start_regular_shared): ds = ray.data.range(10).lazy() - ds = ds.map(lambda x: x + 1) + ds = ds.map(column_udf("id", lambda x: x + 1)) ds = ds.random_shuffle() for s in ds._plan._stages_before_snapshot: assert s.ray_remote_args == {}, s.ray_remote_args @@ -301,7 +301,7 @@ def test_stage_linking(ray_start_regular_shared): assert len(ds._plan._stages_before_snapshot) == 0 assert len(ds._plan._stages_after_snapshot) == 0 assert ds._plan._last_optimized_stages is None - ds = ds.map(lambda x: x + 1) + ds = ds.map(column_udf("id", lambda x: x + 1)) assert len(ds._plan._stages_before_snapshot) == 0 _assert_has_stages(ds._plan._stages_after_snapshot, ["Map"]) assert ds._plan._last_optimized_stages is None @@ -396,7 +396,10 @@ def build_pipe(): pipe = pipe.map_batches(dummy_map) pipe = pipe.map_batches(dummy_map) pipe = pipe.random_shuffle_each_window() - results = [sorted(p.take()) for p in pipe.iter_epochs()] + results = [] + for p in pipe.iter_epochs(): + result = sorted(extract_values("id", p.take())) + results.append(result) assert results == [[0, 1, 2], [0, 1, 2]], results return pipe @@ -475,8 +478,8 @@ def test_optimize_equivalent_remote_args(ray_start_regular_shared): for kwb in equivalent_kwargs: print("CHECKING", kwa, kwb) pipe = ray.data.range(3).repeat(2) - pipe = pipe.map_batches(dummy_map, compute="tasks", **kwa) - pipe = pipe.map_batches(dummy_map, compute="tasks", **kwb) + pipe = pipe.map_batches(dummy_map, batch_size=64, **kwa) + pipe = 
pipe.map_batches(dummy_map, batch_size=64, **kwb) pipe.take() expect_stages( pipe, @@ -490,7 +493,7 @@ def test_optimize_equivalent_remote_args(ray_start_regular_shared): for kwb in equivalent_kwargs: print("CHECKING", kwa, kwb) pipe = ray.data.range(3).repeat(2) - pipe = pipe.map_batches(dummy_map, compute="tasks", **kwa) + pipe = pipe.map_batches(dummy_map, batch_size=64, **kwa) pipe = pipe.random_shuffle_each_window(**kwb) pipe.take() expect_stages( @@ -513,9 +516,9 @@ def test_optimize_incompatible_stages(shutdown_only): pipe = ray.data.range(3).repeat(2) # Should get fused as long as their resource types are compatible. - pipe = pipe.map_batches(dummy_map, compute="actors") + pipe = pipe.map_batches(dummy_map, compute=ray.data.ActorPoolStrategy()) # Cannot fuse actors->tasks. - pipe = pipe.map_batches(dummy_map, compute="tasks") + pipe = pipe.map_batches(dummy_map) pipe = pipe.random_shuffle_each_window() pipe.take() expect_stages( @@ -529,7 +532,7 @@ def test_optimize_incompatible_stages(shutdown_only): ) pipe = ray.data.range(3).repeat(2) - pipe = pipe.map_batches(dummy_map, compute="tasks") + pipe = pipe.map_batches(dummy_map) pipe = pipe.map_batches(dummy_map, num_cpus=0.75) pipe = pipe.random_shuffle_each_window() pipe.take() @@ -585,7 +588,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -593,7 +596,7 @@ def __call__(self, x): CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -621,7 +624,7 @@ def __call__(self, x): lambda df, a, b=None: b * df + a, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_args=(put(1),), fn_kwargs={"b": put(2)}, ) @@ -629,7 +632,7 @@ def __call__(self, x): 
CallableFn, batch_size=1, batch_format="pandas", - compute="actors", + compute=ray.data.ActorPoolStrategy(), fn_constructor_args=fn_constructor_args, fn_constructor_kwargs=fn_constructor_kwargs, ) @@ -688,7 +691,7 @@ def test_optimize_lazy_reuse_base_data( num_reads = ray.get(counter.get.remote()) assert num_reads == 1, num_reads ds = ds.lazy() - ds = ds.map(lambda x: x) + ds = ds.map(column_udf("id", lambda x: x)) if with_shuffle: ds = ds.random_shuffle() ds.take() diff --git a/python/ray/data/tests/test_pandas.py b/python/ray/data/tests/test_pandas.py index edb21f94cbb5..e83eff8662ce 100644 --- a/python/ray/data/tests/test_pandas.py +++ b/python/ray/data/tests/test_pandas.py @@ -93,8 +93,8 @@ def test_from_pandas_refs(ray_start_regular_shared, enable_pandas_block): def test_to_pandas(ray_start_regular_shared): n = 5 - df = pd.DataFrame({"value": list(range(n))}) - ds = ray.data.range_table(n) + df = pd.DataFrame({"id": list(range(n))}) + ds = ray.data.range(n) dfds = ds.to_pandas() assert df.equals(dfds) @@ -109,8 +109,8 @@ def test_to_pandas(ray_start_regular_shared): def test_to_pandas_refs(ray_start_regular_shared): n = 5 - df = pd.DataFrame({"value": list(range(n))}) - ds = ray.data.range_table(n) + df = pd.DataFrame({"id": list(range(n))}) + ds = ray.data.range(n) dfds = pd.concat(ray.get(ds.to_pandas_refs()), ignore_index=True) assert df.equals(dfds) @@ -133,7 +133,7 @@ def test_to_pandas_tensor_column_cast_pandas(ray_start_regular_shared): ctx.enable_tensor_extension_casting = True in_df = pd.DataFrame({"a": [data]}) ds = ray.data.from_pandas(in_df) - dtypes = ds.schema().types + dtypes = ds.schema().base_schema.types assert len(dtypes) == 1 # Tensor column should be automatically cast to Tensor extension. 
assert isinstance(dtypes[0], TensorDtype) @@ -158,7 +158,7 @@ def test_to_pandas_tensor_column_cast_arrow(ray_start_regular_shared): ctx.enable_tensor_extension_casting = True in_table = pa.table({"a": ArrowTensorArray.from_numpy(data)}) ds = ray.data.from_arrow(in_table) - dtype = ds.schema().field(0).type + dtype = ds.schema().base_schema.field(0).type assert isinstance(dtype, ArrowTensorType) out_df = ds.to_pandas() assert out_df["a"].dtype.type is np.object_ diff --git a/python/ray/data/tests/test_parquet.py b/python/ray/data/tests/test_parquet.py index 668a6c6d4c63..fcc0337cc5e9 100644 --- a/python/ray/data/tests/test_parquet.py +++ b/python/ray/data/tests/test_parquet.py @@ -146,11 +146,11 @@ def test_parquet_read_basic(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Datastream(num_blocks=2, num_rows=6, " + str(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "Datastream(num_blocks=2, num_rows=6, " + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds check_num_computed(ds, 0, 0) @@ -224,11 +224,11 @@ def prefetch_file_metadata(self, pieces): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Datastream(num_blocks=2, num_rows=6, " + str(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "Datastream(num_blocks=2, num_rows=6, " + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds check_num_computed(ds, 2, 2) @@ -301,11 +301,11 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Datastream(num_blocks=2, num_rows=6, " + str(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: 
int64, two: string})" ), ds assert ( - repr(ds) == "Datastream(num_blocks=2, num_rows=6, " + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds check_num_computed(ds, 2, 2) @@ -391,11 +391,11 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Datastream(num_blocks=2, num_rows=6, " + str(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "Datastream(num_blocks=2, num_rows=6, " + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={one: int64, two: string})" ), ds check_num_computed(ds, 2, 2) @@ -452,7 +452,7 @@ def test_parquet_read_partitioned(ray_start_regular_shared, fs, data_path): assert len(input_files) == 2, input_files check_num_computed(ds, 0, 0) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=6,\n" " schema={two: string, " @@ -460,7 +460,7 @@ def test_parquet_read_partitioned(ray_start_regular_shared, fs, data_path): ")" ), ds assert repr(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=2,\n" " num_rows=6,\n" " schema={two: string, " @@ -550,11 +550,11 @@ def test_parquet_read_partitioned_explicit(ray_start_regular_shared, tmp_path): input_files = ds.input_files() assert len(input_files) == 2, input_files assert ( - str(ds) == "Datastream(num_blocks=2, num_rows=6, " + str(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={two: string, one: int32})" ), ds assert ( - repr(ds) == "Datastream(num_blocks=2, num_rows=6, " + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " "schema={two: string, one: int32})" ), ds check_num_computed(ds, 0, 0) @@ -691,7 +691,9 @@ def test_parquet_reader_estimate_data_size(shutdown_only, tmp_path): ), "estimated data size is not deterministic in multiple calls." 
text_output_path = os.path.join(tmp_path, "text") - ray.data.range(1000).map(lambda _: "a" * 1000).write_parquet(text_output_path) + ray.data.range(1000).map(lambda _: {"text": "a" * 1000}).write_parquet( + text_output_path + ) ds = ray.data.read_parquet(text_output_path) assert ds.num_blocks() > 1 data_size = ds.size_bytes() diff --git a/python/ray/data/tests/test_partitioning.py b/python/ray/data/tests/test_partitioning.py index 984e8e2f9d32..4cc567b38cb8 100644 --- a/python/ray/data/tests/test_partitioning.py +++ b/python/ray/data/tests/test_partitioning.py @@ -11,7 +11,7 @@ import ray from ray.data.block import Block -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from ray.data.datasource import ( FileBasedDatasource, PathPartitionParser, diff --git a/python/ray/data/tests/test_pipeline.py b/python/ray/data/tests/test_pipeline.py index d189c5736b91..fd72d1ac3b8a 100644 --- a/python/ray/data/tests/test_pipeline.py +++ b/python/ray/data/tests/test_pipeline.py @@ -7,13 +7,13 @@ import numpy as np import ray -from ray.data import datastream -from ray.data._internal.arrow_block import ArrowRow +from ray.data import dataset from ray.data.context import DataContext, WARN_PREFIX, OK_PREFIX -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from ray.data.dataset_pipeline import DatasetPipeline from ray.tests.conftest import * # noqa +from ray.data.tests.util import column_udf, extract_values class MockLogger: @@ -22,14 +22,20 @@ def __init__(self): self.infos = [] def warning(self, msg): + if "STRICT_MODE" in msg: + return self.warnings.append(msg) print("warning:", msg) def info(self, msg): + if "STRICT_MODE" in msg: + return self.infos.append(msg) print("info:", msg) def debug(self, msg): + if "STRICT_MODE" in msg: + return print("debug:", msg) @@ -37,19 +43,19 @@ def test_warnings(shutdown_only): ray.init(num_cpus=2) # Test parallelism warning. 
- datastream.logger = MockLogger() + dataset.logger = MockLogger() ray.data.range(10, parallelism=10).window(blocks_per_window=1) - print(datastream.logger.warnings) - print(datastream.logger.infos) - assert datastream.logger.warnings == [ + print(dataset.logger.warnings) + print(dataset.logger.infos) + assert dataset.logger.warnings == [ f"{WARN_PREFIX} This pipeline's parallelism is limited by its blocks per " "window to " "~1 concurrent tasks per window. To maximize " "performance, increase the blocks per window to at least 2. This " - "may require increasing the base datastream's parallelism and/or " + "may require increasing the base dataset's parallelism and/or " "adjusting the windowing parameters." ] - assert datastream.logger.infos == [ + assert dataset.logger.infos == [ "Created DatasetPipeline with 10 windows: 8b min, 8b max, 8b mean", "Blocks per window: 1 min, 1 max, 1 mean", f"{OK_PREFIX} This pipeline's windows likely fit in object store memory " @@ -63,17 +69,17 @@ def test_warnings(shutdown_only): ray.cluster_resources = lambda: res_dict # Test window memory warning. - datastream.logger = MockLogger() + dataset.logger = MockLogger() ray.data.range(100000, parallelism=100).window(blocks_per_window=10) - print(datastream.logger.warnings) - print(datastream.logger.infos) - assert datastream.logger.warnings == [ + print(dataset.logger.warnings) + print(dataset.logger.infos) + assert dataset.logger.warnings == [ f"{WARN_PREFIX} This pipeline's windows are ~0.08MiB in size each and " "may not fit in " "object store memory without spilling. To improve performance, " "consider reducing the size of each window to 250b or less." ] - assert datastream.logger.infos == [ + assert dataset.logger.infos == [ "Created DatasetPipeline with 10 windows: 0.08MiB min, 0.08MiB max, " "0.08MiB mean", "Blocks per window: 10 min, 10 max, 10 mean", @@ -83,22 +89,22 @@ def test_warnings(shutdown_only): ] # Test warning on both. 
- datastream.logger = MockLogger() + dataset.logger = MockLogger() ray.data.range(100000, parallelism=1).window(bytes_per_window=100000) - print(datastream.logger.warnings) - print(datastream.logger.infos) - assert datastream.logger.warnings == [ + print(dataset.logger.warnings) + print(dataset.logger.infos) + assert dataset.logger.warnings == [ f"{WARN_PREFIX} This pipeline's parallelism is limited by its blocks " "per window " "to ~1 concurrent tasks per window. To maximize performance, increase " "the blocks per window to at least 2. This may require increasing the " - "base datastream's parallelism and/or adjusting the windowing parameters.", + "base dataset's parallelism and/or adjusting the windowing parameters.", f"{WARN_PREFIX} This pipeline's windows are ~0.76MiB in size each and may " "not fit " "in object store memory without spilling. To improve performance, " "consider reducing the size of each window to 250b or less.", ] - assert datastream.logger.infos == [ + assert dataset.logger.infos == [ "Created DatasetPipeline with 1 windows: 0.76MiB min, 0.76MiB max, " "0.76MiB mean", "Blocks per window: 1 min, 1 max, 1 mean", @@ -107,12 +113,12 @@ def test_warnings(shutdown_only): ray.cluster_resources = old # Test no warning. 
- datastream.logger = MockLogger() + dataset.logger = MockLogger() ray.data.range(10, parallelism=10).window(blocks_per_window=10) - print(datastream.logger.warnings) - print(datastream.logger.infos) - assert datastream.logger.warnings == [] - assert datastream.logger.infos == [ + print(dataset.logger.warnings) + print(dataset.logger.infos) + assert dataset.logger.warnings == [] + assert dataset.logger.infos == [ "Created DatasetPipeline with 1 windows: 80b min, 80b max, 80b mean", "Blocks per window: 10 min, 10 max, 10 mean", f"{OK_PREFIX} This pipeline's per-window parallelism is high enough to fully " @@ -127,11 +133,15 @@ def test_pipeline_actors(shutdown_only): pipe = ( ray.data.range(3) .repeat(10) - .map(lambda x: x + 1) - .map(lambda x: x + 1, compute="actors", num_gpus=1) + .map(column_udf("id", lambda x: x + 1)) + .map( + column_udf("id", lambda x: x + 1), + compute=ray.data.ActorPoolStrategy(), + num_gpus=1, + ) ) - assert sorted(pipe.take(999)) == sorted([2, 3, 4] * 10) + assert sorted(extract_values("id", pipe.take(999))) == sorted([2, 3, 4] * 10) def test_pipeline_is_parallel(shutdown_only): @@ -173,12 +183,12 @@ def sleep(x): def test_window_by_bytes(ray_start_regular_shared): with pytest.raises(ValueError): - ray.data.range_table(10).window(blocks_per_window=2, bytes_per_window=2) + ray.data.range(10).window(blocks_per_window=2, bytes_per_window=2) - pipe = ray.data.range_table(10000000, parallelism=100).window(blocks_per_window=2) + pipe = ray.data.range(10000000, parallelism=100).window(blocks_per_window=2) assert str(pipe) == "DatasetPipeline(num_windows=50, num_stages=2)" - pipe = ray.data.range_table(10000000, parallelism=100).window( + pipe = ray.data.range(10000000, parallelism=100).window( bytes_per_window=10 * 1024 * 1024 ) assert str(pipe) == "DatasetPipeline(num_windows=8, num_stages=2)" @@ -187,19 +197,19 @@ def test_window_by_bytes(ray_start_regular_shared): for ds in dss[:-1]: assert ds.num_blocks() in [12, 13] - pipe = 
ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1) + pipe = ray.data.range(10000000, parallelism=100).window(bytes_per_window=1) assert str(pipe) == "DatasetPipeline(num_windows=100, num_stages=2)" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 1 - pipe = ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1e9) + pipe = ray.data.range(10000000, parallelism=100).window(bytes_per_window=1e9) assert str(pipe) == "DatasetPipeline(num_windows=1, num_stages=2)" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 100 # Test creating from non-lazy BlockList. pipe = ( - ray.data.range_table(10000000, parallelism=100) + ray.data.range(10000000, parallelism=100) .map_batches(lambda x: x) .window(bytes_per_window=10 * 1024 * 1024) ) @@ -210,42 +220,47 @@ def test_window_by_bytes(ray_start_regular_shared): try: context.optimize_fuse_read_stages = False dataset = ray.data.range(10).window(bytes_per_window=1) - assert dataset.take(10) == list(range(10)) + assert extract_values("id", dataset.take(10)) == list(range(10)) finally: context.optimize_fuse_read_stages = old def test_epoch(ray_start_regular_shared): # Test dataset repeat. - pipe = ray.data.range(5).map(lambda x: x * 2).repeat(3).map(lambda x: x * 2) - results = [p.take() for p in pipe.iter_epochs()] + pipe = ( + ray.data.range(5) + .map(column_udf("id", lambda x: x * 2)) + .repeat(3) + .map(column_udf("id", lambda x: x * 2)) + ) + results = [extract_values("id", p.take()) for p in pipe.iter_epochs()] assert results == [[0, 4, 8, 12, 16], [0, 4, 8, 12, 16], [0, 4, 8, 12, 16]] # Test dataset pipeline repeat. pipe = ray.data.range(3).window(blocks_per_window=2).repeat(3) - results = [p.take() for p in pipe.iter_epochs()] + results = [extract_values("id", p.take()) for p in pipe.iter_epochs()] assert results == [[0, 1, 2], [0, 1, 2], [0, 1, 2]] # Test max epochs. 
pipe = ray.data.range(3).window(blocks_per_window=2).repeat(3) - results = [p.take() for p in pipe.iter_epochs(2)] + results = [extract_values("id", p.take()) for p in pipe.iter_epochs(2)] assert results == [[0, 1, 2], [0, 1, 2]] # Test nested repeat. pipe = ray.data.range(5).repeat(2).repeat(2) - results = [p.take() for p in pipe.iter_epochs()] + results = [extract_values("id", p.take()) for p in pipe.iter_epochs()] assert results == [[0, 1, 2, 3, 4, 0, 1, 2, 3, 4], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]] # Test preserve_epoch=True. pipe = ray.data.range(5).repeat(2).rewindow(blocks_per_window=2) - results = [p.take() for p in pipe.iter_epochs()] + results = [extract_values("id", p.take()) for p in pipe.iter_epochs()] assert results == [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]] # Test preserve_epoch=False. pipe = ( ray.data.range(5).repeat(2).rewindow(blocks_per_window=2, preserve_epoch=False) ) - results = [p.take() for p in pipe.iter_epochs()] + results = [extract_values("id", p.take()) for p in pipe.iter_epochs()] assert results == [[0, 1, 2, 3], [4, 0, 1, 2, 3, 4]] @@ -284,15 +299,17 @@ def test_basic_pipeline(ray_start_regular_shared): pipe = ds.window(blocks_per_window=1).map(lambda x: x).map(lambda x: x) assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" - assert pipe.take() == list(range(10)) + assert extract_values("id", pipe.take()) == list(range(10)) pipe = ( - ds.window(blocks_per_window=1).map(lambda x: x).flat_map(lambda x: [x, x + 1]) + ds.window(blocks_per_window=1) + .map(lambda x: x) + .flat_map(lambda x: [{"id": x["id"]}, {"id": x["id"] + 1}]) ) assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" assert pipe.count() == 20 - pipe = ds.window(blocks_per_window=1).filter(lambda x: x % 2 == 0) + pipe = ds.window(blocks_per_window=1).filter(lambda x: x["id"] % 2 == 0) assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=3)" assert pipe.count() == 5 @@ -319,10 +336,10 @@ def test_window(ray_start_regular_shared): assert 
str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 - assert datasets[0].take() == [0, 1, 2] - assert datasets[1].take() == [3, 4, 5] - assert datasets[2].take() == [6, 7, 8] - assert datasets[3].take() == [9] + assert extract_values("id", datasets[0].take()) == [0, 1, 2] + assert extract_values("id", datasets[1].take()) == [3, 4, 5] + assert extract_values("id", datasets[2].take()) == [6, 7, 8] + assert extract_values("id", datasets[3].take()) == [9] ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=5) @@ -331,10 +348,10 @@ def test_window(ray_start_regular_shared): assert str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 - assert datasets[0].take() == [0, 1, 2] - assert datasets[1].take() == [3, 4, 5] - assert datasets[2].take() == [6, 7, 8] - assert datasets[3].take() == [9] + assert extract_values("id", datasets[0].take()) == [0, 1, 2] + assert extract_values("id", datasets[1].take()) == [3, 4, 5] + assert extract_values("id", datasets[2].take()) == [6, 7, 8] + assert extract_values("id", datasets[3].take()) == [9] def test_repeat(ray_start_regular_shared): @@ -345,7 +362,7 @@ def test_repeat(ray_start_regular_shared): assert str(pipe) == "DatasetPipeline(num_windows=5, num_stages=2)" pipe = pipe.repeat(2) assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" - assert pipe.take() == (list(range(5)) + list(range(5))) + assert extract_values("id", pipe.take()) == (list(range(5)) + list(range(5))) ds = ray.data.range(5) pipe = ds.window(blocks_per_window=1) @@ -362,7 +379,7 @@ def test_from_iterable(ray_start_regular_shared): pipe = DatasetPipeline.from_iterable( [lambda: ray.data.range(3), lambda: ray.data.range(2)] ) - assert pipe.take() == [0, 1, 2, 0, 1] + assert extract_values("id", pipe.take()) == [0, 1, 2, 0, 1] def test_repeat_forever(ray_start_regular_shared): @@ 
-372,6 +389,7 @@ def test_repeat_forever(ray_start_regular_shared): pipe = ds.repeat() assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)" for i, v in enumerate(pipe.iter_rows()): + v = v["id"] assert v == i % 10, (v, i, i % 10) if i > 1000: break @@ -404,7 +422,7 @@ def test_to_tf(ray_start_regular_shared): ds = ds.add_column("label", lambda x: 1) pipe = ds.window(blocks_per_window=2).repeat(2) batches = list( - pipe.to_tf(feature_columns="__value__", label_columns="label", batch_size=None) + pipe.to_tf(feature_columns="data", label_columns="label", batch_size=None) ) assert len(batches) == 20 @@ -425,7 +443,7 @@ def test_iter_batches_batch_across_windows(ray_start_regular_shared): # 3 windows, each containing 3 blocks, each containing 3 rows. pipe = ray.data.range(27, parallelism=9).window(blocks_per_window=3) # 4-row batches, with batches spanning both blocks and windows. - batches = list(pipe.iter_batches(batch_size=4)) + batches = list(pipe.iter_batches(batch_size=4, batch_format="pandas")) assert len(batches) == 7, batches assert all(len(e) == 4 for e in batches[:-1]) assert len(batches[-1]) == 3 @@ -443,73 +461,74 @@ def test_iter_datasets(ray_start_regular_shared): def test_foreach_window(ray_start_regular_shared): pipe = ray.data.range(5).window(blocks_per_window=2) - pipe = pipe.foreach_window(lambda ds: ds.map(lambda x: x * 2)) - assert pipe.take() == [0, 2, 4, 6, 8] + pipe = pipe.foreach_window(lambda ds: ds.map(column_udf("id", lambda x: x * 2))) + assert extract_values("id", pipe.take()) == [0, 2, 4, 6, 8] def test_schema(ray_start_regular_shared): pipe = ray.data.range(5).window(blocks_per_window=2) - assert pipe.schema() == int + assert pipe.schema().names == ["id"] def test_schema_peek(ray_start_regular_shared): # Multiple datasets pipe = ray.data.range(6, parallelism=6).window(blocks_per_window=2) - assert pipe.schema() == int - assert pipe._first_datastream is not None + assert pipe.schema().names == ["id"] + assert 
pipe._first_dataset is not None dss = list(pipe.iter_datasets()) assert len(dss) == 3, dss - assert pipe._first_datastream is None - assert pipe.schema() == int + assert pipe._first_dataset is None + assert pipe.schema().names == ["id"] # Only 1 dataset pipe = ray.data.range(1).window(blocks_per_window=2) - assert pipe.schema() == int - assert pipe._first_datastream is not None + assert pipe.schema().names == ["id"] + assert pipe._first_dataset is not None dss = list(pipe.iter_datasets()) assert len(dss) == 1, dss - assert pipe._first_datastream is None - assert pipe.schema() == int + assert pipe._first_dataset is None + assert pipe.schema().names == ["id"] # Empty datasets pipe = ( ray.data.range(6, parallelism=6) - .filter(lambda x: x < 0) + .filter(lambda x: x["id"] < 0) .window(blocks_per_window=2) ) assert pipe.schema() is None - assert pipe._first_datastream is not None + assert pipe._first_dataset is not None dss = list(pipe.iter_datasets()) assert len(dss) == 3, dss - assert pipe._first_datastream is None + assert pipe._first_dataset is None assert pipe.schema() is None def test_schema_after_repeat(ray_start_regular_shared): pipe = ray.data.range(6, parallelism=6).window(blocks_per_window=2).repeat(2) - assert pipe.schema() == int + assert pipe.schema().names == ["id"] output = [] for ds in pipe.iter_datasets(): - output.extend(ds.take()) + output.extend(extract_values("id", ds.take())) assert sorted(output) == sorted(list(range(6)) * 2) pipe = ray.data.range(6, parallelism=6).window(blocks_per_window=2).repeat(2) - assert pipe.schema() == int + assert pipe.schema().names == ["id"] # Test that operations still work after peek. 
pipe = pipe.map_batches(lambda batch: batch) output = [] for ds in pipe.iter_datasets(): - output.extend(ds.take()) + output.extend(extract_values("id", ds.take())) assert sorted(output) == sorted(list(range(6)) * 2) def test_split(ray_start_regular_shared): - pipe = ray.data.range(3).map(lambda x: x + 1).repeat(10) + pipe = ray.data.range(3).map(column_udf("id", lambda x: x + 1)).repeat(10) @ray.remote(num_cpus=0) def consume(shard, i): total = 0 for row in shard.iter_rows(): + row = row["id"] total += 1 assert row == i + 1, row assert total == 10, total @@ -522,13 +541,14 @@ def consume(shard, i): def test_split_at_indices(ray_start_regular_shared): indices = [2, 5] n = 8 - pipe = ray.data.range(n).map(lambda x: x + 1).repeat(2) + pipe = ray.data.range(n).map(column_udf("id", lambda x: x + 1)).repeat(2) @ray.remote(num_cpus=0) def consume(shard, i): total = 0 out = [] for row in shard.iter_rows(): + row = row["id"] total += 1 out.append(row) if i == 0: @@ -548,7 +568,7 @@ def consume(shard, i): ) -def _prepare_dataset_to_write(tmp_dir: str) -> Tuple[Dataset[ArrowRow], pd.DataFrame]: +def _prepare_dataset_to_write(tmp_dir: str) -> Tuple[Dataset, pd.DataFrame]: df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) df = pd.concat([df1, df2]) @@ -632,29 +652,22 @@ def test_sort_each_window(ray_start_regular_shared): pipe = ( ray.data.range(12, parallelism=12) .window(blocks_per_window=3) - .sort_each_window() - ) - assert pipe.take() == list(range(12)) - - pipe = ( - ray.data.range(12, parallelism=12) - .window(blocks_per_window=3) - .sort_each_window(descending=True) + .sort_each_window("id") ) - assert pipe.take() == [2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9] + assert extract_values("id", pipe.take()) == list(range(12)) pipe = ( ray.data.range(12, parallelism=12) .window(blocks_per_window=3) - .sort_each_window(key=lambda x: -x, descending=True) + .sort_each_window("id", descending=True) ) - assert 
pipe.take() == list(range(12)) + assert extract_values("id", pipe.take()) == [2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9] def test_randomize_block_order_each_window(ray_start_regular_shared): pipe = ray.data.range(12).repartition(6).window(blocks_per_window=3) pipe = pipe.randomize_block_order_each_window(seed=0) - assert pipe.take() == [0, 1, 4, 5, 2, 3, 6, 7, 10, 11, 8, 9] + assert extract_values("id", pipe.take()) == [0, 1, 4, 5, 2, 3, 6, 7, 10, 11, 8, 9] def test_add_column(ray_start_regular_shared): @@ -697,7 +710,7 @@ def test_random_shuffle_each_window_with_custom_resource(ray_start_cluster): ray.data.datasource.RangeDatasource(), parallelism=10, n=1000, - block_format="list", + block_format="arrow", ray_remote_args={"resources": {"bar": 1}}, ).repeat(3) pipe = pipe.random_shuffle_each_window(resources={"bar": 1}) @@ -717,7 +730,7 @@ def verify_integrity(p): for b in p.iter_batches(): pass # Verify the integrity of the blocks of original dataset. - assert ds.take_all() == [1, 2, 3, 4, 5, 6] + assert extract_values("item", ds.take_all()) == [1, 2, 3, 4, 5, 6] verify_integrity(ds.repeat(10).randomize_block_order_each_window()) verify_integrity( @@ -751,7 +764,7 @@ def verify_integrity(p): splits = p.split(2, equal=True) ray.get([consume.remote(p) for p in splits]) # Verify the integrity of the blocks of original dataset - assert ds.take_all() == [1, 2, 3, 4, 5, 6] + assert extract_values("item", ds.take_all()) == [1, 2, 3, 4, 5, 6] verify_integrity(ds.repeat(10).randomize_block_order_each_window()) verify_integrity( @@ -814,9 +827,9 @@ def verify_blocks(pipe, owned_by_consumer): ds.repeat(1).randomize_block_order_each_window().map_batches(lambda x: x), True ) verify_blocks(ds.repeat(1).map_batches(lambda x: x), True) - verify_blocks(ds.repeat(1).map(lambda x: x), True) - verify_blocks(ds.repeat(1).filter(lambda x: x > 3), True) - verify_blocks(ds.repeat(1).sort_each_window(), True) + verify_blocks(ds.repeat(1).map(column_udf("item", lambda x: x)), True) + 
verify_blocks(ds.repeat(1).filter(lambda x: x["item"] > 3), True) + verify_blocks(ds.repeat(1).sort_each_window("item"), True) verify_blocks(ds.repeat(1).random_shuffle_each_window(), True) verify_blocks(ds.repeat(1).repartition_each_window(2), True) verify_blocks(ds.repeat(1).rewindow(blocks_per_window=1), False) diff --git a/python/ray/data/tests/test_pipeline_nohang.py b/python/ray/data/tests/test_pipeline_nohang.py index af3408c81902..412f216d7880 100644 --- a/python/ray/data/tests/test_pipeline_nohang.py +++ b/python/ray/data/tests/test_pipeline_nohang.py @@ -2,6 +2,7 @@ import ray from ray.tests.conftest import * # noqa +from ray.data.tests.util import extract_values, column_udf NUM_REPEATS = 10 NUM_TASKS = 10 @@ -14,9 +15,15 @@ def test_basic_actors(shutdown_only): for _ in range(NUM_REPEATS): ds = ray.data.range(NUM_TASKS) ds = ds.window(blocks_per_window=1) - assert sorted(ds.map(lambda x: x + 1, compute="actors").take()) == list( - range(1, NUM_TASKS + 1) - ) + assert sorted( + extract_values( + "id", + ds.map( + column_udf("id", lambda x: x + 1), + compute=ray.data.ActorPoolStrategy(), + ).take(), + ) + ) == list(range(1, NUM_TASKS + 1)) if __name__ == "__main__": diff --git a/python/ray/data/tests/test_random_access.py b/python/ray/data/tests/test_random_access.py index b6c70ea6c53e..f31951d1759a 100644 --- a/python/ray/data/tests/test_random_access.py +++ b/python/ray/data/tests/test_random_access.py @@ -8,21 +8,23 @@ @pytest.mark.parametrize("pandas", [False, True]) def test_basic(ray_start_regular_shared, pandas): - ds = ray.data.range_table(100, parallelism=10) - ds = ds.add_column("embedding", lambda b: b["value"] ** 2) + ds = ray.data.range(100, parallelism=10) + ds = ds.add_column("embedding", lambda b: b["id"] ** 2) if not pandas: - ds = ds.map_batches(lambda df: pyarrow.Table.from_pandas(df)) + ds = ds.map_batches( + lambda df: pyarrow.Table.from_pandas(df), batch_format="pandas" + ) - rad = ds.to_random_access_dataset("value", num_workers=1) + 
rad = ds.to_random_access_dataset("id", num_workers=1) # Test get. assert ray.get(rad.get_async(-1)) is None assert ray.get(rad.get_async(100)) is None for i in range(100): - assert ray.get(rad.get_async(i)) == {"value": i, "embedding": i**2} + assert ray.get(rad.get_async(i)) == {"id": i, "embedding": i**2} def expected(i): - return {"value": i, "embedding": i**2} + return {"id": i, "embedding": i**2} # Test multiget. results = rad.multiget([-1] + list(range(10)) + [100]) @@ -30,26 +32,22 @@ def expected(i): def test_empty_blocks(ray_start_regular_shared): - ds = ray.data.range_table(10).repartition(20) + ds = ray.data.range(10).repartition(20) assert ds.num_blocks() == 20 - rad = ds.to_random_access_dataset("value") + rad = ds.to_random_access_dataset("id") for i in range(10): - assert ray.get(rad.get_async(i)) == {"value": i} + assert ray.get(rad.get_async(i)) == {"id": i} def test_errors(ray_start_regular_shared): ds = ray.data.range(10) - with pytest.raises(ValueError): - ds.to_random_access_dataset("value") - - ds = ray.data.range_table(10) with pytest.raises(ValueError): ds.to_random_access_dataset("invalid") def test_stats(ray_start_regular_shared): - ds = ray.data.range_table(100, parallelism=10) - rad = ds.to_random_access_dataset("value", num_workers=1) + ds = ray.data.range(100, parallelism=10) + rad = ds.to_random_access_dataset("id", num_workers=1) stats = rad.stats() assert "Accesses per worker: 0 min, 0 max, 0 mean" in stats, stats ray.get(rad.get_async(0)) diff --git a/python/ray/data/tests/test_randomize_block_order.py b/python/ray/data/tests/test_randomize_block_order.py index 6ae396cdea5e..bccef7aaf331 100644 --- a/python/ray/data/tests/test_randomize_block_order.py +++ b/python/ray/data/tests/test_randomize_block_order.py @@ -14,6 +14,7 @@ from ray.data._internal.logical.interfaces import LogicalPlan from ray.data._internal.logical.optimizers import LogicalOptimizer from ray.data._internal.planner.planner import Planner +from 
ray.data.tests.util import extract_values def test_randomize_blocks_operator(ray_start_regular_shared, enable_optimizer): @@ -112,7 +113,20 @@ def test_randomize_block_order_after_repartition(): def test_randomize_blocks_e2e(ray_start_regular_shared, enable_optimizer): ds = ray.data.range(12, parallelism=4) ds = ds.randomize_block_order(seed=0) - assert ds.take_all() == [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11], ds + assert extract_values("id", ds.take_all()) == [ + 6, + 7, + 8, + 0, + 1, + 2, + 3, + 4, + 5, + 9, + 10, + 11, + ], ds def test_randomize_blocks_rule_e2e(ray_start_regular_shared, enable_optimizer): diff --git a/python/ray/data/tests/test_raydp.py b/python/ray/data/tests/test_raydp.py index 5c06b619f288..0b5c9848da4d 100644 --- a/python/ray/data/tests/test_raydp.py +++ b/python/ray/data/tests/test_raydp.py @@ -34,10 +34,10 @@ def test_raydp_roundtrip(spark): def test_raydp_to_spark(spark): n = 5 - ds = ray.data.range_table(n) - values = [r["value"] for r in ds.take(5)] + ds = ray.data.range(n) + values = [r["id"] for r in ds.take(5)] df = ds.to_spark(spark) - rows = [r.value for r in df.take(5)] + rows = [r.id for r in df.take(5)] assert values == rows diff --git a/python/ray/data/tests/test_size_estimation.py b/python/ray/data/tests/test_size_estimation.py index 666cbe50a355..283fcd663f06 100644 --- a/python/ray/data/tests/test_size_estimation.py +++ b/python/ray/data/tests/test_size_estimation.py @@ -121,7 +121,9 @@ def test_split_read_csv(ray_start_regular_shared, tmp_path): def gen(name): path = os.path.join(tmp_path, name) - ray.data.range(1000, parallelism=1).map(lambda _: LARGE_VALUE).write_csv(path) + ray.data.range(1000, parallelism=1).map( + lambda _: {"out": LARGE_VALUE} + ).write_csv(path) return ray.data.read_csv(path) # 20MiB @@ -160,7 +162,7 @@ def gen(name): path = os.path.join(tmp_path, name) ds = ( ray.data.range(200000, parallelism=1) - .map(lambda _: uuid.uuid4().hex) + .map(lambda _: {"out": uuid.uuid4().hex}) .materialize() ) # Fully 
execute the operations prior to write, because with @@ -179,7 +181,7 @@ def gen(name): ctx.target_max_block_size = 3_000_000 ds2 = gen("out2") nrow = ds2._block_num_rows() - assert 2 < len(nrow) < 4, nrow + assert 3 < len(nrow) < 5, nrow for x in nrow[:-1]: assert 50000 < x < 75000, (x, nrow) @@ -198,19 +200,12 @@ def test_split_map(shutdown_only, use_actors): ray.init(num_cpus=2) kwargs = {} if use_actors: - kwargs = {"compute": "actors"} - # Simple block + kwargs = {"compute": ray.data.ActorPoolStrategy()} + + # Arrow block ctx = ray.data.context.DataContext.get_current() ctx.target_max_block_size = 20_000_000 ctx.block_splitting_enabled = True - ds1 = ray.data.range(1000, parallelism=1).map(lambda _: LARGE_VALUE, **kwargs) - nblocks = len(ds1.map(lambda x: x, **kwargs).get_internal_block_refs()) - assert nblocks == 1, nblocks - ctx.target_max_block_size = 2_000_000 - nblocks = len(ds1.map(lambda x: x, **kwargs).get_internal_block_refs()) - assert 4 < nblocks < 7 or use_actors, nblocks - - # Arrow block ctx.target_max_block_size = 20_000_000 ds2 = ray.data.range(1000, parallelism=1).map(lambda _: ARROW_LARGE_VALUE, **kwargs) nblocks = len(ds2.map(lambda x: x, **kwargs).get_internal_block_refs()) @@ -228,17 +223,9 @@ def test_split_map(shutdown_only, use_actors): def test_split_flat_map(ray_start_regular_shared): - # Simple block ctx = ray.data.context.DataContext.get_current() ctx.target_max_block_size = 20_000_000 ctx.block_splitting_enabled = True - ds1 = ray.data.range(1000, parallelism=1).map(lambda _: LARGE_VALUE) - nblocks = len(ds1.flat_map(lambda x: [x]).get_internal_block_refs()) - assert nblocks == 1, nblocks - ctx.target_max_block_size = 2_000_000 - nblocks = len(ds1.flat_map(lambda x: [x]).get_internal_block_refs()) - assert 4 < nblocks < 7, nblocks - # Arrow block ctx.target_max_block_size = 20_000_000 ds2 = ray.data.range(1000, parallelism=1).map(lambda _: ARROW_LARGE_VALUE) @@ -250,17 +237,9 @@ def test_split_flat_map(ray_start_regular_shared): def 
test_split_map_batches(ray_start_regular_shared): - # Simple block ctx = ray.data.context.DataContext.get_current() ctx.target_max_block_size = 20_000_000 ctx.block_splitting_enabled = True - ds1 = ray.data.range(1000, parallelism=1).map(lambda _: LARGE_VALUE) - nblocks = len(ds1.map_batches(lambda x: x, batch_size=16).get_internal_block_refs()) - assert nblocks == 1, ds1._block_num_rows() - ctx.target_max_block_size = 2_000_000 - nblocks = len(ds1.map_batches(lambda x: x, batch_size=16).get_internal_block_refs()) - assert 4 < nblocks < 7, ds1._block_num_rows() - # Arrow block ctx.target_max_block_size = 20_000_000 ds2 = ray.data.range(1000, parallelism=1).map(lambda _: ARROW_LARGE_VALUE) diff --git a/python/ray/data/tests/test_sort.py b/python/ray/data/tests/test_sort.py index b11540e376ce..1e22d6442dea 100644 --- a/python/ray/data/tests/test_sort.py +++ b/python/ray/data/tests/test_sort.py @@ -11,6 +11,7 @@ from ray.data.block import BlockAccessor from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa +from ray.data.tests.util import extract_values def test_sort_simple(ray_start_regular, use_push_based_shuffle): @@ -19,18 +20,21 @@ def test_sort_simple(ray_start_regular, use_push_based_shuffle): xs = list(range(num_items)) random.shuffle(xs) ds = ray.data.from_items(xs, parallelism=parallelism) - assert ds.sort().take(num_items) == list(range(num_items)) + assert extract_values("item", ds.sort("item").take(num_items)) == list( + range(num_items) + ) # Make sure we have rows in each block. 
- assert len([n for n in ds.sort()._block_num_rows() if n > 0]) == parallelism - assert ds.sort(descending=True).take(num_items) == list(reversed(range(num_items))) - assert ds.sort(key=lambda x: -x).take(num_items) == list(reversed(range(num_items))) + assert len([n for n in ds.sort("item")._block_num_rows() if n > 0]) == parallelism + assert extract_values( + "item", ds.sort("item", descending=True).take(num_items) + ) == list(reversed(range(num_items))) # Test empty dataset. ds = ray.data.from_items([]) - s1 = ds.sort() + s1 = ds.sort("item") assert s1.count() == 0 assert s1.take() == ds.take() - ds = ray.data.range(10).filter(lambda r: r > 10).sort() + ds = ray.data.range(10).filter(lambda r: r["id"] > 10).sort("id") assert ds.count() == 0 @@ -40,7 +44,7 @@ def test_sort_partition_same_key_to_same_block( num_items = 100 xs = [1] * num_items ds = ray.data.from_items(xs) - sorted_ds = ds.repartition(num_items).sort() + sorted_ds = ds.repartition(num_items).sort("item") # We still have 100 blocks assert len(sorted_ds._block_num_rows()) == num_items @@ -130,21 +134,19 @@ def test_sort_arrow_with_empty_blocks( [{"A": (x % 3), "B": x} for x in range(3)], parallelism=3 ) ds = ds.filter(lambda r: r["A"] == 0) - assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [ - {"A": 0, "B": 0} - ] + assert list(ds.sort("A").iter_rows()) == [{"A": 0, "B": 0}] # Test empty dataset. 
- ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10) + ds = ray.data.range(10).filter(lambda r: r["id"] > 10) assert ( len( ray.data._internal.sort.sample_boundaries( - ds._plan.execute().get_blocks(), "value", 3 + ds._plan.execute().get_blocks(), "id", 3 ) ) == 2 ) - assert ds.sort("value").count() == 0 + assert ds.sort("id").count() == 0 finally: ctx.use_polars = original_use_polars @@ -200,19 +202,19 @@ def test_sort_pandas_with_empty_blocks(ray_start_regular, use_push_based_shuffle ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in range(3)], parallelism=3) ds = ds.filter(lambda r: r["A"] == 0) - assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [{"A": 0, "B": 0}] + assert list(ds.sort("A").iter_rows()) == [{"A": 0, "B": 0}] # Test empty dataset. - ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10) + ds = ray.data.range(10).filter(lambda r: r["id"] > 10) assert ( len( ray.data._internal.sort.sample_boundaries( - ds._plan.execute().get_blocks(), "value", 3 + ds._plan.execute().get_blocks(), "id", 3 ) ) == 2 ) - assert ds.sort("value").count() == 0 + assert ds.sort("id").count() == 0 def test_push_based_shuffle_schedule(): @@ -341,9 +343,9 @@ def test_sort_multinode(ray_start_cluster, use_push_based_shuffle): ray.init(cluster.address) parallelism = 100 - ds = ray.data.range(1000, parallelism=parallelism).random_shuffle().sort() + ds = ray.data.range(1000, parallelism=parallelism).random_shuffle().sort("id") for i, row in enumerate(ds.iter_rows()): - assert row == i + assert row["id"] == i def patch_ray_remote(condition, callback): @@ -452,12 +454,12 @@ def check_pipelined(refs): assert task_context["num_instances_below_parallelism"] <= 1 task_context["num_instances_below_parallelism"] = 0 - ds = ds.sort() + ds = ds.sort("id") # Only the last round should have fewer tasks in flight. 
assert task_context["num_instances_below_parallelism"] <= 1 task_context["num_instances_below_parallelism"] = 0 for i, row in enumerate(ds.iter_rows()): - assert row == i + assert row["id"] == i finally: ctx.use_push_based_shuffle = original diff --git a/python/ray/data/tests/test_split.py b/python/ray/data/tests/test_split.py index b7a20365c67c..b869bf2ddd2b 100644 --- a/python/ray/data/tests/test_split.py +++ b/python/ray/data/tests/test_split.py @@ -4,6 +4,7 @@ import time from unittest.mock import patch +import pandas as pd import numpy as np import pytest from ray.data.block import BlockMetadata @@ -14,7 +15,7 @@ _equalize, ) from ray.data._internal.plan import ExecutionPlan -from ray.data._internal.stats import DatastreamStats +from ray.data._internal.stats import DatasetStats from ray.data._internal.split import ( _drop_empty_block_split, _generate_valid_indices, @@ -24,8 +25,9 @@ _split_at_indices, ) from ray.data.block import BlockAccessor -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values from ray.tests.conftest import * # noqa @@ -97,13 +99,13 @@ def _test_equal_split_balanced(block_sizes, num_splits): metadata = [] total_rows = 0 for block_size in block_sizes: - block = list(range(total_rows, total_rows + block_size)) + block = pd.DataFrame({"id": list(range(total_rows, total_rows + block_size))}) blocks.append(ray.put(block)) metadata.append(BlockAccessor.for_block(block).get_metadata(None, None)) total_rows += block_size block_list = BlockList(blocks, metadata, owned_by_consumer=True) ds = Dataset( - ExecutionPlan(block_list, DatastreamStats.TODO(), run_by_consumer=True), + ExecutionPlan(block_list, DatasetStats.TODO(), run_by_consumer=True), 0, False, ) @@ -119,7 +121,7 @@ def _test_equal_split_balanced(block_sizes, num_splits): assert total_rows - expected_total_rows == total_rows % num_splits # Check that all rows are unique 
(content check). split_rows = [row for split in splits for row in split.take(total_rows)] - assert len(set(split_rows)) == len(split_rows) + assert len(set(extract_values("id", split_rows))) == len(split_rows) def test_equal_split_balanced_grid(ray_start_regular_shared): @@ -160,7 +162,7 @@ def test_split_small(ray_start_regular_shared, pipelined): @ray.remote(num_cpus=0) def take(s): - return s.take() + return extract_values("item", s.take()) for m in [1, 3]: for n in [1, 3]: @@ -216,23 +218,23 @@ def test_split_at_indices_simple(ray_start_regular_shared): ds.split_at_indices([3, 1]) splits = ds.split_at_indices([5]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] splits = ds.split_at_indices([2, 5]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1], [2, 3, 4], [5, 6, 7, 8, 9]] splits = ds.split_at_indices([2, 5, 5, 100]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1], [2, 3, 4], [], [5, 6, 7, 8, 9], []] splits = ds.split_at_indices([100]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], []] splits = ds.split_at_indices([0]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]] @@ -266,7 +268,7 @@ def test_split_at_indices_coverage(ray_start_regular_shared, num_blocks, indices # indices configurations. ds = ray.data.range(20, parallelism=num_blocks) splits = ds.split_at_indices(indices) - r = [s.take_all() for s in splits] + r = [extract_values("id", s.take_all()) for s in splits] # Use np.array_split() semantics as our correctness ground-truth. 
assert r == [arr.tolist() for arr in np.array_split(list(range(20)), indices)] @@ -304,7 +306,7 @@ def test_split_at_indices_coverage_complete( # indices configurations. ds = ray.data.range(5, parallelism=num_blocks) splits = ds.split_at_indices(indices) - r = [s.take_all() for s in splits] + r = [extract_values("id", s.take_all()) for s in splits] # Use np.array_split() semantics as our correctness ground-truth. assert r == [arr.tolist() for arr in np.array_split(list(range(5)), indices)] @@ -328,19 +330,19 @@ def test_split_proportionately(ray_start_regular_shared): ds.split_proportionately([0.5, 0.5]) splits = ds.split_proportionately([0.5]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] splits = ds.split_proportionately([0.2, 0.3]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1], [2, 3, 4], [5, 6, 7, 8, 9]] splits = ds.split_proportionately([0.2, 0.3, 0.3]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1], [2, 3, 4], [5, 6, 7], [8, 9]] splits = ds.split_proportionately([0.98, 0.01]) - r = [s.take() for s in splits] + r = [extract_values("id", s.take()) for s in splits] assert r == [[0, 1, 2, 3, 4, 5, 6, 7], [8], [9]] with pytest.raises(ValueError): @@ -357,31 +359,31 @@ def test_split(ray_start_regular_shared): assert [2] * 5 == [ dataset._plan.execute().initial_num_blocks() for dataset in datasets ] - assert 190 == sum([dataset.sum() for dataset in datasets]) + assert 190 == sum([dataset.sum("id") for dataset in datasets]) datasets = ds.split(3) assert [4, 3, 3] == [ dataset._plan.execute().initial_num_blocks() for dataset in datasets ] - assert 190 == sum([dataset.sum() for dataset in datasets]) + assert 190 == sum([dataset.sum("id") for dataset in datasets]) datasets = ds.split(1) assert [10] == [ dataset._plan.execute().initial_num_blocks() for dataset 
in datasets ] - assert 190 == sum([dataset.sum() for dataset in datasets]) + assert 190 == sum([dataset.sum("id") for dataset in datasets]) datasets = ds.split(10) assert [1] * 10 == [ dataset._plan.execute().initial_num_blocks() for dataset in datasets ] - assert 190 == sum([dataset.sum() for dataset in datasets]) + assert 190 == sum([dataset.sum("id") for dataset in datasets]) datasets = ds.split(11) assert [1] * 10 + [0] == [ dataset._plan.execute().initial_num_blocks() for dataset in datasets ] - assert 190 == sum([dataset.sum() or 0 for dataset in datasets]) + assert 190 == sum([dataset.sum("id") or 0 for dataset in datasets]) def test_split_hints(ray_start_regular_shared): @@ -510,6 +512,7 @@ def _create_meta(num_rows): def _create_block(data): + data = pd.DataFrame({"id": data}) return (ray.put(data), _create_meta(len(data))) @@ -528,7 +531,7 @@ def _create_blocks_with_metadata(blocks): def test_split_single_block(ray_start_regular_shared): - block = [1, 2, 3] + block = pd.DataFrame({"id": [1, 2, 3]}) metadata = _create_meta(3) results = ray.get( @@ -540,7 +543,7 @@ def test_split_single_block(ray_start_regular_shared): blocks = results[1:] assert 234 == block_id assert len(blocks) == 1 - assert blocks[0] == [1, 2, 3] + assert list(blocks[0]["id"]) == [1, 2, 3] assert meta[0].num_rows == 3 results = ray.get( @@ -552,9 +555,9 @@ def test_split_single_block(ray_start_regular_shared): blocks = results[1:] assert 234 == block_id assert len(blocks) == 2 - assert blocks[0] == [1] + assert list(blocks[0]["id"]) == [1] assert meta[0].num_rows == 1 - assert blocks[1] == [2, 3] + assert list(blocks[1]["id"]) == [2, 3] assert meta[1].num_rows == 2 results = ray.get( @@ -566,13 +569,13 @@ def test_split_single_block(ray_start_regular_shared): blocks = results[1:] assert 234 == block_id assert len(blocks) == 5 - assert blocks[0] == [] - assert blocks[1] == [1] - assert blocks[2] == [] - assert blocks[3] == [2, 3] - assert blocks[4] == [] + assert list(blocks[0]["id"]) == 
[] + assert list(blocks[1]["id"]) == [1] + assert list(blocks[2]["id"]) == [] + assert list(blocks[3]["id"]) == [2, 3] + assert list(blocks[4]["id"]) == [] - block = [] + block = pd.DataFrame({"id": []}) metadata = _create_meta(0) results = ray.get( @@ -584,8 +587,8 @@ def test_split_single_block(ray_start_regular_shared): blocks = results[1:] assert 234 == block_id assert len(blocks) == 2 - assert blocks[0] == [] - assert blocks[1] == [] + assert list(blocks[0]["id"]) == [] + assert list(blocks[1]["id"]) == [] def test_drop_empty_block_split(): @@ -600,7 +603,7 @@ def verify_splits(splits, blocks_by_split): assert len(blocks) == len(block_refs) assert len(blocks) == len(meta) for block, block_ref, meta in zip(blocks, block_refs, meta): - assert ray.get(block_ref) == block + assert list(ray.get(block_ref)["id"]) == block assert meta.num_rows == len(block) @@ -666,7 +669,7 @@ def equalize_helper(input_block_lists): for block_ref, _ in blocklist.get_blocks_with_metadata(): block = ray.get(block_ref) block_accessor = BlockAccessor.for_block(block) - block_list.append(block_accessor.to_default()) + block_list.append(list(block_accessor.to_default()["id"])) result_block_lists.append(block_list) return result_block_lists @@ -749,18 +752,18 @@ def test_train_test_split(ray_start_regular_shared): # float train, test = ds.train_test_split(test_size=0.25) - assert train.take() == [0, 1, 2, 3, 4, 5] - assert test.take() == [6, 7] + assert extract_values("id", train.take()) == [0, 1, 2, 3, 4, 5] + assert extract_values("id", test.take()) == [6, 7] # int train, test = ds.train_test_split(test_size=2) - assert train.take() == [0, 1, 2, 3, 4, 5] - assert test.take() == [6, 7] + assert extract_values("id", train.take()) == [0, 1, 2, 3, 4, 5] + assert extract_values("id", test.take()) == [6, 7] # shuffle train, test = ds.train_test_split(test_size=0.25, shuffle=True, seed=1) - assert train.take() == [4, 5, 3, 2, 7, 6] - assert test.take() == [0, 1] + assert extract_values("id", 
train.take()) == [4, 5, 3, 2, 7, 6] + assert extract_values("id", test.take()) == [0, 1] # error handling with pytest.raises(TypeError): diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py index 8f4e58218596..c148a5bd545d 100644 --- a/python/ray/data/tests/test_stats.py +++ b/python/ray/data/tests/test_stats.py @@ -5,10 +5,11 @@ import pytest import ray -from ray.data._internal.stats import _StatsActor, DatastreamStats -from ray.data._internal.datastream_logger import DatastreamLogger +from ray.data._internal.stats import _StatsActor, DatasetStats +from ray.data._internal.dataset_logger import DatasetLogger from ray.data.block import BlockMetadata from ray.data.context import DataContext +from ray.data.tests.util import column_udf from ray.tests.conftest import * # noqa from unittest.mock import patch @@ -39,19 +40,19 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): if context.new_execution_backend: if context.use_streaming_executor: - logger = DatastreamLogger( + logger = DatasetLogger( "ray.data._internal.execution.streaming_executor" ).get_logger( log_to_stdout=enable_auto_log_stats, ) else: - logger = DatastreamLogger( + logger = DatasetLogger( "ray.data._internal.execution.bulk_executor" ).get_logger( log_to_stdout=enable_auto_log_stats, ) else: - logger = DatastreamLogger("ray.data._internal.plan").get_logger( + logger = DatasetLogger("ray.data._internal.plan").get_logger( log_to_stdout=enable_auto_log_stats, ) with patch.object(logger, "info") as mock_logger: @@ -146,7 +147,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): * Extra metrics: {'obj_store_mem_alloc': N, 'obj_store_mem_freed': N, \ 'obj_store_mem_peak': N} -Datastream iterator time breakdown: +Dataset iterator time breakdown: * Total time user code is blocked: T * Total time in user code: T * Total time overall: T @@ -182,7 +183,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, 
enable_auto_log_stats): * Extra metrics: {'obj_store_mem_alloc': N, 'obj_store_mem_freed': N, \ 'obj_store_mem_peak': N} -Datastream iterator time breakdown: +Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T * Num blocks local: Z @@ -218,7 +219,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): * Extra metrics: {'obj_store_mem_alloc': N, 'obj_store_mem_freed': N, \ 'obj_store_mem_peak': N} -Datastream iterator time breakdown: +Dataset iterator time breakdown: * Total time user code is blocked: T * Total time in user code: T * Total time overall: T @@ -250,7 +251,7 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): * Output size bytes: N min, N max, N mean, N total * Tasks per node: N min, N max, N mean; N nodes used -Datastream iterator time breakdown: +Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T * In next_batch(): T @@ -262,20 +263,14 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): def test_dataset__repr__(ray_start_regular_shared): - context = DataContext.get_current() - context.optimize_fuse_stages = True - - n = 4 - ds = ray.data.range(n).materialize() + n = 100 + ds = ray.data.range(n) assert len(ds.take_all()) == n - ds2 = ds.map_batches(lambda x: x).materialize() - assert len(ds2.take_all()) == n - ss = ds._plan.stats().to_summary() - ss2 = ds2._plan.stats().to_summary() + ds = ds.materialize() - assert canonicalize(repr(ss)) == ( - "DatastreamStatsSummary(\n" - " datastream_uuid=U,\n" + assert canonicalize(repr(ds._plan.stats().to_summary())) == ( + "DatasetStatsSummary(\n" + " dataset_uuid=U,\n" " base_name=None,\n" " number=N,\n" " extra_metrics={},\n" @@ -298,7 +293,7 @@ def test_dataset__repr__(ray_start_regular_shared): " get_time=T,\n" " iter_blocks_local=None,\n" " iter_blocks_remote=None,\n" - " iter_unknown_location=N,\n" + " iter_unknown_location=None,\n" " next_time=T,\n" " format_time=T,\n" " 
user_time=T,\n" @@ -307,9 +302,12 @@ def test_dataset__repr__(ray_start_regular_shared): " parents=[],\n" ")" ) - assert canonicalize(repr(ss2)) == ( - "DatastreamStatsSummary(\n" - " datastream_uuid=U,\n" + + ds2 = ds.map_batches(lambda x: x).materialize() + assert len(ds2.take_all()) == n + assert canonicalize(repr(ds2._plan.stats().to_summary())) == ( + "DatasetStatsSummary(\n" + " dataset_uuid=U,\n" " base_name=MapBatches(),\n" " number=N,\n" " extra_metrics={\n" @@ -343,8 +341,8 @@ def test_dataset__repr__(ray_start_regular_shared): " total_time=T,\n" " ),\n" " parents=[\n" - " DatastreamStatsSummary(\n" - " datastream_uuid=U,\n" + " DatasetStatsSummary(\n" + " dataset_uuid=U,\n" " base_name=None,\n" " number=N,\n" " extra_metrics={},\n" @@ -367,7 +365,7 @@ def test_dataset__repr__(ray_start_regular_shared): " get_time=T,\n" " iter_blocks_local=None,\n" " iter_blocks_remote=None,\n" - " iter_unknown_location=N,\n" + " iter_unknown_location=None,\n" " next_time=T,\n" " format_time=T,\n" " user_time=T,\n" @@ -450,7 +448,7 @@ def test_dataset_stats_zip(ray_start_regular_shared): def test_dataset_stats_sort(ray_start_regular_shared): ds = ray.data.range(1000, parallelism=10) - ds = ds.sort() + ds = ds.sort("id") stats = ds.materialize().stats() assert "SortMap" in stats, stats assert "SortReduce" in stats, stats @@ -499,9 +497,9 @@ def test_dataset_stats_read_parquet(ray_start_regular_shared, tmp_path): def test_dataset_split_stats(ray_start_regular_shared, tmp_path): context = DataContext.get_current() - ds = ray.data.range(100, parallelism=10).map(lambda x: x + 1) + ds = ray.data.range(100, parallelism=10).map(column_udf("id", lambda x: x + 1)) dses = ds.split_at_indices([49]) - dses = [ds.map(lambda x: x + 1) for ds in dses] + dses = [ds.map(column_udf("id", lambda x: x + 1)) for ds in dses] for ds_ in dses: stats = canonicalize(ds_.materialize().stats()) @@ -573,19 +571,19 @@ def test_dataset_pipeline_stats_basic(ray_start_regular_shared, enable_auto_log_ if 
context.new_execution_backend: if context.use_streaming_executor: - logger = DatastreamLogger( + logger = DatasetLogger( "ray.data._internal.execution.streaming_executor" ).get_logger( log_to_stdout=enable_auto_log_stats, ) else: - logger = DatastreamLogger( + logger = DatasetLogger( "ray.data._internal.execution.bulk_executor" ).get_logger( log_to_stdout=enable_auto_log_stats, ) else: - logger = DatastreamLogger("ray.data._internal.plan").get_logger( + logger = DatasetLogger("ray.data._internal.plan").get_logger( log_to_stdout=enable_auto_log_stats, ) @@ -746,10 +744,10 @@ def test_dataset_pipeline_stats_basic(ray_start_regular_shared, enable_auto_log_ 'obj_store_mem_peak': N} ##### Overall Pipeline Time Breakdown ##### -* Time stalled waiting for next datastream: T min, T max, T mean, T total +* Time stalled waiting for next dataset: T min, T max, T mean, T total DatasetPipeline iterator time breakdown: -* Waiting for next datastream: T +* Waiting for next dataset: T * In ray.wait(): T * In ray.get(): T * In next_batch(): T @@ -801,10 +799,10 @@ def test_dataset_pipeline_stats_basic(ray_start_regular_shared, enable_auto_log_ * Tasks per node: N min, N max, N mean; N nodes used ##### Overall Pipeline Time Breakdown ##### -* Time stalled waiting for next datastream: T min, T max, T mean, T total +* Time stalled waiting for next dataset: T min, T max, T mean, T total DatasetPipeline iterator time breakdown: -* Waiting for next datastream: T +* Waiting for next dataset: T * In ray.wait(): T * In ray.get(): T * In next_batch(): T @@ -877,10 +875,10 @@ def consume(split): 'obj_store_mem_peak': N} ##### Overall Pipeline Time Breakdown ##### -* Time stalled waiting for next datastream: T min, T max, T mean, T total +* Time stalled waiting for next dataset: T min, T max, T mean, T total DatasetPipeline iterator time breakdown: -* Waiting for next datastream: T +* Waiting for next dataset: T * In ray.wait(): T * In ray.get(): T * In next_batch(): T @@ -911,10 +909,10 @@ 
def consume(split): * Tasks per node: N min, N max, N mean; N nodes used ##### Overall Pipeline Time Breakdown ##### -* Time stalled waiting for next datastream: T min, T max, T mean, T total +* Time stalled waiting for next dataset: T min, T max, T mean, T total DatasetPipeline iterator time breakdown: -* Waiting for next datastream: T +* Waiting for next dataset: T * In ray.wait(): T * In ray.get(): T * In next_batch(): T @@ -930,7 +928,7 @@ def test_calculate_blocks_stats(ray_start_regular_shared, stage_two_block): context.optimize_fuse_stages = True block_params, block_meta_list = stage_two_block - stats = DatastreamStats( + stats = DatasetStats( stages={"Read": block_meta_list}, parent=None, ) @@ -975,11 +973,11 @@ def test_summarize_blocks(ray_start_regular_shared, stage_two_block): context.optimize_fuse_stages = True block_params, block_meta_list = stage_two_block - stats = DatastreamStats( + stats = DatasetStats( stages={"Read": block_meta_list}, parent=None, ) - stats.datastream_uuid = "test-uuid" + stats.dataset_uuid = "test-uuid" calculated_stats = stats.to_summary() summarized_lines = calculated_stats.to_string().split("\n") @@ -1050,14 +1048,14 @@ def test_summarize_blocks(ray_start_regular_shared, stage_two_block): def test_get_total_stats(ray_start_regular_shared, stage_two_block): """Tests a set of similar getter methods which pull aggregated statistics values after calculating stage-level stats: - `DatastreamStats.get_max_wall_time()`, - `DatastreamStats.get_total_cpu_time()`, - `DatastreamStats.get_max_heap_memory()`.""" + `DatasetStats.get_max_wall_time()`, + `DatasetStats.get_total_cpu_time()`, + `DatasetStats.get_max_heap_memory()`.""" context = DataContext.get_current() context.optimize_fuse_stages = True block_params, block_meta_list = stage_two_block - stats = DatastreamStats( + stats = DatasetStats( stages={"Read": block_meta_list}, parent=None, ) @@ -1078,7 +1076,7 @@ def test_streaming_stats_full(ray_start_regular_shared, 
restore_data_context): DataContext.get_current().new_execution_backend = True DataContext.get_current().use_streaming_executor = True - ds = ray.data.range(5, parallelism=5).map(lambda x: x + 1) + ds = ray.data.range(5, parallelism=5).map(column_udf("id", lambda x: x + 1)) ds.take_all() stats = canonicalize(ds.stats()) assert ( @@ -1093,7 +1091,7 @@ def test_streaming_stats_full(ray_start_regular_shared, restore_data_context): * Extra metrics: \ {'obj_store_mem_alloc': N, 'obj_store_mem_freed': N, 'obj_store_mem_peak': N} -Datastream iterator time breakdown: +Dataset iterator time breakdown: * Total time user code is blocked: T * Total time in user code: T * Total time overall: T diff --git a/python/ray/data/tests/test_streaming_backpressure_edge_case.py b/python/ray/data/tests/test_streaming_backpressure_edge_case.py index 36893d5b598e..2c4134c27165 100644 --- a/python/ray/data/tests/test_streaming_backpressure_edge_case.py +++ b/python/ray/data/tests/test_streaming_backpressure_edge_case.py @@ -1,24 +1,90 @@ import pytest import time +import pandas as pd import numpy as np import ray from ray._private.internal_api import memory_summary +from ray.data.datasource import Datasource, ReadTask +from ray.data.block import BlockMetadata +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa -def test_streaming_backpressure_e2e(): +def test_input_backpressure_e2e(restore_data_context, shutdown_only): + + # Tests that backpressure applies even when reading directly from the input + # datasource. This relies on datasource metadata size estimation. 
+ @ray.remote + class Counter: + def __init__(self): + self.count = 0 + + def increment(self): + self.count += 1 + + def get(self): + return self.count + + def reset(self): + self.count = 0 + + class CountingRangeDatasource(Datasource): + def __init__(self): + self.counter = Counter.remote() + + def prepare_read(self, parallelism, n): + def range_(i): + ray.get(self.counter.increment.remote()) + return [ + pd.DataFrame({"data": np.ones((n // parallelism * 1024 * 1024,))}) + ] + + sz = (n // parallelism) * 1024 * 1024 * 8 + print("Block size", sz) + + return [ + ReadTask( + lambda i=i: range_(i), + BlockMetadata( + num_rows=n // parallelism, + size_bytes=sz, + schema=None, + input_files=None, + exec_stats=None, + ), + ) + for i in range(parallelism) + ] + + source = CountingRangeDatasource() + ctx = ray.data.DataContext.get_current() + ctx.execution_options.resource_limits.object_store_memory = 10e6 + + # 10GiB dataset. + ds = ray.data.read_datasource(source, n=10000, parallelism=1000) + it = ds.iter_batches(batch_size=None, prefetch_batches=0) + next(it) + time.sleep(3) + launched = ray.get(source.counter.get.remote()) + + # If backpressure is broken we'll launch 15+. 
+ assert launched < 5, launched + + +def test_streaming_backpressure_e2e(restore_data_context): # This test case is particularly challenging since there is a large input->output # increase in data size: https://github.com/ray-project/ray/issues/34041 class TestSlow: def __call__(self, df: np.ndarray): time.sleep(2) - return np.random.randn(1, 20, 1024, 1024) + return {"id": np.random.randn(1, 20, 1024, 1024)} class TestFast: def __call__(self, df: np.ndarray): time.sleep(0.5) - return np.random.randn(1, 20, 1024, 1024) + return {"id": np.random.randn(1, 20, 1024, 1024)} ctx = ray.init(object_store_memory=4e9) ds = ray.data.range_tensor(20, shape=(3, 1024, 1024), parallelism=20) diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py index 50ab6a1fac2c..2b1291858610 100644 --- a/python/ray/data/tests/test_streaming_executor.py +++ b/python/ray/data/tests/test_streaming_executor.py @@ -99,19 +99,29 @@ def test_process_completed_tasks(): o2.get_work_refs = MagicMock(return_value=[sleep_ref, done_ref]) o2.notify_work_completed = MagicMock() o2.inputs_done = MagicMock() + o1.all_dependents_complete = MagicMock() process_completed_tasks(topo) o2.notify_work_completed.assert_called_once_with(done_ref) o2.inputs_done.assert_not_called() + o1.all_dependents_complete.assert_not_called() # Test input finalization. o2.get_work_refs = MagicMock(return_value=[done_ref]) o2.notify_work_completed = MagicMock() o2.inputs_done = MagicMock() + o1.all_dependents_complete = MagicMock() o1.completed = MagicMock(return_value=True) topo[o1].outqueue.clear() process_completed_tasks(topo) o2.notify_work_completed.assert_called_once_with(done_ref) o2.inputs_done.assert_called_once() + o1.all_dependents_complete.assert_not_called() + + # Test dependents completed. 
+ o2.need_more_inputs = MagicMock(return_value=False) + o1.all_dependents_complete = MagicMock() + process_completed_tasks(topo) + o1.all_dependents_complete.assert_called_once() def test_select_operator_to_run(): diff --git a/python/ray/data/tests/test_streaming_integration.py b/python/ray/data/tests/test_streaming_integration.py index cc3758138de9..88f20e9870f7 100644 --- a/python/ray/data/tests/test_streaming_integration.py +++ b/python/ray/data/tests/test_streaming_integration.py @@ -1,4 +1,5 @@ import itertools +import pandas as pd import random import pytest import threading @@ -24,12 +25,13 @@ from ray.data._internal.execution.util import make_ref_bundles from ray._private.test_utils import wait_for_condition from ray.data.tests.conftest import * # noqa +from ray.data.tests.util import extract_values def make_transform(block_fn): def map_fn(block_iter, ctx): for block in block_iter: - yield block_fn(block) + yield pd.DataFrame({"id": block_fn(block["id"])}) return map_fn @@ -38,10 +40,24 @@ def ref_bundles_to_list(bundles: List[RefBundle]) -> List[List[Any]]: output = [] for bundle in bundles: for block, _ in bundle.blocks: - output.append(ray.get(block)) + output.append(list(ray.get(block)["id"])) return output +def test_autoshutdown_dangling_executors(ray_start_10_cpus_shared): + from ray.data._internal.execution import streaming_executor + + initial = streaming_executor._num_shutdown + + for _ in range(5): + ds = ray.data.range(100) + it = ds.iter_batches(batch_size=None, prefetch_batches=0) + next(it) + + final = streaming_executor._num_shutdown - initial + assert final == 4 + + def test_pipelined_execution(ray_start_10_cpus_shared): executor = StreamingExecutor(ExecutionOptions(preserve_order=True)) inputs = make_ref_bundles([[x] for x in range(20)]) @@ -285,7 +301,7 @@ def func(x): # The pipeline should fully execute even when the output iterator is blocked. wait_for_condition(lambda: ray.get(counter.get.remote()) == 100) # Check we can take the rest. 
- assert list(it) == [[x] for x in range(1, 100)] + assert [b["id"] for b in it] == [[x] for x in range(1, 100)] def test_backpressure_from_output(ray_start_10_cpus_shared, restore_data_context): @@ -344,7 +360,7 @@ def test_e2e_liveness_with_output_backpressure_edge_case( ds = ray.data.range(10000, parallelism=100).map(lambda x: x, num_cpus=2) # This will hang forever if the liveness logic is wrong, since the output # backpressure will prevent any operators from running at all. - assert ds.take_all() == list(range(10000)) + assert extract_values("id", ds.take_all()) == list(range(10000)) def test_e2e_autoscaling_up(ray_start_10_cpus_shared, restore_data_context): @@ -468,12 +484,14 @@ def f(x): # Test recover. base = ray.data.range(1000, parallelism=100) ds1 = base.map_batches( - f, compute=ray.data.ActorPoolStrategy(4, 4), max_task_retries=999 + f, compute=ray.data.ActorPoolStrategy(size=4), max_task_retries=999 ) ds1.take_all() # Test disabling fault tolerance. - ds2 = base.map_batches(f, compute=ray.data.ActorPoolStrategy(4, 4), max_restarts=0) + ds2 = base.map_batches( + f, compute=ray.data.ActorPoolStrategy(size=4), max_restarts=0 + ) with pytest.raises(ray.exceptions.RayActorError): ds2.take_all() diff --git a/python/ray/data/tests/test_strict_mode.py b/python/ray/data/tests/test_strict_mode.py index a1d7c92d7c50..c81a25cd9162 100644 --- a/python/ray/data/tests/test_strict_mode.py +++ b/python/ray/data/tests/test_strict_mode.py @@ -7,12 +7,8 @@ from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa -# Force strict mode. 
-ctx = ray.data.DatasetContext.get_current() -ctx.strict_mode = True - -def test_strict_read_schemas(ray_start_regular_shared): +def test_strict_read_schemas(ray_start_regular_shared, enable_strict_mode): ds = ray.data.range(1) assert ds.take()[0] == {"id": 0} @@ -47,7 +43,7 @@ def test_strict_read_schemas(ray_start_regular_shared): assert "text" in ds.take()[0] -def test_strict_map_output(ray_start_regular_shared): +def test_strict_map_output(ray_start_regular_shared, enable_strict_mode): ds = ray.data.range(1) with pytest.raises(StrictModeError): @@ -57,8 +53,8 @@ def test_strict_map_output(ray_start_regular_shared): with pytest.raises(StrictModeError): ds.map_batches(lambda x: np.array([0]), max_retries=0).materialize() - ds.map_batches(lambda x: {"id": np.array([0])}).materialize() - ds.map_batches(lambda x: UserDict({"id": np.array([0])})).materialize() + ds.map_batches(lambda x: {"id": [0]}).materialize() + ds.map_batches(lambda x: UserDict({"id": [0]})).materialize() with pytest.raises(StrictModeError): ds.map(lambda x: np.ones(10), max_retries=0).materialize() @@ -75,8 +71,8 @@ def test_strict_map_output(ray_start_regular_shared): ds.map_batches(lambda x: object(), max_retries=0).materialize() with pytest.raises(ValueError): ds.map_batches(lambda x: {"x": object()}, max_retries=0).materialize() - ds.map_batches(lambda x: {"x": np.array([object()])}).materialize() - ds.map_batches(lambda x: UserDict({"x": np.array([object()])})).materialize() + ds.map_batches(lambda x: {"x": [object()]}).materialize() + ds.map_batches(lambda x: UserDict({"x": [object()]})).materialize() with pytest.raises(StrictModeError): ds.map(lambda x: object(), max_retries=0).materialize() @@ -84,7 +80,46 @@ def test_strict_map_output(ray_start_regular_shared): ds.map(lambda x: UserDict({"x": object()})).materialize() -def test_strict_default_batch_format(ray_start_regular_shared): +def test_strict_convert_map_output(ray_start_regular_shared, enable_strict_mode): + ds = 
ray.data.range(1).map_batches(lambda x: {"id": [0, 1, 2, 3]}).materialize() + assert ds.take_batch()["id"].tolist() == [0, 1, 2, 3] + + with pytest.raises(ValueError): + # Strings not converted into array. + ray.data.range(1).map_batches( + lambda x: {"id": "string"}, max_retries=0 + ).materialize() + + class UserObj: + def __eq__(self, other): + return isinstance(other, UserObj) + + ds = ( + ray.data.range(1) + .map_batches(lambda x: {"id": [0, 1, 2, UserObj()]}) + .materialize() + ) + assert ds.take_batch()["id"].tolist() == [0, 1, 2, UserObj()] + + +def test_strict_convert_map_groups(ray_start_regular_shared, enable_strict_mode): + ds = ray.data.read_csv("example://iris.csv") + + def process_group(group): + variety = group["variety"][0] + count = len(group["variety"]) + + # Test implicit list->array conversion here. + return { + "variety": [variety], + "count": [count], + } + + ds = ds.groupby("variety").map_groups(process_group) + ds.show() + + +def test_strict_default_batch_format(ray_start_regular_shared, enable_strict_mode): ds = ray.data.range(1) @ray.remote @@ -111,7 +146,7 @@ def f(x): assert isinstance(batch["id"], np.ndarray), batch -def test_strict_tensor_support(ray_start_regular_shared): +def test_strict_tensor_support(ray_start_regular_shared, enable_strict_mode): ds = ray.data.from_items([np.ones(10), np.ones(10)]) assert np.array_equal(ds.take()[0]["item"], np.ones(10)) @@ -122,7 +157,7 @@ def test_strict_tensor_support(ray_start_regular_shared): assert np.array_equal(ds.take()[0]["item"], 4 * np.ones(10)) -def test_strict_value_repr(ray_start_regular_shared): +def test_strict_value_repr(ray_start_regular_shared, enable_strict_mode): ds = ray.data.from_items([{"__value__": np.ones(10)}]) ds = ds.map_batches(lambda x: {"__value__": x["__value__"] * 2}) @@ -131,28 +166,73 @@ def test_strict_value_repr(ray_start_regular_shared): assert np.array_equal(ds.take_batch()["x"][0], 4 * np.ones(10)) -def test_strict_object_support(ray_start_regular_shared): 
+def test_strict_object_support(ray_start_regular_shared, enable_strict_mode): ds = ray.data.from_items([{"x": 2}, {"x": object()}]) ds.map_batches(lambda x: x, batch_format="numpy").materialize() -def test_strict_schema(ray_start_regular_shared): - import pyarrow +def test_strict_compute(ray_start_regular_shared, enable_strict_mode): + with pytest.raises(StrictModeError): + ray.data.range(10).map(lambda x: x, compute="actors").show() + with pytest.raises(StrictModeError): + ray.data.range(10).map( + lambda x: x, compute=ray.data.ActorPoolStrategy(1, 1) + ).show() + with pytest.raises(StrictModeError): + ray.data.range(10).map(lambda x: x, compute="tasks").show() + + +def test_strict_schema(ray_start_regular_shared, enable_strict_mode): + import pyarrow as pa + from ray.data.extensions.tensor_extension import ArrowTensorType from ray.data._internal.pandas_block import PandasBlockSchema ds = ray.data.from_items([{"x": 2}]) schema = ds.schema() - assert isinstance(schema.base_schema, pyarrow.lib.Schema) + assert isinstance(schema.base_schema, pa.lib.Schema) + assert schema.names == ["x"] + assert schema.types == [pa.int64()] + + ds = ray.data.from_items([{"x": 2, "y": [1, 2]}]) + schema = ds.schema() + assert isinstance(schema.base_schema, pa.lib.Schema) + assert schema.names == ["x", "y"] + assert schema.types == [pa.int64(), pa.list_(pa.int64())] + + ds = ray.data.from_items([{"x": 2, "y": object(), "z": [1, 2]}]) + schema = ds.schema() + assert schema.names == ["x", "y", "z"] + assert schema.types == [pa.int64(), object, object] ds = ray.data.from_numpy(np.ones((100, 10))) schema = ds.schema() - assert isinstance(schema.base_schema, pyarrow.lib.Schema) - assert str(schema) == "Schema({'data': numpy.ndarray(shape=(10,), dtype=double)})" + assert isinstance(schema.base_schema, pa.lib.Schema) + assert schema.names == ["data"] + assert schema.types == [ArrowTensorType(shape=(10,), dtype=pa.float64())] schema = ds.map_batches(lambda x: x, 
batch_format="pandas").schema() - # TODO(ekl) fix this to return ndarray - assert str(schema) == "Schema({'data': TensorDtype(shape=(10,), dtype=float64)})" assert isinstance(schema.base_schema, PandasBlockSchema) + assert schema.names == ["data"] + assert schema.types == [ArrowTensorType(shape=(10,), dtype=pa.float64())] + + +def test_use_raw_dicts(ray_start_regular_shared, enable_strict_mode): + assert type(ray.data.range(10).take(1)[0]) is dict + assert type(ray.data.from_items([1]).take(1)[0]) is dict + + def checker(x): + assert type(x) is dict + return x + + ray.data.range(10).map(checker).show() + + +def test_strict_require_batch_size_for_gpu(enable_strict_mode): + ray.shutdown() + ray.init(num_cpus=4, num_gpus=1) + ds = ray.data.range(1) + with pytest.raises(StrictModeError): + ds.map_batches(lambda x: x, num_gpus=1) if __name__ == "__main__": diff --git a/python/ray/data/tests/test_tensor.py b/python/ray/data/tests/test_tensor.py index adaad5c083fc..cf0967ea1adf 100644 --- a/python/ray/data/tests/test_tensor.py +++ b/python/ray/data/tests/test_tensor.py @@ -17,6 +17,7 @@ ) from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa +from ray.data.tests.util import extract_values # https://github.com/ray-project/ray/issues/33695 @@ -36,31 +37,39 @@ def test_tensors_basic(ray_start_regular_shared): tensor_shape = (3, 5) ds = ray.data.range_tensor(6, shape=tensor_shape, parallelism=6) assert str(ds) == ( - "Datastream(\n" + "Dataset(\n" " num_blocks=6,\n" " num_rows=6,\n" - " schema={__value__: numpy.ndarray(shape=(3, 5), dtype=int64)}\n" + " schema={data: numpy.ndarray(shape=(3, 5), dtype=int64)}\n" ")" ) assert ds.size_bytes() == 5 * 3 * 6 * 8 # Test row iterator yields tensors. for tensor in ds.iter_rows(): + tensor = tensor["data"] assert isinstance(tensor, np.ndarray) assert tensor.shape == tensor_shape # Test batch iterator yields tensors. 
for tensor in ds.iter_batches(batch_size=2): + tensor = tensor["data"] assert isinstance(tensor, np.ndarray) assert tensor.shape == (2,) + tensor_shape # Native format. def np_mapper(arr): + if "data" in arr: + arr = arr["data"] + else: + arr = arr["id"] assert isinstance(arr, np.ndarray) - return arr + 1 + return {"data": arr + 1} res = ray.data.range_tensor(2, shape=(2, 2)).map(np_mapper).take() - np.testing.assert_equal(res, [np.ones((2, 2)), 2 * np.ones((2, 2))]) + np.testing.assert_equal( + extract_values("data", res), [np.ones((2, 2)), 2 * np.ones((2, 2))] + ) # Explicit NumPy format. res = ( @@ -68,7 +77,9 @@ def np_mapper(arr): .map_batches(np_mapper, batch_format="numpy") .take() ) - np.testing.assert_equal(res, [np.ones((2, 2)), 2 * np.ones((2, 2))]) + np.testing.assert_equal( + extract_values("data", res), [np.ones((2, 2)), 2 * np.ones((2, 2))] + ) # Pandas conversion. def pd_mapper(df): @@ -76,7 +87,7 @@ def pd_mapper(df): return df + 2 res = ray.data.range_tensor(2).map_batches(pd_mapper, batch_format="pandas").take() - np.testing.assert_equal(res, [np.array([2]), np.array([3])]) + np.testing.assert_equal(extract_values("data", res), [np.array([2]), np.array([3])]) # Arrow columns in NumPy format. 
def multi_mapper(col_arrs): @@ -99,7 +110,7 @@ def multi_mapper(col_arrs): .take() ) np.testing.assert_equal( - [r.as_pydict() for r in res], + res, [ {"a": 2, "b": 5.0, "c": np.array([2, 3])}, {"a": 3, "b": 6.0, "c": np.array([4, 5])}, @@ -121,7 +132,7 @@ def single_mapper(col_arrs): .take() ) np.testing.assert_equal( - [r.as_pydict() for r in res], + res, [ {"c": np.array([2, 3])}, {"c": np.array([4, 5])}, @@ -156,7 +167,7 @@ def multi_mapper(col_arrs): .take() ) np.testing.assert_equal( - [r.as_pydict() for r in res], + res, [ {"a": 2, "b": 5.0, "c": np.array([2, 3])}, {"a": 3, "b": 6.0, "c": np.array([4, 5])}, @@ -178,7 +189,7 @@ def single_mapper(col_arrs): .take() ) np.testing.assert_equal( - [r.as_pydict() for r in res], + res, [ {"c": np.array([2, 3])}, {"c": np.array([4, 5])}, @@ -189,14 +200,14 @@ def single_mapper(col_arrs): # Simple dataset in NumPy format. def mapper(arr): arr = np_mapper(arr) - return arr.tolist() + return arr res = ( ray.data.range(10, parallelism=2) .map_batches(mapper, batch_format="numpy") .take() ) - assert res == list(range(1, 11)) + assert extract_values("data", res) == list(range(1, 11)) def test_batch_tensors(ray_start_regular_shared): @@ -204,16 +215,15 @@ def test_batch_tensors(ray_start_regular_shared): ds = ray.data.from_items([torch.tensor([0, 0]) for _ in range(40)], parallelism=40) res = ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=40,\n" " num_rows=40,\n" - " schema=\n)" + " schema={item: numpy.ndarray(shape=(2,), dtype=int64)}\n" + ")" ) assert str(ds) == res, str(ds) - with pytest.raises(pa.lib.ArrowInvalid): - next(ds.iter_batches(batch_format="pyarrow")) df = next(ds.iter_batches(batch_format="pandas")) - assert df.to_dict().keys() == {"value"} + assert df.to_dict().keys() == {"item"} def test_tensors_shuffle(ray_start_regular_shared): @@ -221,8 +231,8 @@ def test_tensors_shuffle(ray_start_regular_shared): tensor_shape = (3, 5) ds = ray.data.range_tensor(6, shape=tensor_shape) 
shuffled_ds = ds.random_shuffle() - shuffled = shuffled_ds.take() - base = ds.take() + shuffled = extract_values("data", shuffled_ds.take()) + base = extract_values("data", ds.take()) np.testing.assert_raises( AssertionError, np.testing.assert_equal, @@ -239,8 +249,8 @@ def test_tensors_shuffle(ray_start_regular_shared): ds = ray.data.range_tensor(6, shape=tensor_shape) ds = ds.map_batches(lambda df: df, batch_format="pandas") shuffled_ds = ds.random_shuffle() - shuffled = shuffled_ds.take() - base = ds.take() + shuffled = extract_values("data", shuffled_ds.take()) + base = extract_values("data", ds.take()) np.testing.assert_raises( AssertionError, np.testing.assert_equal, @@ -291,39 +301,39 @@ def test_tensors_sort(ray_start_regular_shared): def test_tensors_inferred_from_map(ray_start_regular_shared): # Test map. - ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4))) + ds = ray.data.range(10, parallelism=10).map(lambda _: {"data": np.ones((4, 4))}) ds = ds.materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=10,\n" " num_rows=10,\n" - " schema={__value__: numpy.ndarray(shape=(4, 4), dtype=double)}\n" + " schema={data: numpy.ndarray(shape=(4, 4), dtype=double)}\n" ")" ) # Test map_batches. ds = ray.data.range(16, parallelism=4).map_batches( - lambda _: np.ones((3, 4, 4)), batch_size=2 + lambda _: {"data": np.ones((3, 4, 4))}, batch_size=2 ) ds = ds.materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=4,\n" " num_rows=24,\n" - " schema={__value__: numpy.ndarray(shape=(4, 4), dtype=double)}\n" + " schema={data: numpy.ndarray(shape=(4, 4), dtype=double)}\n" ")" ) # Test flat_map. 
ds = ray.data.range(10, parallelism=10).flat_map( - lambda _: [np.ones((4, 4)), np.ones((4, 4))] + lambda _: [{"data": np.ones((4, 4))}, {"data": np.ones((4, 4))}] ) ds = ds.materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=10,\n" " num_rows=20,\n" - " schema={__value__: numpy.ndarray(shape=(4, 4), dtype=double)}\n" + " schema={data: numpy.ndarray(shape=(4, 4), dtype=double)}\n" ")" ) @@ -333,7 +343,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds = ds.materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=4,\n" " num_rows=24,\n" " schema={a: numpy.ndarray(shape=(4, 4), dtype=float64)}\n" @@ -346,7 +356,7 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds = ds.materialize() assert str(ds) == ( - "MaterializedDatastream(\n" + "MaterializedDataset(\n" " num_blocks=4,\n" " num_rows=16,\n" " schema={a: numpy.ndarray(shape=(None, None), dtype=float64)}\n" @@ -547,7 +557,7 @@ def test_tensors_in_tables_pandas_roundtrip( arr = np.arange(num_items).reshape(shape) df = pd.DataFrame({"one": list(range(outer_dim)), "two": TensorArray(arr)}) ds = ray.data.from_pandas(df) - ds = ds.map_batches(lambda df: df + 1, batch_size=2) + ds = ds.map_batches(lambda df: df + 1, batch_size=2, batch_format="pandas") ds_df = ds.to_pandas() expected_df = df + 1 if enable_automatic_tensor_extension_cast: @@ -568,7 +578,7 @@ def test_tensors_in_tables_pandas_roundtrip_variable_shaped( outer_dim = len(arrs) df = pd.DataFrame({"one": list(range(outer_dim)), "two": TensorArray(arrs)}) ds = ray.data.from_pandas(df) - ds = ds.map_batches(lambda df: df + 1, batch_size=2) + ds = ds.map_batches(lambda df: df + 1, batch_size=2, batch_format="pandas") ds_df = ds.to_pandas() expected_df = df + 1 if enable_automatic_tensor_extension_cast: @@ -586,7 +596,7 @@ def test_tensors_in_tables_parquet_roundtrip(ray_start_regular_shared, tmp_path) arr = 
np.arange(num_items).reshape(shape) df = pd.DataFrame({"one": list(range(outer_dim)), "two": TensorArray(arr)}) ds = ray.data.from_pandas(df) - ds = ds.map_batches(lambda df: df + 1, batch_size=2) + ds = ds.map_batches(lambda df: df + 1, batch_size=2, batch_format="pandas") ds.write_parquet(str(tmp_path)) ds = ray.data.read_parquet(str(tmp_path)) values = [[s["one"], s["two"]] for s in ds.take()] @@ -607,7 +617,7 @@ def test_tensors_in_tables_parquet_roundtrip_variable_shaped( outer_dim = len(arrs) df = pd.DataFrame({"one": list(range(outer_dim)), "two": TensorArray(arrs)}) ds = ray.data.from_pandas(df) - ds = ds.map_batches(lambda df: df + 1, batch_size=2) + ds = ds.map_batches(lambda df: df + 1, batch_size=2, batch_format="pandas") ds.write_parquet(str(tmp_path)) ds = ray.data.read_parquet(str(tmp_path)) values = [[s["one"], s["two"]] for s in ds.take()] @@ -759,7 +769,9 @@ def np_deser_udf(block: pa.Table): ds = ray.data.read_parquet(str(tmp_path), _block_udf=np_deser_udf) - assert isinstance(ds.schema().field_by_name(tensor_col_name).type, ArrowTensorType) + assert isinstance( + ds.schema().base_schema.field_by_name(tensor_col_name).type, ArrowTensorType + ) values = [[s["one"], s["two"]] for s in ds.take()] expected = list(zip(list(range(outer_dim)), arr)) @@ -793,7 +805,9 @@ def _block_udf(block: pa.Table): _block_udf=_block_udf, ) - assert isinstance(ds.schema().field_by_name(tensor_col_name).type, ArrowTensorType) + assert isinstance( + ds.schema().base_schema.field_by_name(tensor_col_name).type, ArrowTensorType + ) values = [[s["one"], s["two"]] for s in ds.take()] expected = list(zip(list(range(outer_dim)), arr + 1)) @@ -859,7 +873,7 @@ def test_tensors_in_tables_iter_batches( df.loc[:, "one"] = list(df["one"].to_numpy()) df.loc[:, "two"] = list(df["two"].to_numpy()) ds = ray.data.from_pandas([df1, df2]) - batches = list(ds.iter_batches(batch_size=2)) + batches = list(ds.iter_batches(batch_size=2, batch_format="pandas")) assert len(batches) == 3 
expected_batches = [df.iloc[:2], df.iloc[2:4], df.iloc[4:]] for batch, expected_batch in zip(batches, expected_batches): diff --git a/python/ray/data/tests/test_tf.py b/python/ray/data/tests/test_tf.py index 799c59547899..14b0bf781a45 100644 --- a/python/ray/data/tests/test_tf.py +++ b/python/ray/data/tests/test_tf.py @@ -6,7 +6,6 @@ import ray from ray.air import session from ray.air.config import ScalingConfig -from ray.air.constants import TENSOR_COLUMN_NAME from ray.data.preprocessors import Concatenator from ray.train.tensorflow import TensorflowTrainer @@ -186,19 +185,6 @@ def test_invalid_column_raises_error(self): with pytest.raises(ValueError): ds.to_tf(feature_columns="foo", label_columns="bar") - def test_simple_dataset_raises_error(self): - # `range` returns a simple dataset. - ds = ray.data.range(1) - with pytest.raises(NotImplementedError): - ds.to_tf(feature_columns="spam", label_columns="ham") - - def test_tensor_dataset_raises_error(self): - ds = ray.data.range_tensor(1) - with pytest.raises(NotImplementedError): - ds.to_tf( - feature_columns=TENSOR_COLUMN_NAME, label_columns=TENSOR_COLUMN_NAME - ) - if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_torch.py b/python/ray/data/tests/test_torch.py index e36587437f65..e2ecad237e71 100644 --- a/python/ray/data/tests/test_torch.py +++ b/python/ray/data/tests/test_torch.py @@ -319,7 +319,7 @@ def test_iter_torch_batches_tensor_ds(ray_start_regular_shared, pipelined): for _ in range(num_epochs): iterations = [] for batch in ds.iter_torch_batches(batch_size=2): - iterations.append(batch.numpy()) + iterations.append(batch["data"].numpy()) combined_iterations = np.concatenate(iterations) np.testing.assert_array_equal(arr, combined_iterations) diff --git a/python/ray/data/tests/test_transform_pyarrow.py b/python/ray/data/tests/test_transform_pyarrow.py index 3ff1e29d3263..a8ff698312b6 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ 
b/python/ray/data/tests/test_transform_pyarrow.py @@ -378,19 +378,19 @@ def test_convert_to_pyarrow(ray_start_regular_shared, tmp_path): def test_pyarrow(ray_start_regular_shared): - ds = ray.data.range_table(5) - assert ds.map(lambda x: {"b": x["value"] + 2}).take() == [ + ds = ray.data.range(5) + assert ds.map(lambda x: {"b": x["id"] + 2}).take() == [ {"b": 2}, {"b": 3}, {"b": 4}, {"b": 5}, {"b": 6}, ] - assert ds.map(lambda x: {"b": x["value"] + 2}).filter( + assert ds.map(lambda x: {"b": x["id"] + 2}).filter( lambda x: x["b"] % 2 == 0 ).take() == [{"b": 2}, {"b": 4}, {"b": 6}] - assert ds.filter(lambda x: x["value"] == 0).flat_map( - lambda x: [{"b": x["value"] + 2}, {"b": x["value"] + 20}] + assert ds.filter(lambda x: x["id"] == 0).flat_map( + lambda x: [{"b": x["id"] + 2}, {"b": x["id"] + 20}] ).take() == [{"b": 2}, {"b": 20}] diff --git a/python/ray/data/tests/test_util.py b/python/ray/data/tests/test_util.py index 0f3651f3d896..d64f3da9c715 100644 --- a/python/ray/data/tests/test_util.py +++ b/python/ray/data/tests/test_util.py @@ -3,7 +3,6 @@ import numpy as np from ray.data._internal.util import _check_pyarrow_version, _split_list -from ray.data._internal.usage import _recorded_block_formats from ray.data._internal.memory_tracing import ( trace_allocation, trace_deallocation, @@ -88,16 +87,6 @@ def test_list_splits(): assert _split_list(["foo", 1, [0], None], 3) == [["foo", 1], [[0]], [None]] -def test_block_format_usage(): - assert not _recorded_block_formats - ray.data.range(10).show() - assert set(_recorded_block_formats.keys()) == {"simple"} - ray.data.range_table(10).show() - assert set(_recorded_block_formats.keys()) == {"simple", "arrow"} - ray.data.range_table(10).map_batches(lambda x: x).show() - assert set(_recorded_block_formats.keys()) == {"simple", "arrow", "pandas"} - - if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_webdataset.py b/python/ray/data/tests/test_webdataset.py index 9771ee36d042..dd2f15fba72e 100644 
--- a/python/ray/data/tests/test_webdataset.py +++ b/python/ray/data/tests/test_webdataset.py @@ -142,7 +142,8 @@ def test_webdataset_coding(ray_start_2_cpus, tmp_path): image = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8) gray = np.random.randint(0, 255, (100, 100), dtype=np.uint8) dstruct = dict(a=[1], b=dict(c=2), d="hello") - ttensor = torch.tensor([1, 2, 3]) + # Note: tensors are supported as numpy format only in strict mode. + ttensor = torch.tensor([1, 2, 3]).numpy() sample = { "__key__": "foo", @@ -180,7 +181,7 @@ def test_webdataset_coding(ray_start_2_cpus, tmp_path): assert sample["mp"]["b"]["c"] == 2 assert isinstance(sample["json"], dict) assert sample["json"]["a"] == [1] - assert isinstance(sample["pt"], torch.Tensor) + assert isinstance(sample["pt"], np.ndarray) assert sample["pt"].tolist() == [1, 2, 3] # test the format argument to the default decoder and multiple decoders diff --git a/python/ray/data/tests/util.py b/python/ray/data/tests/util.py index 16f98876d921..9c66784d19ef 100644 --- a/python/ray/data/tests/util.py +++ b/python/ray/data/tests/util.py @@ -4,6 +4,8 @@ import tempfile import ray +STRICT_MODE = ray.data.DatasetContext.get_current().strict_mode + @ray.remote class Counter: @@ -31,3 +33,27 @@ def gen_bin_files(n): to_write = str(i) * 500 fp.write(to_write.encode()) yield (temp_dir, paths) + + +def column_udf(col, udf): + def wraps(row): + return {col: udf(row[col])} + + return wraps + + +# Ex: named_values("id", [1, 2, 3]) +# Ex: named_values(["id", "id2"], [(1, 1), (2, 2), (3, 3)]) +def named_values(col_names, tuples): + output = [] + if isinstance(col_names, list): + for t in tuples: + output.append({name: value for (name, value) in zip(col_names, t)}) + else: + for t in tuples: + output.append({name: value for (name, value) in zip((col_names,), (t,))}) + return output + + +def extract_values(col_name, tuples): + return [t[col_name] for t in tuples] diff --git a/python/ray/exceptions.py b/python/ray/exceptions.py index 
12f895af4f40..276acfd372c6 100644 --- a/python/ray/exceptions.py +++ b/python/ray/exceptions.py @@ -102,6 +102,7 @@ def __init__( pid=None, ip=None, actor_repr=None, + actor_id=None, ): """Initialize a RayTaskError.""" import ray @@ -119,6 +120,7 @@ def __init__( self.function_name = function_name self.traceback_str = traceback_str self.actor_repr = actor_repr + self._actor_id = actor_id # TODO(edoakes): should we handle non-serializable exception objects? self.cause = cause assert traceback_str is not None @@ -183,7 +185,9 @@ def __str__(self): f"(pid={self.pid}, ip={self.ip}" ) if self.actor_repr: - traceback_line += f", repr={self.actor_repr})" + traceback_line += ( + f", actor_id={self._actor_id}, repr={self.actor_repr})" + ) else: traceback_line += ")" code_from_internal_file = False @@ -273,6 +277,7 @@ def __init__(self, cause: Union[RayTaskError, ActorDiedErrorContext] = None): self.error_msg = self.base_error_msg elif isinstance(cause, RayTaskError): self._actor_init_failed = True + self.actor_id = cause._actor_id self.error_msg = ( "The actor died because of an error" " raised in its creation task, " diff --git a/python/ray/experimental/internal_kv.py b/python/ray/experimental/internal_kv.py index 5cb1ade991ec..862ff3bacc89 100644 --- a/python/ray/experimental/internal_kv.py +++ b/python/ray/experimental/internal_kv.py @@ -25,12 +25,12 @@ def _initialize_internal_kv(gcs_client: GcsClient): _initialized = True -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_initialized(): return global_gcs_client is not None -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_get( key: Union[str, bytes], *, namespace: Optional[Union[str, bytes]] = None ) -> bytes: @@ -44,7 +44,7 @@ def _internal_kv_get( return global_gcs_client.internal_kv_get(key, namespace) -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_exists( key: Union[str, bytes], *, namespace: Optional[Union[str, bytes]] = None ) -> bool: @@ -58,13 
+58,13 @@ def _internal_kv_exists( return global_gcs_client.internal_kv_exists(key, namespace) -@client_mode_hook(auto_init=False) +@client_mode_hook def _pin_runtime_env_uri(uri: str, *, expiration_s: int) -> None: """Pin a runtime_env URI for expiration_s.""" return global_gcs_client.pin_runtime_env_uri(uri, expiration_s) -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_put( key: Union[str, bytes], value: Union[str, bytes], @@ -94,7 +94,7 @@ def _internal_kv_put( return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0 -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_del( key: Union[str, bytes], *, @@ -109,7 +109,7 @@ def _internal_kv_del( return global_gcs_client.internal_kv_del(key, del_by_prefix, namespace) -@client_mode_hook(auto_init=False) +@client_mode_hook def _internal_kv_list( prefix: Union[str, bytes], *, namespace: Optional[Union[str, bytes]] = None ) -> List[bytes]: diff --git a/python/ray/experimental/state/api.py b/python/ray/experimental/state/api.py index c20454118bba..f62f5f2ef7cd 100644 --- a/python/ray/experimental/state/api.py +++ b/python/ray/experimental/state/api.py @@ -1,1397 +1,4 @@ -import logging -import threading -import urllib -import warnings -from contextlib import contextmanager -from dataclasses import fields -from typing import Any, Dict, Generator, List, Optional, Tuple, Union +from ray.util.state import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import -import requests - -from ray.dashboard.modules.dashboard_sdk import SubmissionClient -from ray.dashboard.utils import ( - get_address_for_submission_client, - ray_address_to_api_server_url, -) -from ray.experimental.state.common import ( - DEFAULT_LIMIT, - DEFAULT_RPC_TIMEOUT, - ActorState, - ClusterEventState, - GetApiOptions, - GetLogOptions, - JobState, - ListApiOptions, - NodeState, - ObjectState, - PlacementGroupState, - PredicateType, - RuntimeEnvState, - 
StateResource, - SummaryApiOptions, - SummaryResource, - SupportedFilterType, - TaskState, - WorkerState, - dict_to_state, -) -from ray.experimental.state.exception import RayStateApiException, ServerUnavailable - -logger = logging.getLogger(__name__) - - -@contextmanager -def warnings_on_slow_request( - *, address: str, endpoint: str, timeout: float, explain: bool -): - """A context manager to print warnings if the request is replied slowly. - - Warnings are printed 3 times - - Args: - address: The address of the endpoint. - endpoint: The name of the endpoint. - timeout: Request timeout in seconds. - explain: Whether ot not it will print the warning. - """ - # Do nothing if explain is not specified. - if not explain: - yield - return - - # Prepare timers to print warning. - # Print 3 times with exponential backoff. timeout / 2, timeout / 4, timeout / 8 - def print_warning(elapsed: float): - logger.info( - f"({round(elapsed, 2)} / {timeout} seconds) " - "Waiting for the response from the API server " - f"address {address}{endpoint}.", - ) - - warning_timers = [ - threading.Timer(timeout / i, print_warning, args=[timeout / i]) - for i in [2, 4, 8] - ] - - try: - for timer in warning_timers: - timer.start() - yield - finally: - # Make sure all timers are cancelled once request is terminated. - for timer in warning_timers: - timer.cancel() - - -""" -This file contains API client and methods for querying ray state. - -NOTE(rickyyx): This is still a work-in-progress API, and subject to changes. - -If you have any feedback, you could do so at either way as below: - 1. Report bugs/issues with details: https://forms.gle/gh77mwjEskjhN8G46 , - 2. Follow up in #ray-state-observability-dogfooding slack channel of Ray: - https://tinyurl.com/2pm26m4a" - - -Usage: - 1. [Recommended] With StateApiClient: - ``` - client = StateApiClient(address="auto") - data = client.list(StateResource.NODES) - ... - ``` - - 2. 
With SDK APIs: - The API creates a `StateApiClient` for each invocation. So if multiple - invocations of listing are used, it is better to reuse the `StateApiClient` - as suggested above. - ``` - data = list_nodes(address="auto") - ``` -""" - - -class StateApiClient(SubmissionClient): - """State API Client issues REST GET requests to the server for resource states.""" - - def __init__( - self, - address: Optional[str] = None, - cookies: Optional[Dict[str, Any]] = None, - headers: Optional[Dict[str, Any]] = None, - ): - """Initialize a StateApiClient and check the connection to the cluster. - - Args: - address: Ray bootstrap address (e.g. `127.0.0.0:6379`, `auto`), or Ray - Client adress (e.g. `ray://:10001`), or Ray dashboard - address (e.g. `http://:8265`). - If not provided, it will be detected automatically from any running - local Ray cluster. - cookies: Cookies to use when sending requests to the HTTP job server. - headers: Headers to use when sending requests to the HTTP job server, used - for cases like authentication to a remote cluster. - """ - if requests is None: - raise RuntimeError( - "The Ray state CLI & SDK require the ray[default] " - "installation: `pip install 'ray[default']``" - ) - if not headers: - headers = {"Content-Type": "application/json"} - - # Resolve API server URL - api_server_url = get_address_for_submission_client(address) - - super().__init__( - address=api_server_url, - create_cluster_if_needed=False, - headers=headers, - cookies=cookies, - ) - - @classmethod - def _make_param(cls, options: Union[ListApiOptions, GetApiOptions]) -> Dict: - options_dict = {} - for field in fields(options): - # TODO(rickyyx): We will need to find a way to pass server side timeout - # TODO(rickyyx): We will have to convert filter option - # slightly differently for now. But could we do k,v pair rather than this? - # I see we are also converting dict to XXXApiOptions later on, we could - # probably organize the marshaling a bit better. 
- if field.name == "filters": - options_dict["filter_keys"] = [] - options_dict["filter_predicates"] = [] - options_dict["filter_values"] = [] - for filter in options.filters: - if len(filter) != 3: - raise ValueError( - f"The given filter has incorrect input type, {filter}. " - "Provide (key, predicate, value) tuples." - ) - filter_k, filter_predicate, filter_val = filter - options_dict["filter_keys"].append(filter_k) - options_dict["filter_predicates"].append(filter_predicate) - options_dict["filter_values"].append(filter_val) - continue - - option_val = getattr(options, field.name) - if option_val is not None: - options_dict[field.name] = option_val - - return options_dict - - def _make_http_get_request( - self, - endpoint: str, - params: Dict, - timeout: float, - _explain: bool = False, - ) -> Dict: - with warnings_on_slow_request( - address=self._address, endpoint=endpoint, timeout=timeout, explain=_explain - ): - # Send a request. - response = None - try: - response = self._do_request( - "GET", - endpoint, - timeout=timeout, - params=params, - ) - # If we have a valid JSON error, don't raise a generic exception but - # instead let the caller parse it to raise a more precise exception. - if ( - response.status_code == 500 - and "application/json" - not in response.headers.get("Content-Type", "") - ): - response.raise_for_status() - except requests.exceptions.RequestException as e: - err_str = f"Failed to make request to {self._address}{endpoint}. " - - # Best-effort to give hints to users on potential reasons of connection - # failure. - err_str += ( - "Failed to connect to API server. Please check the API server " - "log for details. Make sure dependencies are installed with " - "`pip install ray[default]`. Please also check dashboard is " - "available, and included when starting ray cluster, " - "i.e. `ray start --include-dashboard=True --head`. 
" - ) - if response is None: - raise ServerUnavailable(err_str) - - err_str += f"Response(url={response.url},status={response.status_code})" - raise RayStateApiException(err_str) from e - - # Process the response. - response = response.json() - if response["result"] is False: - raise RayStateApiException( - "API server internal error. See dashboard.log file for more details. " - f"Error: {response['msg']}" - ) - - # Dictionary of `ListApiResponse` or `SummaryApiResponse` - return response["data"]["result"] - - def get( - self, - resource: StateResource, - id: str, - options: Optional[GetApiOptions], - _explain: bool = False, - ) -> Optional[ - Union[ - ActorState, - PlacementGroupState, - NodeState, - WorkerState, - TaskState, - List[ObjectState], - ] - ]: - """Get resources states by id - - Args: - resource_name: Resource names, i.e. 'workers', 'actors', 'nodes', - 'placement_groups', 'tasks', 'objects'. - 'jobs' and 'runtime-envs' are not supported yet. - id: ID for the resource, i.e. 'node_id' for nodes. - options: Get options. See `GetApiOptions` for details. - _explain: Print the API information such as API - latency or failed query information. - - Returns: - None if not found, and if found, a dictionarified: - - ActorState for actors - - PlacementGroupState for placement groups - - NodeState for nodes - - WorkerState for workers - - TaskState for tasks - - Empty list for objects if not found, or list of ObjectState for objects - - Raises: - This doesn't catch any exceptions raised when the underlying request - call raises exceptions. For example, it could raise `requests.Timeout` - when timeout occurs. - - ValueError: - if the resource could not be GET by id, i.e. jobs and runtime-envs. 
- - """ - # TODO(rickyyx): Make GET not using filters on list operation - params = self._make_param(options) - - RESOURCE_ID_KEY_NAME = { - StateResource.NODES: "node_id", - StateResource.ACTORS: "actor_id", - StateResource.PLACEMENT_GROUPS: "placement_group_id", - StateResource.WORKERS: "worker_id", - StateResource.TASKS: "task_id", - StateResource.OBJECTS: "object_id", - } - if resource not in RESOURCE_ID_KEY_NAME: - raise ValueError(f"Can't get {resource.name} by id.") - - params["filter_keys"] = [RESOURCE_ID_KEY_NAME[resource]] - params["filter_predicates"] = ["="] - params["filter_values"] = [id] - params["detail"] = True - endpoint = f"/api/v0/{resource.value}" - - list_api_response = self._make_http_get_request( - endpoint=endpoint, - params=params, - timeout=options.timeout, - _explain=_explain, - ) - result = list_api_response["result"] - - # Empty result - if len(result) == 0: - return None - - result = [dict_to_state(d, resource) for d in result] - if resource == StateResource.OBJECTS: - # NOTE(rickyyx): - # There might be multiple object entries for a single object id - # because a single object could be referenced at different places - # e.g. pinned as local variable, used as parameter - return result - - if resource == StateResource.TASKS: - # There might be multiple task attempts given a task id due to - # task retries. - if len(result) == 1: - return result[0] - return result - - # For the rest of the resources, there should only be a single entry - # for a particular id. - assert len(result) == 1 - return result[0] - - def _print_api_warning( - self, - resource: StateResource, - api_response: dict, - warn_data_source_not_available: bool = True, - warn_data_truncation: bool = True, - warn_limit: bool = True, - warn_server_side_warnings: bool = True, - ): - """Print the API warnings. - - Args: - resource: Resource names, i.e. 'jobs', 'actors', 'nodes', - see `StateResource` for details. 
- api_response: The dictionarified `ListApiResponse` or `SummaryApiResponse`. - warn_data_source_not_available: Warn when some data sources - are not available. - warn_data_truncation: Warn when results were truncated at - the data source. - warn_limit: Warn when results were limited. - warn_server_side_warnings: Warn when the server side generates warnings - (E.g., when callsites not enabled for listing objects) - """ - # Print warnings if anything was given. - if warn_data_source_not_available: - warning_msgs = api_response.get("partial_failure_warning", None) - if warning_msgs: - warnings.warn(warning_msgs) - - if warn_data_truncation: - # Print warnings if data is truncated at the data source. - num_after_truncation = api_response["num_after_truncation"] - total = api_response["total"] - if total > num_after_truncation: - # NOTE(rickyyx): For now, there's not much users - # could do (neither can we), with hard truncation. - # Unless we allow users to set a higher - # `RAY_MAX_LIMIT_FROM_DATA_SOURCE`, the data will - # always be truncated at the data source. - warnings.warn( - ( - "The returned data may contain incomplete result. " - f"{num_after_truncation} ({total} total from the cluster) " - f"{resource.value} are retrieved from the data source. " - f"{total - num_after_truncation} entries have been truncated. " - f"Max of {num_after_truncation} entries are retrieved " - "from data source to prevent over-sized payloads." - ), - ) - - if warn_limit: - # Print warnings if return data is limited at the API server due to - # limit enforced at the server side - num_filtered = api_response["num_filtered"] - data = api_response["result"] - if num_filtered > len(data): - warnings.warn( - ( - f"Limit last {len(data)} entries " - f"(Total {num_filtered}). Use `--filter` to reduce " - "the amount of data to return or " - "setting a higher limit with `--limit` to see all data. " - ), - ) - - if warn_server_side_warnings: - # Print the additional warnings. 
- warnings_to_print = api_response.get("warnings", []) - if warnings_to_print: - for warning_to_print in warnings_to_print: - warnings.warn(warning_to_print) - - def _raise_on_missing_output(self, resource: StateResource, api_response: dict): - """Raise an exception when the API resopnse contains a missing output. - - Output can be missing if (1) Failures on some of data source queries (e.g., - `ray list tasks` queries all raylets, and if some of queries fail, it will - contain missing output. If all queries fail, it will just fail). (2) Data - is truncated because the output is too large. - - Args: - resource: Resource names, i.e. 'jobs', 'actors', 'nodes', - see `StateResource` for details. - api_response: The dictionarified `ListApiResponse` or `SummaryApiResponse`. - """ - # Raise an exception if there are partial failures that cause missing output. - warning_msgs = api_response.get("partial_failure_warning", None) - if warning_msgs: - raise RayStateApiException( - f"Failed to retrieve all {resource.value} from the cluster because" - "they are not reachable due to query failures to the data sources. " - "To avoid raising an exception and allow having missing output, " - "set `raise_on_missing_output=False`. " - ) - # Raise an exception is there is data truncation that cause missing output. - total = api_response["total"] - num_after_truncation = api_response["num_after_truncation"] - - if total != num_after_truncation: - raise RayStateApiException( - f"Failed to retrieve all {resource.value} from the cluster because " - "they are not reachable due to data truncation. It happens " - "when the returned data is too large " - # When the data is truncated, the truncation - # threshold == num_after_truncation. We cannot set this to env - # var because the CLI side might not have the correct env var. - f"(> {num_after_truncation}) " - "To avoid raising an exception and allow having missing output, " - "set `raise_on_missing_output=False`. 
" - ) - - def list( - self, - resource: StateResource, - options: ListApiOptions, - raise_on_missing_output: bool, - _explain: bool = False, - ) -> List[ - Union[ - ActorState, - JobState, - NodeState, - TaskState, - ObjectState, - PlacementGroupState, - RuntimeEnvState, - WorkerState, - ClusterEventState, - ] - ]: - """List resources states - - Args: - resource: Resource names, i.e. 'jobs', 'actors', 'nodes', - see `StateResource` for details. - options: List options. See `ListApiOptions` for details. - raise_on_missing_output: When True, raise an exception if the output - is incomplete. Output can be incomplete if - (1) there's a partial network failure when the source is distributed. - (2) data is truncated because it is too large. - Set it to False to avoid throwing an exception on missing data. - _explain: Print the API information such as API - latency or failed query information. - - Returns: - A list of queried result from `ListApiResponse`, - - Raises: - This doesn't catch any exceptions raised when the underlying request - call raises exceptions. For example, it could raise `requests.Timeout` - when timeout occurs. - - """ - endpoint = f"/api/v0/{resource.value}" - params = self._make_param(options) - list_api_response = self._make_http_get_request( - endpoint=endpoint, - params=params, - timeout=options.timeout, - _explain=_explain, - ) - if raise_on_missing_output: - self._raise_on_missing_output(resource, list_api_response) - if _explain: - self._print_api_warning(resource, list_api_response) - return [dict_to_state(d, resource) for d in list_api_response["result"]] - - def summary( - self, - resource: SummaryResource, - *, - options: SummaryApiOptions, - raise_on_missing_output: bool, - _explain: bool = False, - ) -> Dict: - """Summarize resources states - - Args: - resource_name: Resource names, - see `SummaryResource` for details. - options: summary options. See `SummaryApiOptions` for details. 
- raise_on_missing_output: Raise an exception if the output has missing data. - Output can have missing data if (1) there's a partial network failure - when the source is distributed. (2) data is truncated - because it is too large. - _explain: Print the API information such as API - latency or failed query information. - - Returns: - A dictionary of queried result from `SummaryApiResponse`. - - Raises: - This doesn't catch any exceptions raised when the underlying request - call raises exceptions. For example, it could raise `requests.Timeout` - when timeout occurs. - """ - params = {"timeout": options.timeout} - endpoint = f"/api/v0/{resource.value}/summarize" - summary_api_response = self._make_http_get_request( - endpoint=endpoint, - params=params, - timeout=options.timeout, - _explain=_explain, - ) - if raise_on_missing_output: - self._raise_on_missing_output(resource, summary_api_response) - if _explain: - # There's no limit applied to summary, so we shouldn't warn. - self._print_api_warning(resource, summary_api_response, warn_limit=False) - return summary_api_response["result"]["node_id_to_summary"] - - -def get_actor( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[Dict]: - """Get an actor by id. - - Args: - id: Id of the actor - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state API requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - None if actor not found, or - :class:`ActorState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.ACTORS, id, GetApiOptions(timeout=timeout), _explain=_explain - ) - - -# TODO(rickyyx:alpha-obs) -def get_job( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[JobState]: - raise NotImplementedError("Get Job by id is currently not supported") - - -def get_placement_group( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[PlacementGroupState]: - """Get a placement group by id. - - Args: - id: Id of the placement group - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state APIs requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - None if actor not found, or - :class:`~ray.experimental.state.common.PlacementGroupState`. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.PLACEMENT_GROUPS, - id, - GetApiOptions(timeout=timeout), - _explain=_explain, - ) - - -def get_node( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[NodeState]: - """Get a node by id. - - Args: - id: Id of the node. - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state APIs requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - None if actor not found, or - :class:`NodeState `. - - Raises: - Exceptions: :class:`RayStateApiException ` - if the CLI is failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.NODES, - id, - GetApiOptions(timeout=timeout), - _explain=_explain, - ) - - -def get_worker( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[WorkerState]: - """Get a worker by id. - - Args: - id: Id of the worker - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state APIs requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - None if actor not found, or - :class:`WorkerState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.WORKERS, - id, - GetApiOptions(timeout=timeout), - _explain=_explain, - ) - - -def get_task( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> Optional[TaskState]: - """Get task attempts of a task by id. - - Args: - id: Id of the task - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state APIs requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - None if task not found, or a list of - :class:`~ray.experimental.state.common.TaskState` - from the task attempts. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.TASKS, - id, - GetApiOptions(timeout=timeout), - _explain=_explain, - ) - - -def get_objects( - id: str, - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - _explain: bool = False, -) -> List[ObjectState]: - """Get objects by id. - - There could be more than 1 entry returned since an object could be - referenced at different places. - - Args: - id: Id of the object - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout value for the state APIs requests made. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`~ray.experimental.state.common.ObjectState`. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).get( - StateResource.OBJECTS, - id, - GetApiOptions(timeout=timeout), - _explain=_explain, - ) - - -def list_actors( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[ActorState]: - """List actors in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("id", "=", "abcd")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `ActorState`) - will be queried and returned. See - :class:`ActorState `. 
- raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`ActorState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.ACTORS, - options=ListApiOptions( - limit=limit, - timeout=timeout, - filters=filters, - detail=detail, - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_placement_groups( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[PlacementGroupState]: - """List placement groups in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("state", "=", "abcd")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `PlacementGroupState`) - will be queried and returned. See - :class:`~ray.experimental.state.common.PlacementGroupState`. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of dictionarified - :class:`~ray.experimental.state.common.PlacementGroupState`. 
- - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.PLACEMENT_GROUPS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_nodes( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[NodeState]: - """List nodes in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("node_name", "=", "abcd")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `NodeState`) - will be queried and returned. See - :class:`NodeState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of dictionarified - :class:`NodeState `. - - Raises: - Exceptions: :class:`RayStateApiException ` - if the CLI failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.NODES, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_jobs( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[JobState]: - """List jobs submitted to the cluster by :ref: `ray job submission `. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("status", "=", "abcd")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `JobState`) - will be queried and returned. See - :class:`JobState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of dictionarified - :class:`JobState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.JOBS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_workers( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[WorkerState]: - """List workers in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("is_alive", "=", "True")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `WorkerState`) - will be queried and returned. See - :class:`WorkerState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`WorkerState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.WORKERS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_tasks( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[TaskState]: - """List tasks in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("is_alive", "=", "True")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `WorkerState`) - will be queried and returned. See - :class:`WorkerState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`TaskState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.TASKS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_objects( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[ObjectState]: - """List objects in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("ip", "=", "0.0.0.0")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `ObjectState`) - will be queried and returned. See - :class:`ObjectState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`ObjectState `. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.OBJECTS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_runtime_envs( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[RuntimeEnvState]: - """List runtime environments in the cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - filters: List of tuples of filter key, predicate (=, or !=), and - the filter value. E.g., `("node_id", "=", "abcdef")` - limit: Max number of entries returned by the state backend. - timeout: Max timeout value for the state APIs requests made. - detail: When True, more details info (specified in `RuntimeEnvState`) - will be queried and returned. See - :class:`RuntimeEnvState `. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Returns: - List of - :class:`RuntimeEnvState `. - - Raises: - Exceptions: :class:`RayStateApiException ` - if the CLI failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).list( - StateResource.RUNTIME_ENVS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def list_cluster_events( - address: Optional[str] = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - limit: int = DEFAULT_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - detail: bool = False, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> List[Dict]: - return StateApiClient(address=address).list( - StateResource.CLUSTER_EVENTS, - options=ListApiOptions( - limit=limit, timeout=timeout, filters=filters, detail=detail - ), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -""" -Log APIs -""" - - -def get_log( - address: Optional[str] = None, - node_id: Optional[str] = None, - node_ip: Optional[str] = None, - filename: Optional[str] = None, - actor_id: Optional[str] = None, - task_id: Optional[str] = None, - pid: Optional[int] = None, - follow: bool = False, - tail: int = -1, - timeout: int = DEFAULT_RPC_TIMEOUT, - suffix: str = "out", - encoding: Optional[str] = "utf-8", - errors: Optional[str] = "strict", - _interval: Optional[float] = None, -) -> Generator[str, None, None]: - """Retrieve log file based on file name or some entities ids (pid, actor id, task id). - - Examples: - >>> import ray - >>> from ray.experimental.state.api import get_log # doctest: +SKIP - # To connect to an existing ray instance if there is - >>> ray.init("auto") # doctest: +SKIP - # Node IP could be retrieved from list_nodes() or ray.nodes() - >>> node_ip = "172.31.47.143" # doctest: +SKIP - >>> filename = "gcs_server.out" # doctest: +SKIP - >>> for l in get_log(filename=filename, node_ip=node_ip): # doctest: +SKIP - >>> print(l) # doctest: +SKIP - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. 
- If not specified, it will be retrieved from the initialized ray cluster. - node_id: Id of the node containing the logs . - node_ip: Ip of the node containing the logs. (At least one of the node_id and - node_ip have to be supplied when identifying a node). - filename: Name of the file (relative to the ray log directory) to be retrieved. - actor_id: Id of the actor if getting logs from an actor. - task_id: Id of the task if getting logs generated by a task. - pid: PID of the worker if getting logs generated by a worker. When querying - with pid, either node_id or node_ip must be supplied. - follow: When set to True, logs will be streamed and followed. - tail: Number of lines to get from the end of the log file. Set to -1 for getting - the entire log. - timeout: Max timeout for requests made when getting the logs. - suffix: The suffix of the log file if query by id of tasks/workers/actors. Default to "out". - encoding: The encoding used to decode the content of the log file. Default is - "utf-8". Use None to get binary data directly. - errors: The error handling scheme to use for decoding errors. Default is - "strict". See https://docs.python.org/3/library/codecs.html#error-handlers - _interval: The interval in secs to print new logs when `follow=True`. - - Return: - A Generator of log line, None for SendType and ReturnType. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. 
- """ # noqa: E501 - - api_server_url = ray_address_to_api_server_url(address) - media_type = "stream" if follow else "file" - - options = GetLogOptions( - node_id=node_id, - node_ip=node_ip, - filename=filename, - actor_id=actor_id, - task_id=task_id, - pid=pid, - lines=tail, - interval=_interval, - media_type=media_type, - timeout=timeout, - suffix=suffix, - ) - options_dict = {} - for field in fields(options): - option_val = getattr(options, field.name) - if option_val: - options_dict[field.name] = option_val - - with requests.get( - f"{api_server_url}/api/v0/logs/{media_type}?" - f"{urllib.parse.urlencode(options_dict)}", - stream=True, - ) as r: - if r.status_code != 200: - raise RayStateApiException(r.text) - for bytes in r.iter_content(chunk_size=None): - bytes = bytearray(bytes) - # First byte 1 means success. - if bytes.startswith(b"1"): - bytes.pop(0) - logs = bytes - if encoding is not None: - logs = bytes.decode(encoding=encoding, errors=errors) - else: - assert bytes.startswith(b"0") - error_msg = bytes.decode("utf-8") - raise RayStateApiException(error_msg) - yield logs - - -def list_logs( - address: Optional[str] = None, - node_id: Optional[str] = None, - node_ip: Optional[str] = None, - glob_filter: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, -) -> Dict[str, List[str]]: - """Listing log files available. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If not specified, it will be retrieved from the initialized ray cluster. - node_id: Id of the node containing the logs. - node_ip: Ip of the node containing the logs. - glob_filter: Name of the file (relative to the ray log directory) to be - retrieved. E.g. `glob_filter="*worker*"` for all worker logs. - actor_id: Id of the actor if getting logs from an actor. - timeout: Max timeout for requests made when getting the logs. - _interval: The interval in secs to print new logs when `follow=True`. 
- - Return: - A dictionary where the keys are log groups (e.g. gcs, raylet, worker), and - values are list of log filenames. - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data, or ConnectionError if failed to resolve the - ray address. - """ # noqa: E501 - assert ( - node_ip is not None or node_id is not None - ), "At least one of node ip and node id is required" - - api_server_url = ray_address_to_api_server_url(address) - - if not glob_filter: - glob_filter = "*" - - options_dict = {} - if node_ip: - options_dict["node_ip"] = node_ip - if node_id: - options_dict["node_id"] = node_id - if glob_filter: - options_dict["glob"] = glob_filter - options_dict["timeout"] = timeout - - r = requests.get( - f"{api_server_url}/api/v0/logs?{urllib.parse.urlencode(options_dict)}" - ) - r.raise_for_status() - - response = r.json() - if response["result"] is False: - raise RayStateApiException( - "API server internal error. See dashboard.log file for more details. " - f"Error: {response['msg']}" - ) - return response["data"]["result"] - - -""" -Summary APIs -""" - - -def summarize_tasks( - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> Dict: - """Summarize the tasks in cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout for requests made when getting the states. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Return: - Dictionarified - :class:`~ray.experimental.state.common.TaskSummaries` - - Raises: - Exceptions: :class:`RayStateApiException ` - if the CLI is failed to query the data. 
- """ # noqa: E501 - return StateApiClient(address=address).summary( - SummaryResource.TASKS, - options=SummaryApiOptions(timeout=timeout), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def summarize_actors( - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> Dict: - """Summarize the actors in cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout for requests made when getting the states. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. - - Return: - Dictionarified - :class:`~ray.experimental.state.common.ActorSummaries` - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).summary( - SummaryResource.ACTORS, - options=SummaryApiOptions(timeout=timeout), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) - - -def summarize_objects( - address: Optional[str] = None, - timeout: int = DEFAULT_RPC_TIMEOUT, - raise_on_missing_output: bool = True, - _explain: bool = False, -) -> Dict: - """Summarize the objects in cluster. - - Args: - address: Ray bootstrap address, could be `auto`, `localhost:6379`. - If None, it will be resolved automatically from an initialized ray. - timeout: Max timeout for requests made when getting the states. - raise_on_missing_output: When True, exceptions will be raised if - there is missing data due to truncation/data source unavailable. - _explain: Print the API information such as API latency or - failed query information. 
- - Return: - Dictionarified :class:`~ray.experimental.state.common.ObjectSummaries` - - Raises: - Exceptions: :class:`RayStateApiException ` if the CLI - failed to query the data. - """ # noqa: E501 - return StateApiClient(address=address).summary( - SummaryResource.OBJECTS, - options=SummaryApiOptions(timeout=timeout), - raise_on_missing_output=raise_on_missing_output, - _explain=_explain, - ) +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/common.py b/python/ray/experimental/state/common.py index 7433a1040822..6fbd488cd0fe 100644 --- a/python/ray/experimental/state/common.py +++ b/python/ray/experimental/state/common.py @@ -1,1451 +1,4 @@ -import json -import logging -import sys -from abc import ABC -from dataclasses import field, fields -from enum import Enum, unique -from typing import Dict, List, Optional, Set, Tuple, Union +from ray.util.state.common import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import -import ray.dashboard.utils as dashboard_utils -from ray._private.ray_constants import env_integer -from ray.core.generated.common_pb2 import TaskStatus, TaskType -from ray.core.generated.gcs_pb2 import TaskEvents -from ray.dashboard.modules.job.common import JobInfo -from ray.experimental.state.custom_types import ( - TypeActorStatus, - TypeNodeStatus, - TypePlacementGroupStatus, - TypeReferenceType, - TypeTaskStatus, - TypeTaskType, - TypeWorkerExitType, - TypeWorkerType, -) -from ray.experimental.state.exception import RayStateApiException - -try: - from pydantic.dataclasses import dataclass -except ImportError: - # pydantic is not available in the dashboard. - # We will use the dataclass from the standard library. 
- from dataclasses import dataclass - - -logger = logging.getLogger(__name__) - -DEFAULT_RPC_TIMEOUT = 30 -DEFAULT_LIMIT = 100 -DEFAULT_LOG_LIMIT = 1000 - -# Max number of entries from API server to the client -RAY_MAX_LIMIT_FROM_API_SERVER = env_integer( - "RAY_MAX_LIMIT_FROM_API_SERVER", 10 * 1000 -) # 10k - -# Max number of entries from data sources (rest will be truncated at the -# data source, e.g. raylet) -RAY_MAX_LIMIT_FROM_DATA_SOURCE = env_integer( - "RAY_MAX_LIMIT_FROM_DATA_SOURCE", 10 * 1000 -) # 10k - - -@unique -class StateResource(Enum): - ACTORS = "actors" - JOBS = "jobs" - PLACEMENT_GROUPS = "placement_groups" - NODES = "nodes" - WORKERS = "workers" - TASKS = "tasks" - OBJECTS = "objects" - RUNTIME_ENVS = "runtime_envs" - CLUSTER_EVENTS = "cluster_events" - - -@unique -class SummaryResource(Enum): - ACTORS = "actors" - TASKS = "tasks" - OBJECTS = "objects" - - -SupportedFilterType = Union[str, bool, int, float] - - -PredicateType = str # Literal["=", "!="] - - -@dataclass(init=True) -class ListApiOptions: - # Maximum number of entries to return - limit: int = DEFAULT_LIMIT - # The timeout for the API call. - timeout: int = DEFAULT_RPC_TIMEOUT - # If True, more detailed output will be printed. - # The API could query more sources than detail == False - # to get more data in detail. - detail: bool = False - # Filters. Each tuple pair (key, predicate, value) means key predicate value. - # If there's more than 1 filter, it means AND. - # E.g., [(key, "=", val), (key2, "!=" val2)] means (key=val) AND (key2!=val2) - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = field( - default_factory=list - ) - # [only tasks] If driver tasks should be excluded. - exclude_driver: bool = True - # When the request is processed on the server side, - # we should apply multiplier so that server side can finish - # processing a request within timeout. Otherwise, - # timeout will always lead Http timeout. 
- server_timeout_multiplier: float = 0.8 - - def __post_init__(self): - # To return the data to users, when there's a partial failure - # we need to have a timeout that's smaller than the users' timeout. - # 80% is configured arbitrarily. - self.timeout = int(self.timeout * self.server_timeout_multiplier) - assert self.timeout != 0, "0 second timeout is not supported." - if self.filters is None: - self.filters = [] - - for filter in self.filters: - _, filter_predicate, _ = filter - if filter_predicate != "=" and filter_predicate != "!=": - raise ValueError( - f"Unsupported filter predicate {filter_predicate} is given. " - "Available predicates: =, !=." - ) - - -@dataclass(init=True) -class GetApiOptions: - # Timeout for the HTTP request - timeout: int = DEFAULT_RPC_TIMEOUT - - -@dataclass(init=True) -class SummaryApiOptions: - # Timeout for the HTTP request - timeout: int = DEFAULT_RPC_TIMEOUT - - # Filters. Each tuple pair (key, predicate, value) means key predicate value. - # If there's more than 1 filter, it means AND. - # E.g., [(key, "=", val), (key2, "!=" val2)] means (key=val) AND (key2!=val2) - # For summary endpoints that call list under the hood, we'll pass - # these filters directly into the list call. - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = field( - default_factory=list - ) - - # Change out to summarize the output. There is a summary_by value for each entity. - # Tasks: by func_name - # Actors: by class - # Objects: by callsite - summary_by: Optional[str] = None - - -def state_column(*, filterable: bool, detail: bool = False, **kwargs): - """A wrapper around dataclass.field to add additional metadata. - - The metadata is used to define detail / filterable option of - each column. - - Args: - detail: If True, the column is used when detail == True - filterable: If True, the column can be used for filtering. - kwargs: The same kwargs for the `dataclasses.field` function. 
- """ - m = {"detail": detail, "filterable": filterable} - - # Default for detail field is None since it could be missing. - if detail and "default" not in kwargs: - kwargs["default"] = None - - if "metadata" in kwargs: - kwargs["metadata"].update(m) - else: - kwargs["metadata"] = m - return field(**kwargs) - - -class StateSchema(ABC): - """Schema class for Ray resource abstraction. - - The child class must be dataclass. All child classes - - perform runtime type checking upon initialization. - - are supposed to use `state_column` instead of `field`. - It will allow the class to return filterable/detail columns. - If `state_column` is not specified, that column is not filterable - and for non-detail output. - - For example, - ``` - @dataclass - class State(StateSchema): - column_a: str - column_b: int = state_column(detail=True, filterable=True) - - s = State(column_a="abc", b=1) - # Returns {"column_b"} - s.filterable_columns() - # Returns {"column_a"} - s.base_columns() - # Returns {"column_a", "column_b"} - s.columns() - ``` - """ - - @classmethod - def list_columns(cls, detail: bool = True) -> List[str]: - """Return a list of columns.""" - cols = [] - for f in fields(cls): - if detail: - cols.append(f.name) - elif not f.metadata.get("detail", False): - cols.append(f.name) - - return cols - - @classmethod - def columns(cls) -> Set[str]: - """Return a set of all columns.""" - return set(cls.list_columns()) - - @classmethod - def filterable_columns(cls) -> Set[str]: - """Return a list of filterable columns""" - filterable = set() - for f in fields(cls): - if f.metadata.get("filterable", False): - filterable.add(f.name) - return filterable - - @classmethod - def base_columns(cls) -> Set[str]: - """Return a list of base columns. - - Base columns mean columns to return when detail == False. - """ - return set(cls.list_columns(detail=False)) - - @classmethod - def detail_columns(cls) -> Set[str]: - """Return a list of detail columns. 
- - Detail columns mean columns to return when detail == True. - """ - return set(cls.list_columns(detail=True)) - - # Allow dict like access on the class directly for backward compatibility. - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - setattr(self, key, value) - - def get(self, key, default=None): - return getattr(self, key, default) - - -def filter_fields(data: dict, state_dataclass: StateSchema, detail: bool) -> dict: - """Filter the given data's columns based on the given schema. - - Args: - data: A single data entry to filter columns. - state_dataclass: The schema to filter data. - detail: Whether or not it should include columns for detail output. - """ - filtered_data = {} - columns = state_dataclass.columns() if detail else state_dataclass.base_columns() - for col in columns: - if col in data: - filtered_data[col] = data[col] - else: - filtered_data[col] = None - return filtered_data - - -@dataclass(init=True) -class GetLogOptions: - timeout: int - node_id: Optional[str] = None - node_ip: Optional[str] = None - # One of {file, stream}. File means it will return the whole log. - # stream means it will keep the connection and streaming the log. - media_type: str = "file" - # The file name of the log. - filename: Optional[str] = None - # The actor id of the log. It is used only for worker logs. - actor_id: Optional[str] = None - # The task id of the log. It is used only for worker logs. - # This is currently not working. TODO(sang): Support task log. - task_id: Optional[str] = None - # The pid of the log. It is used only for worker logs. - pid: Optional[int] = None - # Total log lines to return. - lines: int = 1000 - # The interval where new logs are streamed to. - # Should be used only when media_type == stream. - interval: Optional[float] = None - # The suffix of the log file if file resolution not through filename directly. - # Default to "out". 
- suffix: str = "out" - - def __post_init__(self): - if self.pid: - self.pid = int(self.pid) - if self.interval: - self.interval = float(self.interval) - self.lines = int(self.lines) - - if self.task_id: - raise NotImplementedError("task_id is not supported yet.") - - if self.media_type == "file": - assert self.interval is None - if self.media_type not in ["file", "stream"]: - raise ValueError(f"Invalid media type: {self.media_type}") - if not (self.node_id or self.node_ip) and not (self.actor_id or self.task_id): - raise ValueError( - "node_id or node_ip must be provided as constructor arguments when no " - "actor or task_id is supplied as arguments." - ) - if self.node_id and self.node_ip: - raise ValueError( - "Both node_id and node_ip are given. Only one of them can be provided. " - f"Given node id: {self.node_id}, given node ip: {self.node_ip}" - ) - if not (self.actor_id or self.task_id or self.pid or self.filename): - raise ValueError( - "None of actor_id, task_id, pid, or filename is provided. " - "At least one of them is required to fetch logs." - ) - - if self.suffix not in ["out", "err"]: - raise ValueError( - f"Invalid suffix: {self.suffix}. Must be one of 'out' or 'err'." - ) - - -# See the ActorTableData message in gcs.proto for all potential options that -# can be included in this class. -@dataclass(init=True) -class ActorState(StateSchema): - """Actor State""" - - #: The id of the actor. - actor_id: str = state_column(filterable=True) - #: The class name of the actor. - class_name: str = state_column(filterable=True) - #: The state of the actor. - #: - #: - DEPENDENCIES_UNREADY: Actor is waiting for dependency to be ready. - #: E.g., a new actor is waiting for object ref that's created from - #: other remote task. - #: - PENDING_CREATION: Actor's dependency is ready, but it is not created yet. 
- #:     It could be because there are not enough resources, too many actor
- #:     entries in the scheduler queue, or the actor creation is slow
- #:     (e.g., slow runtime environment creation,
- #:     slow worker startup, or etc.).
- #: - ALIVE: The actor is created, and it is alive.
- #: - RESTARTING: The actor is dead, and it is restarting.
- #:   It is equivalent to `PENDING_CREATION`,
- #:   but means the actor was dead more than once.
- #: - DEAD: The actor is permanently dead.
- state: TypeActorStatus = state_column(filterable=True)
- #: The job id of this actor.
- job_id: str = state_column(filterable=True)
- #: The name of the actor given by the `name` argument.
- name: Optional[str] = state_column(filterable=True)
- #: The node id of this actor.
- #: If the actor is restarting, it could be the node id
- #: of the dead actor (and it will be re-updated when
- #: the actor is successfully restarted).
- node_id: Optional[str] = state_column(filterable=True)
- #: The pid of the actor. 0 if it is not created yet.
- pid: Optional[int] = state_column(filterable=True)
- #: The namespace of the actor.
- ray_namespace: Optional[str] = state_column(filterable=True)
- #: The runtime environment information of the actor.
- serialized_runtime_env: Optional[str] = state_column(filterable=False, detail=True)
- #: The resource requirement of the actor.
- required_resources: Optional[dict] = state_column(filterable=False, detail=True)
- #: Actor's death information in detail. None if the actor is not dead yet.
- death_cause: Optional[dict] = state_column(filterable=False, detail=True)
- #: True if the actor is detached. False otherwise.
- is_detached: Optional[bool] = state_column(filterable=False, detail=True)
- #: The placement group id that's associated with this actor.
- placement_group_id: Optional[str] = state_column(detail=True, filterable=True)
- #: Actor's repr name if a customized __repr__ method exists, else empty string.
- repr_name: Optional[str] = state_column(detail=True, filterable=True)
-
-
- @dataclass(init=True)
- class PlacementGroupState(StateSchema):
- """PlacementGroup State"""
-
- #: The id of the placement group.
- placement_group_id: str = state_column(filterable=True)
- #: The name of the placement group if it is given by the name argument.
- name: str = state_column(filterable=True)
- #: The job id of the placement group.
- creator_job_id: str = state_column(filterable=True)
- #: The state of the placement group.
- #:
- #: - PENDING: The placement group creation is pending scheduling.
- #:   It could be because there's not enough resources, some of creation
- #:   stage has failed (e.g., failed to commit placement groups because
- #:   the node is dead).
- #: - CREATED: The placement group is created.
- #: - REMOVED: The placement group is removed.
- #: - RESCHEDULING: The placement group is rescheduling because some of
- #:   bundles are dead because they were on dead nodes.
- state: TypePlacementGroupStatus = state_column(filterable=True)
- #: The bundle specification of the placement group.
- bundles: Optional[List[dict]] = state_column(filterable=False, detail=True)
- #: True if the placement group is detached. False otherwise.
- is_detached: Optional[bool] = state_column(filterable=True, detail=True)
- #: The scheduling stats of the placement group.
- stats: Optional[dict] = state_column(filterable=False, detail=True)
-
-
- @dataclass(init=True)
- class NodeState(StateSchema):
- """Node State"""
-
- #: The id of the node.
- node_id: str = state_column(filterable=True)
- #: The ip address of the node.
- node_ip: str = state_column(filterable=True)
- #: If this is a head node.
- is_head_node: bool = state_column(filterable=True)
- #: The state of the node.
- #:
- #: ALIVE: The node is alive.
- #: DEAD: The node is dead.
- state: TypeNodeStatus = state_column(filterable=True)
- #: The name of the node if it is given by the name argument.
- node_name: str = state_column(filterable=True)
- #: The total resources of the node.
- resources_total: dict = state_column(filterable=False)
- #: The time when the node (raylet) starts.
- start_time_ms: Optional[int] = state_column(filterable=False, detail=True)
- #: The time when the node exits. The timestamp could be delayed
- #: if the node is dead unexpectedly (could be delayed
- # up to 30 seconds).
- end_time_ms: Optional[int] = state_column(filterable=False, detail=True)
-
-
- @dataclass(init=True)
- class JobState(JobInfo, StateSchema):
- """The state of the job that's submitted by Ray's Job APIs"""
-
- job_id: Optional[str] = state_column(filterable=False, default=None)
-
- @classmethod
- def filterable_columns(cls) -> Set[str]:
- return {"status", "entrypoint", "error_type"}
-
- @classmethod
- def list_columns(cls, detail: bool) -> List[str]:
- return ["job_id"] + [f.name for f in fields(JobInfo)]
-
-
- @dataclass(init=True)
- class WorkerState(StateSchema):
- """Worker State"""
-
- #: The id of the worker.
- worker_id: str = state_column(filterable=True)
- #: Whether or not the worker is alive.
- is_alive: bool = state_column(filterable=True)
- #: The type of the worker.
- #:
- #: - WORKER: The regular Ray worker process that executes tasks or
- # instantiates an actor.
- #: - DRIVER: The driver (Python script that calls `ray.init`).
- #: - SPILL_WORKER: The worker that spills objects.
- #: - RESTORE_WORKER: The worker that restores objects.
- worker_type: TypeWorkerType = state_column(filterable=True)
- #: The exit type of the worker if the worker is dead.
- #:
- #: - SYSTEM_ERROR: Worker exit due to system level failures (i.e. worker crash).
- #: - INTENDED_SYSTEM_EXIT: System-level exit that is intended. E.g.,
- #: Workers are killed because they are idle for a long time.
- #: - USER_ERROR: Worker exits because of user error.
- #: E.g., exceptions from the actor initialization.
- #: - INTENDED_USER_EXIT: Intended exit from users (e.g., users exit
- #: workers with exit code 0 or exit initiated by Ray API such as ray.kill).
- exit_type: Optional[TypeWorkerExitType] = state_column(filterable=True)
- #: The node id of the worker.
- node_id: str = state_column(filterable=True)
- #: The ip address of the worker.
- ip: str = state_column(filterable=True)
- #: The pid of the worker.
- pid: int = state_column(filterable=True)
- #: The exit detail of the worker if the worker is dead.
- exit_detail: Optional[str] = state_column(detail=True, filterable=False)
- #: The time worker is first launched.
- #: -1 if the value doesn't exist.
- #: The lifecycle of worker is as follows.
- #: worker_launch_time_ms (process startup requested).
- #: -> worker_launched_time_ms (process started).
- #: -> start_time_ms (worker is ready to be used).
- #: -> end_time_ms (worker is destroyed).
- worker_launch_time_ms: Optional[int] = state_column(filterable=False, detail=True)
- #: The time worker is successfully launched
- #: -1 if the value doesn't exist.
- worker_launched_time_ms: Optional[int] = state_column(filterable=False, detail=True)
- #: The time when the worker is started and initialized.
- #: 0 if the value doesn't exist.
- start_time_ms: Optional[int] = state_column(filterable=False, detail=True)
- #: The time when the worker exits. The timestamp could be delayed
- #: if the worker is dead unexpectedly.
- #: 0 if the value doesn't exist.
- end_time_ms: Optional[int] = state_column(filterable=False, detail=True) - - -@dataclass(init=True) -class ClusterEventState(StateSchema): - severity: str = state_column(filterable=True) - time: str = state_column(filterable=False) - source_type: str = state_column(filterable=True) - message: str = state_column(filterable=False) - event_id: str = state_column(filterable=True) - custom_fields: Optional[dict] = state_column(filterable=False, detail=True) - - -@dataclass(init=True) -class TaskState(StateSchema): - """Task State""" - - #: The id of the task. - task_id: str = state_column(filterable=True) - #: The attempt (retry) number of the task. - attempt_number: int = state_column(filterable=True) - #: The name of the task if it is given by the name argument. - name: str = state_column(filterable=True) - #: The state of the task. - #: - #: Refer to src/ray/protobuf/common.proto for a detailed explanation of the state - #: breakdowns and typical state transition flow. - #: - state: TypeTaskStatus = state_column(filterable=True) - #: The job id of this task. - job_id: str = state_column(filterable=True) - #: The actor id that's associated with this task. - #: It is empty if there's no relevant actors. - actor_id: Optional[str] = state_column(filterable=True) - #: The type of the task. - #: - #: - NORMAL_TASK: Tasks created by `func.remote()`` - #: - ACTOR_CREATION_TASK: Actors created by `class.remote()` - #: - ACTOR_TASK: Actor tasks submitted by `actor.method.remote()` - #: - DRIVER_TASK: Driver (A script that calls `ray.init`). - type: TypeTaskType = state_column(filterable=True) - #: The name of the task. If is the name of the function - #: if the type is a task or an actor task. - #: It is the name of the class if it is a actor scheduling task. - func_or_class_name: str = state_column(filterable=True) - #: The parent task id. If the parent is a normal task, it will be the task's id. 
- #: If the parent runs in a concurrent actor (async actor or threaded actor), - #: it will be the actor's creation task id. - parent_task_id: str = state_column(filterable=True) - #: Id of the node that runs the task. If the task is retried, it could - #: contain the node id of the previous executed task. - #: If empty, it means the task hasn't been scheduled yet. - node_id: Optional[str] = state_column(filterable=True) - #: The worker id that's associated with this task. - worker_id: Optional[str] = state_column(filterable=True) - #: Task error type. - error_type: Optional[str] = state_column(filterable=True) - #: The language of the task. E.g., Python, Java, or Cpp. - language: Optional[str] = state_column(detail=True, filterable=True) - #: The required resources to execute the task. - required_resources: Optional[dict] = state_column(detail=True, filterable=False) - #: The runtime environment information for the task. - runtime_env_info: Optional[dict] = state_column(detail=True, filterable=False) - #: The placement group id that's associated with this task. - placement_group_id: Optional[str] = state_column(detail=True, filterable=True) - #: The list of events of the given task. - #: Refer to src/ray/protobuf/common.proto for a detailed explanation of the state - #: breakdowns and typical state transition flow. - events: Optional[List[dict]] = state_column(detail=True, filterable=False) - #: The list of profile events of the given task. - profiling_data: Optional[dict] = state_column(detail=True, filterable=False) - #: The time when the task is created. A Unix timestamp in ms. - creation_time_ms: Optional[int] = state_column(detail=True, filterable=False) - #: The time when the task starts to run. A Unix timestamp in ms. - start_time_ms: Optional[int] = state_column(detail=True, filterable=False) - #: The time when the task is finished or failed. A Unix timestamp in ms. 
- end_time_ms: Optional[int] = state_column(detail=True, filterable=False) - #: The task logs info, e.g. offset into the worker log file when the task - #: starts/finishes. - task_log_info: Optional[dict] = state_column(detail=True, filterable=False) - #: Task error detail info. - error_message: Optional[str] = state_column(detail=True, filterable=False) - - -@dataclass(init=True) -class ObjectState(StateSchema): - """Object State""" - - #: The id of the object. - object_id: str = state_column(filterable=True) - #: The size of the object in mb. - object_size: int = state_column(filterable=True) - #: The status of the task that creates the object. - #: - #: - NIL: We don't have a status for this task because we are not the owner or the - #: task metadata has already been deleted. - #: - WAITING_FOR_DEPENDENCIES: The task is waiting for its dependencies - #: to be created. - #: - SCHEDULED: All dependencies have been created and the task is - #: scheduled to execute. - #: It could be because the task is waiting for resources, - #: runtime environmenet creation, fetching dependencies to the - #: local node, and etc.. - #: - FINISHED: The task finished successfully. - #: - WAITING_FOR_EXECUTION: The task is scheduled properly and - #: waiting for execution. It includes time to deliver the task - #: to the remote worker + queueing time from the execution side. - #: - RUNNING: The task that is running. - task_status: TypeTaskStatus = state_column(filterable=True) - #: The reference type of the object. - #: See :ref:`Debugging with Ray Memory ` for more details. - #: - #: - ACTOR_HANDLE: The reference is an actor handle. - #: - PINNED_IN_MEMORY: The object is pinned in memory, meaning there's - #: in-flight `ray.get` on this reference. - #: - LOCAL_REFERENCE: There's a local reference (e.g., Python reference) - #: to this object reference. The object won't be GC'ed until all of them is gone. - #: - USED_BY_PENDING_TASK: The object reference is passed to other tasks. 
E.g., - #: `a = ray.put()` -> `task.remote(a)`. In this case, a is used by a
- #: pending task `task`.
- #: - CAPTURED_IN_OBJECT: The object is serialized by other objects. E.g.,
- #: `a = ray.put(1)` -> `b = ray.put([a])`. a is serialized within a list.
- #: - UNKNOWN_STATUS: The object ref status is unknown.
- reference_type: TypeReferenceType = state_column(filterable=True)
- #: The callsite of the object.
- call_site: str = state_column(filterable=True)
- #: The worker type that creates the object.
- #:
- #: - WORKER: The regular Ray worker process that executes tasks or
- #: instantiates an actor.
- #: - DRIVER: The driver (Python script that calls `ray.init`).
- #: - SPILL_WORKER: The worker that spills objects.
- #: - RESTORE_WORKER: The worker that restores objects.
- type: TypeWorkerType = state_column(filterable=True)
- #: The pid of the owner.
- pid: int = state_column(filterable=True)
- #: The ip address of the owner.
- ip: str = state_column(filterable=True)
-
-
- @dataclass(init=True)
- class RuntimeEnvState(StateSchema):
- """Runtime Environment State"""
-
- #: The runtime environment spec.
- runtime_env: dict = state_column(filterable=True)
- #: Whether or not the runtime env creation has succeeded.
- success: bool = state_column(filterable=True)
- #: The latency of creating the runtime environment.
- #: Available if the runtime env is successfully created.
- creation_time_ms: Optional[float] = state_column(filterable=False)
- #: The node id of this runtime environment.
- node_id: str = state_column(filterable=True)
- #: The number of actors and tasks that use this runtime environment.
- ref_cnt: Optional[int] = state_column(detail=True, filterable=False)
- #: The error message if the runtime environment creation has failed.
- #: Available if the runtime env failed to be created.
- error: Optional[str] = state_column(detail=True, filterable=True) - - -AVAILABLE_STATES = [ - ActorState, - PlacementGroupState, - NodeState, - WorkerState, - JobState, - TaskState, - ObjectState, - RuntimeEnvState, -] - - -for state in AVAILABLE_STATES: - if len(state.filterable_columns()) > 0: - filterable_cols = "\n\n ".join(state.filterable_columns()) - state.__doc__ += f""" -\nBelow columns can be used for the `--filter` option. -\n - {filterable_cols} -\n -""" - - if len(state.detail_columns()) > 0: - detail_cols = "\n\n ".join(state.detail_columns()) - state.__doc__ += f""" -\nBelow columns are available only when `get` API is used, -\n`--detail` is specified through CLI, or `detail=True` is given to Python APIs. -\n -\n - {detail_cols} -\n -""" - - -@dataclass(init=True) -class ListApiResponse: - # NOTE(rickyyx): We currently perform hard truncation when querying - # resources which could have a large number (e.g. asking raylets for - # the number of all objects). - # The returned of resources seen by the user will go through from the - # below funnel: - # - total - # | With truncation at the data source if the number of returned - # | resource exceeds `RAY_MAX_LIMIT_FROM_DATA_SOURCE` - # v - # - num_after_truncation - # | With filtering at the state API server - # v - # - num_filtered - # | With limiting, - # | set by min(`RAY_MAX_LIMIT_FROM_API_SERER`, ) - # v - # - len(result) - - # Total number of the available resource from the cluster. - total: int - # Number of resources returned by data sources after truncation - num_after_truncation: int - # Number of resources after filtering - num_filtered: int - # Returned data. None if no data is returned. - result: List[Dict] - # List API can have a partial failure if queries to - # all sources fail. For example, getting object states - # require to ping all raylets, and it is possible some of - # them fails. 
Note that it is impossible to guarantee high - # availability of data because ray's state information is - # not replicated. - partial_failure_warning: Optional[str] = "" - # A list of warnings to print. - warnings: Optional[List[str]] = None - - -""" -Summary API schema -""" - -DRIVER_TASK_ID_PREFIX = "ffffffffffffffffffffffffffffffffffffffff" - - -@dataclass(init=True) -class TaskSummaryPerFuncOrClassName: - #: The function or class name of this task. - func_or_class_name: str - #: The type of the class. Equivalent to protobuf TaskType. - type: str - #: State name to the count dict. State name is equivalent to - #: the protobuf TaskStatus. - state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) - - -@dataclass -class Link: - #: The type of entity to link to - type: str - #: The id of the entity to link to - id: str - - -@dataclass(init=True) -class NestedTaskSummary: - #: The name of this task group - name: str - #: A unique identifier for this group - key: str - #: The type of the class. Equivalent to protobuf TaskType, - #: "ACTOR" if it represents an Actor, or "GROUP" if it's a grouping of tasks. - type: str - #: Unix timestamp to use to sort the task group. - timestamp: Optional[int] = None - #: State name to the count dict. State name is equivalent to - #: the protobuf TaskStatus. - state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) - #: The child - children: List["NestedTaskSummary"] = field(default_factory=list) - #: A link to more details about this summary. - link: Optional[Link] = None - - -@dataclass -class TaskSummaries: - #: Group key -> summary. - #: Right now, we only have func_class_name as a key. - # TODO(sang): Support the task group abstraction. - summary: Union[Dict[str, TaskSummaryPerFuncOrClassName], List[NestedTaskSummary]] - #: Total Ray tasks. - total_tasks: int - #: Total actor tasks. - total_actor_tasks: int - #: Total scheduled actors. 
- total_actor_scheduled: int - summary_by: str = "func_name" - - @classmethod - def to_summary_by_func_name(cls, *, tasks: List[Dict]) -> "TaskSummaries": - # NOTE: The argument tasks contains a list of dictionary - # that have the same k/v as TaskState. - summary = {} - total_tasks = 0 - total_actor_tasks = 0 - total_actor_scheduled = 0 - - for task in tasks: - key = task["func_or_class_name"] - if key not in summary: - summary[key] = TaskSummaryPerFuncOrClassName( - func_or_class_name=task["func_or_class_name"], - type=task["type"], - ) - task_summary = summary[key] - - state = task["state"] - if state not in task_summary.state_counts: - task_summary.state_counts[state] = 0 - task_summary.state_counts[state] += 1 - - type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number - if type_enum == TaskType.NORMAL_TASK: - total_tasks += 1 - elif type_enum == TaskType.ACTOR_CREATION_TASK: - total_actor_scheduled += 1 - elif type_enum == TaskType.ACTOR_TASK: - total_actor_tasks += 1 - - return TaskSummaries( - summary=summary, - total_tasks=total_tasks, - total_actor_tasks=total_actor_tasks, - total_actor_scheduled=total_actor_scheduled, - summary_by="func_name", - ) - - @classmethod - def to_summary_by_lineage( - cls, *, tasks: List[Dict], actors: List[Dict] - ) -> "TaskSummaries": - """ - This summarizes tasks by lineage. - i.e. A task will be grouped with another task if they have the - same parent. - - This does things in 4 steps. - Step 1: Iterate through all tasks and keep track of them by id and ownership - Step 2: Put the tasks in a tree structure based on ownership - Step 3: Merge together siblings in the tree if there are more - than one with the same name. - Step 4: Total the children - - This can probably be more efficient if we merge together some steps to - reduce the amount of iterations but this algorithm produces very easy to - understand code. We can optimize in the future. 
- """ - # NOTE: The argument tasks contains a list of dictionary - # that have the same k/v as TaskState. - - tasks_by_id = {} - task_group_by_id = {} - actor_creation_task_id_for_actor_id = {} - summary = [] - total_tasks = 0 - total_actor_tasks = 0 - total_actor_scheduled = 0 - - # Step 1 - # We cannot assume that a parent task always comes before the child task - # So we need to keep track of all tasks by ids so we can quickly find the - # parent. - # We also track the actor creation tasks so we can quickly figure out the - # ownership of actors. - for task in tasks: - tasks_by_id[task["task_id"]] = task - type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number - if type_enum == TaskType.ACTOR_CREATION_TASK: - actor_creation_task_id_for_actor_id[task["actor_id"]] = task["task_id"] - - actor_dict = {actor["actor_id"]: actor for actor in actors} - - def get_or_create_task_group(task_id: str) -> Optional[NestedTaskSummary]: - """ - Gets an already created task_group - OR - Creates a task group and puts it in the right place under its parent. - For actor tasks, the parent is the Actor that owns it. For all other - tasks, the owner is the driver or task that created it. - - Returns None if there is missing data about the task or one of its parents. - - For task groups that represents actors, the id is in the - format actor:{actor_id} - """ - if task_id in task_group_by_id: - return task_group_by_id[task_id] - - task = tasks_by_id.get(task_id) - if not task: - logger.debug(f"We're missing data about {task_id}") - # We're missing data about this parent. So we're dropping the whole - # tree at that node. - return None - - # Use name first which allows users to customize the name of - # their remote function call using the name option. 
- func_name = task["name"] or task["func_or_class_name"] - task_id = task["task_id"] - type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number - - task_group_by_id[task_id] = NestedTaskSummary( - name=func_name, - key=task_id, - type=task["type"], - timestamp=task["creation_time_ms"], - link=Link(type="task", id=task_id), - ) - - # Set summary in right place under parent - if ( - type_enum == TaskType.ACTOR_TASK - or type_enum == TaskType.ACTOR_CREATION_TASK - ): - # For actor tasks, the parent is the actor and not the parent task. - parent_task_group = get_or_create_actor_task_group(task["actor_id"]) - if parent_task_group: - parent_task_group.children.append(task_group_by_id[task_id]) - else: - parent_task_id = task["parent_task_id"] - if not parent_task_id or parent_task_id.startswith( - DRIVER_TASK_ID_PREFIX - ): - summary.append(task_group_by_id[task_id]) - else: - parent_task_group = get_or_create_task_group(parent_task_id) - if parent_task_group: - parent_task_group.children.append(task_group_by_id[task_id]) - - return task_group_by_id[task_id] - - def get_or_create_actor_task_group( - actor_id: str, - ) -> Optional[NestedTaskSummary]: - """ - Gets an existing task group that represents an actor. - OR - Creates a task group that represents an actor. The owner of the actor is - the parent of the creation_task that created that actor. - - Returns None if there is missing data about the actor or one of its parents. - """ - key = f"actor:{actor_id}" - actor = actor_dict.get(actor_id) - if key not in task_group_by_id: - creation_task_id = actor_creation_task_id_for_actor_id.get(actor_id) - creation_task = tasks_by_id.get(creation_task_id) - - if not creation_task: - logger.debug(f"We're missing data about actor {actor_id}") - # We're missing data about the parent. So we're dropping the whole - # tree at that node. - return None - - # TODO(rickyx) - # We are using repr name for grouping actors if exists, - # else use class name. 
We should be using some group_name in the future. - if actor is None: - logger.debug( - f"We are missing actor info for actor {actor_id}, " - f"even though creation task exists: {creation_task}" - ) - [actor_name, *rest] = creation_task["func_or_class_name"].split(".") - else: - actor_name = ( - actor["repr_name"] - if actor["repr_name"] - else actor["class_name"] - ) - - task_group_by_id[key] = NestedTaskSummary( - name=actor_name, - key=key, - type="ACTOR", - timestamp=task["creation_time_ms"], - link=Link(type="actor", id=actor_id), - ) - - parent_task_id = creation_task["parent_task_id"] - if not parent_task_id or parent_task_id.startswith( - DRIVER_TASK_ID_PREFIX - ): - summary.append(task_group_by_id[key]) - else: - parent_task_group = get_or_create_task_group(parent_task_id) - if parent_task_group: - parent_task_group.children.append(task_group_by_id[key]) - - return task_group_by_id[key] - - # Step 2: Create the tree structure based on ownership - for task in tasks: - task_id = task["task_id"] - - task_group = get_or_create_task_group(task_id) - - if not task_group: - # We are probably missing data about this task or one of its parents. - continue - - state = task["state"] - if state not in task_group.state_counts: - task_group.state_counts[state] = 0 - task_group.state_counts[state] += 1 - - type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number - if type_enum == TaskType.NORMAL_TASK: - total_tasks += 1 - elif type_enum == TaskType.ACTOR_CREATION_TASK: - total_actor_scheduled += 1 - elif type_enum == TaskType.ACTOR_TASK: - total_actor_tasks += 1 - - def merge_sibings_for_task_group( - siblings: List[NestedTaskSummary], - ) -> Tuple[List[NestedTaskSummary], Optional[int]]: - """ - Merges task summaries with the same name into a group if there are more than - one child with that name. 
- - Args: - siblings: A list of NestedTaskSummary's to merge together - - Returns - Index 0: A list of NestedTaskSummary's which have been merged - Index 1: The smallest timestamp amongst the siblings - """ - if not len(siblings): - return siblings, None - - # Group by name - groups = {} - min_timestamp = None - - for child in siblings: - child.children, child_min_timestamp = merge_sibings_for_task_group( - child.children - ) - if child_min_timestamp and child_min_timestamp < ( - child.timestamp or sys.maxsize - ): - child.timestamp = child_min_timestamp - - if child.name not in groups: - groups[child.name] = NestedTaskSummary( - name=child.name, - key=child.name, - type="GROUP", - ) - groups[child.name].children.append(child) - if child.timestamp and child.timestamp < ( - groups[child.name].timestamp or sys.maxsize - ): - groups[child.name].timestamp = child.timestamp - if child.timestamp < (min_timestamp or sys.maxsize): - min_timestamp = child.timestamp - - # Take the groups that have more than one children and return it. - # For groups with just one child, return the child itself instead of - # creating a group. - return [ - group if len(group.children) > 1 else group.children[0] - for group in groups.values() - ], min_timestamp - - # Step 3 - summary, _ = merge_sibings_for_task_group(summary) - - def sort_task_groups(task_groups: List[NestedTaskSummary]) -> None: - # Sort by timestamp - # Put actor creation tasks above other tasks with the same timestamp - task_groups.sort(key=lambda x: 0 if x.type == "ACTOR_CREATION_TASK" else 1) - task_groups.sort(key=lambda x: x.timestamp or sys.maxsize) - - def calc_total_for_task_group( - task_group: NestedTaskSummary, - ) -> NestedTaskSummary: - """ - Calculates the total of a group as the sum of all children. 
- Sorts children by timestamp - """ - if not len(task_group.children): - return task_group - - for child in task_group.children: - totaled = calc_total_for_task_group(child) - - for state, count in totaled.state_counts.items(): - task_group.state_counts[state] = ( - task_group.state_counts.get(state, 0) + count - ) - - sort_task_groups(task_group.children) - - return task_group - - # Step 4 - summary = [calc_total_for_task_group(task_group) for task_group in summary] - sort_task_groups(summary) - - return TaskSummaries( - summary=summary, - total_tasks=total_tasks, - total_actor_tasks=total_actor_tasks, - total_actor_scheduled=total_actor_scheduled, - summary_by="lineage", - ) - - -@dataclass(init=True) -class ActorSummaryPerClass: - #: The class name of the actor. - class_name: str - #: State name to the count dict. State name is equivalent to - #: the protobuf ActorState. - state_counts: Dict[TypeActorStatus, int] = field(default_factory=dict) - - -@dataclass -class ActorSummaries: - #: Group key (actor class name) -> summary - summary: Dict[str, ActorSummaryPerClass] - #: Total number of actors - total_actors: int - summary_by: str = "class" - - @classmethod - def to_summary(cls, *, actors: List[Dict]): - # NOTE: The argument tasks contains a list of dictionary - # that have the same k/v as ActorState. - summary = {} - total_actors = 0 - - for actor in actors: - key = actor["class_name"] - if key not in summary: - summary[key] = ActorSummaryPerClass( - class_name=actor["class_name"], - ) - actor_summary = summary[key] - - state = actor["state"] - if state not in actor_summary.state_counts: - actor_summary.state_counts[state] = 0 - actor_summary.state_counts[state] += 1 - - total_actors += 1 - - return ActorSummaries( - summary=summary, - total_actors=total_actors, - ) - - -@dataclass(init=True) -class ObjectSummaryPerKey: - #: Total number of objects of the type. - total_objects: int - #: Total size in mb. 
- total_size_mb: float - #: Total number of workers that reference the type of objects. - total_num_workers: int - #: Total number of nodes that reference the type of objects. - total_num_nodes: int - #: State name to the count dict. State name is equivalent to - #: ObjectState. - task_state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) - #: Ref count type to the count dict. State name is equivalent to - #: ObjectState. - ref_type_counts: Dict[TypeReferenceType, int] = field(default_factory=dict) - - -@dataclass -class ObjectSummaries: - #: Group key (actor class name) -> summary - summary: Dict[str, ObjectSummaryPerKey] - #: Total number of referenced objects in the cluster. - total_objects: int - #: Total size of referenced objects in the cluster in MB. - total_size_mb: float - #: Whether or not the callsite collection is enabled. - callsite_enabled: bool - summary_by: str = "callsite" - - @classmethod - def to_summary(cls, *, objects: List[Dict]): - # NOTE: The argument tasks contains a list of dictionary - # that have the same k/v as ObjectState. 
- summary = {} - total_objects = 0 - total_size_mb = 0 - key_to_workers = {} - key_to_nodes = {} - callsite_enabled = True - - for object in objects: - key = object["call_site"] - if key == "disabled": - callsite_enabled = False - if key not in summary: - summary[key] = ObjectSummaryPerKey( - total_objects=0, - total_size_mb=0, - total_num_workers=0, - total_num_nodes=0, - ) - key_to_workers[key] = set() - key_to_nodes[key] = set() - - object_summary = summary[key] - - task_state = object["task_status"] - if task_state not in object_summary.task_state_counts: - object_summary.task_state_counts[task_state] = 0 - object_summary.task_state_counts[task_state] += 1 - - ref_type = object["reference_type"] - if ref_type not in object_summary.ref_type_counts: - object_summary.ref_type_counts[ref_type] = 0 - object_summary.ref_type_counts[ref_type] += 1 - object_summary.total_objects += 1 - total_objects += 1 - - size_bytes = object["object_size"] - # object_size's unit is byte by default. It is -1, if the size is - # unknown. - if size_bytes != -1: - object_summary.total_size_mb += size_bytes / 1024**2 - total_size_mb += size_bytes / 1024**2 - - key_to_workers[key].add(object["pid"]) - key_to_nodes[key].add(object["ip"]) - - # Convert set of pid & node ips to length. - for key, workers in key_to_workers.items(): - summary[key].total_num_workers = len(workers) - for key, nodes in key_to_nodes.items(): - summary[key].total_num_nodes = len(nodes) - - return ObjectSummaries( - summary=summary, - total_objects=total_objects, - total_size_mb=total_size_mb, - callsite_enabled=callsite_enabled, - ) - - -@dataclass(init=True) -class StateSummary: - #: Node ID -> summary per node - #: If the data is not required to be orgnized per node, it will contain - #: a single key, "cluster". 
- node_id_to_summary: Dict[str, Union[TaskSummaries, ActorSummaries, ObjectSummaries]] - - -@dataclass(init=True) -class SummaryApiResponse: - # Carried over from ListApiResponse - # We currently use list API for listing the resources - total: int - # Carried over from ListApiResponse - # Number of resources returned by data sources after truncation - num_after_truncation: int - # Number of resources after filtering - num_filtered: int - result: StateSummary = None - partial_failure_warning: Optional[str] = "" - # A list of warnings to print. - warnings: Optional[List[str]] = None - - -def resource_to_schema(resource: StateResource) -> StateSchema: - if resource == StateResource.ACTORS: - return ActorState - elif resource == StateResource.JOBS: - return JobState - elif resource == StateResource.NODES: - return NodeState - elif resource == StateResource.OBJECTS: - return ObjectState - elif resource == StateResource.PLACEMENT_GROUPS: - return PlacementGroupState - elif resource == StateResource.RUNTIME_ENVS: - return RuntimeEnvState - elif resource == StateResource.TASKS: - return TaskState - elif resource == StateResource.WORKERS: - return WorkerState - elif resource == StateResource.CLUSTER_EVENTS: - return ClusterEventState - else: - assert False, "Unreachable" - - -def protobuf_message_to_dict( - message, - fields_to_decode: List[str], - preserving_proto_field_name: bool = True, -) -> dict: - """Convert a protobuf message to dict - - Args: - fields_to_decode: field names which will be decoded from binary to hex. - preserving_proto_field_name: a pass-through option for protobuf message - method. See google.protobuf MessageToDict - - Return: - Dictionary of the converted rpc protobuf. 
- """ - return dashboard_utils.message_to_dict( - message, - fields_to_decode, - including_default_value_fields=True, - preserving_proto_field_name=preserving_proto_field_name, - ) - - -def protobuf_to_task_state_dict(message: TaskEvents) -> dict: - """ - Convert a TaskEvents to a dic repr of `TaskState` - """ - task_attempt = protobuf_message_to_dict( - message=message, - fields_to_decode=[ - "task_id", - "job_id", - "node_id", - "actor_id", - "parent_task_id", - "worker_id", - "placement_group_id", - "component_id", - ], - ) - - task_state = {} - task_info = task_attempt.get("task_info", {}) - state_updates = task_attempt.get("state_updates", {}) - profiling_data = task_attempt.get("profile_events", {}) - if profiling_data: - for event in profiling_data["events"]: - # End/start times are recorded in ns. We convert them to ms. - event["end_time"] = int(event["end_time"]) / 1e6 - event["start_time"] = int(event["start_time"]) / 1e6 - event["extra_data"] = json.loads(event["extra_data"]) - task_state["profiling_data"] = profiling_data - - # Convert those settable fields - mappings = [ - ( - task_info, - [ - "task_id", - "name", - "actor_id", - "type", - "func_or_class_name", - "language", - "required_resources", - "runtime_env_info", - "parent_task_id", - "placement_group_id", - ], - ), - (task_attempt, ["task_id", "attempt_number", "job_id"]), - ( - state_updates, - ["node_id", "worker_id", "task_log_info"], - ), - ] - for src, keys in mappings: - for key in keys: - task_state[key] = src.get(key) - - task_state["creation_time_ms"] = None - task_state["start_time_ms"] = None - task_state["end_time_ms"] = None - events = [] - - for state in TaskStatus.keys(): - key = f"{state.lower()}_ts" - if key in state_updates: - # timestamp is recorded as nanosecond from the backend. - # We need to convert it to the second. 
- ts_ms = int(state_updates[key]) // 1e6 - events.append( - { - "state": state, - "created_ms": ts_ms, - } - ) - if state == "PENDING_ARGS_AVAIL": - task_state["creation_time_ms"] = ts_ms - if state == "RUNNING": - task_state["start_time_ms"] = ts_ms - if state == "FINISHED" or state == "FAILED": - task_state["end_time_ms"] = ts_ms - - task_state["events"] = events - if len(events) > 0: - latest_state = events[-1]["state"] - else: - latest_state = "NIL" - task_state["state"] = latest_state - - # Parse error info - if latest_state == "FAILED": - error_info = state_updates.get("error_info", None) - if error_info: - # We captured colored error message printed to console, e.g. - # "\x1b[31mTraceback (most recent call last):\x1b[0m", - # this is to remove the ANSI escape codes. - task_state["error_message"] = remove_ansi_escape_codes( - error_info.get("error_message", "") - ) - task_state["error_type"] = error_info.get("error_type", "") - - return task_state - - -def remove_ansi_escape_codes(text: str) -> str: - """Remove ANSI escape codes from a string.""" - import re - - return re.sub(r"\x1b[^m]*m", "", text) - - -def dict_to_state(d: Dict, state_schema: StateSchema) -> StateSchema: - """Convert a dict to a state schema. - - Args: - d: a dict to convert. - state_schema: a schema to convert to. - - Returns: - A state schema. 
- """ - try: - return resource_to_schema(state_schema)(**d) - except Exception as e: - raise RayStateApiException(f"Failed to convert {d} to StateSchema: {e}") from e +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/custom_types.py b/python/ray/experimental/state/custom_types.py index 5f3535a27446..f5576beaeaa1 100644 --- a/python/ray/experimental/state/custom_types.py +++ b/python/ray/experimental/state/custom_types.py @@ -1,100 +1,4 @@ -import sys +from ray.util.state.custom_types import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import -from ray.core.generated.common_pb2 import ( - TaskStatus, - TaskType, - WorkerExitType, - WorkerType, -) -from ray.core.generated.gcs_pb2 import ( - ActorTableData, - GcsNodeInfo, - PlacementGroupTableData, -) -from ray.dashboard.memory_utils import ReferenceType - -if sys.version_info >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal - - -ACTOR_STATUS = [ - "DEPENDENCIES_UNREADY", - "PENDING_CREATION", - "ALIVE", - "RESTARTING", - "DEAD", -] -TypeActorStatus = Literal[tuple(ACTOR_STATUS)] -PLACEMENT_GROUP_STATUS = [ - "PENDING", - "CREATED", - "REMOVED", - "RESCHEDULING", -] -TypePlacementGroupStatus = Literal[tuple(PLACEMENT_GROUP_STATUS)] -TASK_STATUS = [ - "NIL", - "PENDING_ARGS_AVAIL", - "PENDING_NODE_ASSIGNMENT", - "PENDING_OBJ_STORE_MEM_AVAIL", - "PENDING_ARGS_FETCH", - "SUBMITTED_TO_WORKER", - "RUNNING", - "RUNNING_IN_RAY_GET", - "RUNNING_IN_RAY_WAIT", - "FINISHED", - "FAILED", -] -TypeTaskStatus = Literal[tuple(TASK_STATUS)] -NODE_STATUS = ["ALIVE", "DEAD"] -TypeNodeStatus = Literal[tuple(NODE_STATUS)] -WORKER_TYPE = [ - "WORKER", - "DRIVER", - "SPILL_WORKER", - "RESTORE_WORKER", -] -TypeWorkerType = Literal[tuple(WORKER_TYPE)] -WORKER_EXIT_TYPE = [ - "SYSTEM_ERROR", - "INTENDED_SYSTEM_EXIT", - "USER_ERROR", - "INTENDED_USER_EXIT", - "NODE_OUT_OF_MEMORY", -] -TypeWorkerExitType = Literal[tuple(WORKER_EXIT_TYPE)] 
-TASK_TYPE = [ - "NORMAL_TASK", - "ACTOR_CREATION_TASK", - "ACTOR_TASK", - "DRIVER_TASK", -] -TypeTaskType = Literal[tuple(TASK_TYPE)] -TypeReferenceType = Literal[ - tuple(reference_type.value for reference_type in ReferenceType) -] - - -def validate_protobuf_enum(grpc_enum, custom_enum): - """Validate the literal contains the correct enum values from protobuf""" - enum_vals = set(grpc_enum.DESCRIPTOR.values_by_name) - # Sometimes, the grpc enum is mocked, and it - # doesn't include any values in that case. - if len(enum_vals) > 0: - assert enum_vals == set(custom_enum) - - -# Do the enum validation here. -# It is necessary to avoid regression. Alternatively, we can auto generate this -# directly by protobuf. -validate_protobuf_enum(ActorTableData.ActorState, ACTOR_STATUS) -validate_protobuf_enum( - PlacementGroupTableData.PlacementGroupState, PLACEMENT_GROUP_STATUS -) -validate_protobuf_enum(TaskStatus, TASK_STATUS) -validate_protobuf_enum(GcsNodeInfo.GcsNodeState, NODE_STATUS) -validate_protobuf_enum(WorkerType, WORKER_TYPE) -validate_protobuf_enum(WorkerExitType, WORKER_EXIT_TYPE) -validate_protobuf_enum(TaskType, TASK_TYPE) +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/exception.py b/python/ray/experimental/state/exception.py index 43156d28b5de..49e7099cf325 100644 --- a/python/ray/experimental/state/exception.py +++ b/python/ray/experimental/state/exception.py @@ -1,31 +1,4 @@ -"""Internal Error""" +from ray.util.state.exception import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import - -STATE_OBS_ALPHA_FEEDBACK_MSG = [ - "\n==========ALPHA, FEEDBACK NEEDED ===============", - "State Observability APIs is currently in Alpha. ", - "If you have any feedback, you could do so at either way as below:", - " 1. Report bugs/issues with details: https://forms.gle/gh77mwjEskjhN8G46", - " 2. 
Follow up in #ray-state-observability-dogfooding slack channel of Ray: " - "https://tinyurl.com/2pm26m4a", - "==========================================================", -] - - -class DataSourceUnavailable(Exception): - pass - - -"""User-facing Error""" - - -class RayStateApiException(Exception): - def __init__(self, err_msg, *args): - err_msg += "\n".join(STATE_OBS_ALPHA_FEEDBACK_MSG) - super().__init__(err_msg, *args) - - -class ServerUnavailable(RayStateApiException): - """Thrown when failing to connect to dashboard server""" - - pass +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/state_cli.py b/python/ray/experimental/state/state_cli.py index da88e6d7d02e..58bc5d31ebdc 100644 --- a/python/ray/experimental/state/state_cli.py +++ b/python/ray/experimental/state/state_cli.py @@ -1,1163 +1,4 @@ -import dataclasses -import json -import logging -from datetime import datetime -from enum import Enum, unique -from typing import Dict, List, Optional, Tuple +from ray.util.state.state_cli import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import -import click -import yaml - -import ray._private.services as services -from ray._private.thirdparty.tabulate.tabulate import tabulate -from ray.experimental.state.api import ( - StateApiClient, - get_log, - list_logs, - summarize_actors, - summarize_objects, - summarize_tasks, -) -from ray.experimental.state.common import ( - DEFAULT_LIMIT, - DEFAULT_LOG_LIMIT, - DEFAULT_RPC_TIMEOUT, - GetApiOptions, - ListApiOptions, - PredicateType, - StateResource, - StateSchema, - SupportedFilterType, - resource_to_schema, -) -from ray.experimental.state.exception import RayStateApiException -from ray.util.annotations import PublicAPI - -logger = logging.getLogger(__name__) - - -@unique -class AvailableFormat(Enum): - DEFAULT = "default" - JSON = "json" - YAML = "yaml" - TABLE = "table" - - -def _parse_filter(filter: str) -> Tuple[str, PredicateType, SupportedFilterType]: 
- """Parse the filter string to a tuple of key, preciate, and value.""" - # The function assumes there's going to be no key that includes "="" or "!=". - # Since key is controlled by us, it should be trivial to keep the invariant. - predicate = None - # Tuple of [predicate_start, predicate_end). - predicate_index = None - - # Find the first predicate match. This logic works because we assume the - # key doesn't contain = or !=. - for i in range(len(filter)): - char = filter[i] - if char == "=": - predicate = "=" - predicate_index = (i, i + 1) - break - elif char == "!": - if len(filter) <= i + 1: - continue - - next_char = filter[i + 1] - if next_char == "=": - predicate = "!=" - predicate_index = (i, i + 2) - break - - if not predicate or not predicate_index: - raise ValueError( - f"The format of a given filter {filter} is invalid: " - "Cannot find the predicate. " - "Please provide key=val or key!=val format string." - ) - - key, predicate, value = ( - filter[: predicate_index[0]], - filter[predicate_index[0] : predicate_index[1]], - filter[predicate_index[1] :], - ) - - assert predicate == "=" or predicate == "!=" - if len(key) == 0 or len(value) == 0: - raise ValueError( - f"The format of a given filter {filter} is invalid: " - f"Cannot identify key {key} or value, {value}. " - "Please provide key=val or key!=val format string." - ) - - return (key, predicate, value) - - -def _get_available_formats() -> List[str]: - """Return the available formats in a list of string""" - return [format_enum.value for format_enum in AvailableFormat] - - -def _get_available_resources( - excluded: Optional[List[StateResource]] = None, -) -> List[str]: - """Return the available resources in a list of string - - Args: - excluded: List of resources that should be excluded - """ - # All resource names use '_' rather than '-'. 
But users options have '-' - return [ - e.value.replace("_", "-") - for e in StateResource - if excluded is None or e not in excluded - ] - - -def get_table_output(state_data: List, schema: StateSchema, detail: bool) -> str: - """Display the table output. - - The table headers are ordered as the order defined in the dataclass of - `StateSchema`. For example, - - @dataclass - class A(StateSchema): - a: str - b: str - c: str - - will create headers - A B C - ----- - - Args: - state_data: A list of state data. - schema: The schema for the corresponding resource. - - Returns: - The table formatted string. - """ - time = datetime.now() - header = "=" * 8 + f" List: {time} " + "=" * 8 - headers = [] - table = [] - cols = schema.list_columns(detail=detail) - for data in state_data: - for key, val in data.items(): - if isinstance(val, dict): - data[key] = yaml.dump(val, indent=2) - keys = set(data.keys()) - headers = [] - for col in cols: - if col in keys: - headers.append(col.upper()) - table.append([data[header.lower()] for header in headers]) - return f""" -{header} -Stats: ------------------------------- -Total: {len(state_data)} - -Table: ------------------------------- -{tabulate(table, headers=headers, showindex=True, tablefmt="plain", floatfmt=".3f")} -""" - - -def output_with_format( - state_data: List[Dict], - *, - schema: Optional[StateSchema], - format: AvailableFormat = AvailableFormat.DEFAULT, - detail: bool = False, -) -> str: - if format == AvailableFormat.DEFAULT: - return get_table_output(state_data, schema, detail) - if format == AvailableFormat.YAML: - return yaml.dump( - state_data, - indent=4, - explicit_start=True, - # We want to keep the defined ordering of the states, thus sort_keys=False - sort_keys=False, - ) - elif format == AvailableFormat.JSON: - return json.dumps(state_data) - elif format == AvailableFormat.TABLE: - return get_table_output(state_data, schema, detail) - else: - raise ValueError( - f"Unexpected format: {format}. 
" - f"Supported formatting: {_get_available_formats()}" - ) - - -def format_summary_output(state_data: Dict, *, resource: StateResource) -> str: - if len(state_data) == 0: - return "No resource in the cluster" - - # Parse the data. - cluster_data = state_data["cluster"] - summaries = cluster_data["summary"] - summary_by = cluster_data["summary_by"] - del cluster_data["summary_by"] - del cluster_data["summary"] - - cluster_info_table = yaml.dump(cluster_data, indent=2) - - # Create a table. - table = [] - headers = [] - for summary in summaries.values(): - # Convert dict to yaml for better formatting. - for key, val in summary.items(): - if isinstance(val, dict): - summary[key] = yaml.dump(val, indent=2) - - headers = sorted([key.upper() for key in summary.keys()]) - table.append([summary[header.lower()] for header in headers]) - - summary_table = tabulate( - table, headers=headers, showindex=True, tablefmt="plain", numalign="left" - ) - - time = datetime.now() - header = "=" * 8 + f" {resource.value.capitalize()} Summary: {time} " + "=" * 8 - return f""" -{header} -Stats: ------------------------------------- -{cluster_info_table} - -Table (group by {summary_by}): ------------------------------------- -{summary_table} -""" - - -def format_object_summary_output(state_data: Dict) -> str: - if len(state_data) == 0: - return "No resource in the cluster" - - # Parse the data. - cluster_data = state_data["cluster"] - summaries = cluster_data["summary"] - summary_by = cluster_data["summary_by"] - del cluster_data["summary_by"] - del cluster_data["summary"] - - cluster_info_table = yaml.dump(cluster_data, indent=2) - - # Create a table per callsite. - tables = [] - for callsite, summary in summaries.items(): - # Convert dict to yaml for better formatting. 
- for key, val in summary.items(): - if isinstance(val, dict): - summary[key] = yaml.dump(val, indent=2) - - table = [] - headers = sorted([key.upper() for key in summary.keys()]) - table.append([summary[header.lower()] for header in headers]) - table_for_callsite = tabulate( - table, headers=headers, showindex=True, numalign="left" - ) - - # Format callsite. | is a separator for ray callsite. - formatted_callsite = callsite.replace("|", "\n|") - tables.append(f"{formatted_callsite}\n{table_for_callsite}") - - time = datetime.now() - header = "=" * 8 + f" Object Summary: {time} " + "=" * 8 - table_string = "\n\n\n\n".join(tables) - return f""" -{header} -Stats: ------------------------------------- -{cluster_info_table} - -Table (group by {summary_by}) ------------------------------------- -{table_string} -""" - - -def format_get_api_output( - state_data: Optional[StateSchema], - id: str, - *, - schema: StateSchema, - format: AvailableFormat = AvailableFormat.YAML, -) -> str: - if not state_data or isinstance(state_data, list) and len(state_data) == 0: - return f"Resource with id={id} not found in the cluster." - if not isinstance(state_data, list): - state_data = [state_data] - state_data = [dataclasses.asdict(state) for state in state_data] - return output_with_format(state_data, schema=schema, format=format, detail=True) - - -def format_list_api_output( - state_data: List[StateSchema], - *, - schema: StateSchema, - format: AvailableFormat = AvailableFormat.DEFAULT, - detail: bool = False, -) -> str: - if len(state_data) == 0: - return "No resource in the cluster" - state_data = [dataclasses.asdict(state) for state in state_data] - return output_with_format(state_data, schema=schema, format=format, detail=detail) - - -def _should_explain(format: AvailableFormat) -> bool: - # If the format is json or yaml, it should not print stats because - # users don't want additional strings. 
- return format == AvailableFormat.DEFAULT or format == AvailableFormat.TABLE - - -""" -Common Options for State API commands -""" -timeout_option = click.option( - "--timeout", - default=DEFAULT_RPC_TIMEOUT, - help=f"Timeout in seconds for the API requests. Default is {DEFAULT_RPC_TIMEOUT}", -) -address_option = click.option( - "--address", - default=None, - help=( - "The address of Ray API server. If not provided, it will be configured " - "automatically from querying the GCS server." - ), -) - - -@click.command() -@click.argument( - "resource", - # NOTE(rickyyx): We are not allowing query job with id, and runtime envs - type=click.Choice( - _get_available_resources( - excluded=[StateResource.JOBS, StateResource.RUNTIME_ENVS] - ) - ), -) -@click.argument( - "id", - type=str, -) -@address_option -@timeout_option -@PublicAPI(stability="alpha") -def ray_get( - resource: str, - id: str, - address: Optional[str], - timeout: float, -): - """Get a state of a given resource by ID. - - We currently DO NOT support get by id for jobs and runtime-envs - - The output schema is defined at :ref:`State API Schema section. ` - - For example, the output schema of `ray get tasks ` is - :class:`~ray.experimental.state.common.TaskState`. - - Usage: - - Get an actor with actor id - - ``` - ray get actors - ``` - - Get a placement group information with - - ``` - ray get placement-groups - ``` - - The API queries one or more components from the cluster to obtain the data. - The returned state snapshot could be stale, and it is not guaranteed to return - the live data. - - Args: - resource: The type of the resource to query. - id: The id of the resource. - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - """ # noqa: E501 - # All resource names use '_' rather than '-'. 
But users options have '-' - resource = StateResource(resource.replace("-", "_")) - - # Create the State API server and put it into context - logger.debug(f"Create StateApiClient to ray instance at: {address}...") - client = StateApiClient(address=address) - options = GetApiOptions(timeout=timeout) - - # If errors occur, exceptions will be thrown. - try: - data = client.get( - resource=resource, - id=id, - options=options, - _explain=_should_explain(AvailableFormat.YAML), - ) - except RayStateApiException as e: - raise click.UsageError(str(e)) - - # Print data to console. - print( - format_get_api_output( - state_data=data, - id=id, - schema=resource_to_schema(resource), - format=AvailableFormat.YAML, - ) - ) - - -@click.command() -@click.argument( - "resource", - type=click.Choice(_get_available_resources()), -) -@click.option( - "--format", default="default", type=click.Choice(_get_available_formats()) -) -@click.option( - "-f", - "--filter", - help=( - "A key, predicate, and value to filter the result. " - "E.g., --filter 'key=value' or --filter 'key!=value'. " - "You can specify multiple --filter options. In this case all predicates " - "are concatenated as AND. For example, --filter key=value --filter key2=value " - "means (key==val) AND (key2==val2)" - ), - multiple=True, -) -@click.option( - "--limit", - default=DEFAULT_LIMIT, - type=int, - help=("Maximum number of entries to return. 100 by default."), -) -@click.option( - "--detail", - help=( - "If the flag is set, the output will contain data in more details. " - "Note that the API could query more sources " - "to obtain information in a greater detail." - ), - is_flag=True, - default=False, -) -@timeout_option -@address_option -@PublicAPI(stability="alpha") -def ray_list( - resource: str, - format: str, - filter: List[str], - limit: int, - detail: bool, - timeout: float, - address: str, -): - """List all states of a given resource. - - Normally, summary APIs are recommended before listing all resources. 
- - The output schema is defined at :ref:`State API Schema section. ` - - For example, the output schema of `ray list tasks` is - :class:`~ray.experimental.state.common.TaskState`. - - Usage: - - List all actor information from the cluster. - - ``` - ray list actors - ``` - - List 50 actors from the cluster. The sorting order cannot be controlled. - - ``` - ray list actors --limit 50 - ``` - - List 10 actors with state PENDING. - - ``` - ray list actors --limit 10 --filter "state=PENDING" - ``` - - List actors with yaml format. - - ``` - ray list actors --format yaml - ``` - - List actors with details. When --detail is specified, it might query - more data sources to obtain data in details. - - ``` - ray list actors --detail - ``` - - The API queries one or more components from the cluster to obtain the data. - The returned state snapshot could be stale, and it is not guaranteed to return - the live data. - - The API can return partial or missing output upon the following scenarios. - - - When the API queries more than 1 component, if some of them fail, - the API will return the partial result (with a suppressible warning). - - When the API returns too many entries, the API - will truncate the output. Currently, truncated data cannot be - selected by users. - - Args: - resource: The type of the resource to query. - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - """ # noqa: E501 - # All resource names use '_' rather than '-'. But users options have '-' - resource = StateResource(resource.replace("-", "_")) - format = AvailableFormat(format) - - # Create the State API server and put it into context - client = StateApiClient(address=address) - - filter = [_parse_filter(f) for f in filter] - - options = ListApiOptions( - limit=limit, - timeout=timeout, - filters=filter, - detail=detail, - ) - - # If errors occur, exceptions will be thrown. Empty data indicate successful query. 
- try: - data = client.list( - resource, - options=options, - raise_on_missing_output=False, - _explain=_should_explain(format), - ) - except RayStateApiException as e: - raise click.UsageError(str(e)) - - # If --detail is given, the default formatting is yaml. - if detail and format == AvailableFormat.DEFAULT: - format = AvailableFormat.YAML - - # Print data to console. - print( - format_list_api_output( - state_data=data, - schema=resource_to_schema(resource), - format=format, - detail=detail, - ) - ) - - -@click.group("summary") -@click.pass_context -@PublicAPI(stability="alpha") -def summary_state_cli_group(ctx): - """Return the summarized information of a given resource.""" - pass - - -@summary_state_cli_group.command(name="tasks") -@timeout_option -@address_option -@click.pass_context -@PublicAPI(stability="alpha") -def task_summary(ctx, timeout: float, address: str): - """Summarize the task state of the cluster. - - By default, the output contains the information grouped by - task function names. - - The output schema is - :class:`~ray.experimental.state.common.TaskSummaries`. - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - """ # noqa: E501 - print( - format_summary_output( - summarize_tasks( - address=address, - timeout=timeout, - raise_on_missing_output=False, - _explain=True, - ), - resource=StateResource.TASKS, - ) - ) - - -@summary_state_cli_group.command(name="actors") -@timeout_option -@address_option -@click.pass_context -@PublicAPI(stability="alpha") -def actor_summary(ctx, timeout: float, address: str): - """Summarize the actor state of the cluster. - - By default, the output contains the information grouped by - actor class names. - - The output schema is - :class:`ray.experimental.state.common.ActorSummaries - `. - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. 
- """ # noqa: E501 - print( - format_summary_output( - summarize_actors( - address=address, - timeout=timeout, - raise_on_missing_output=False, - _explain=True, - ), - resource=StateResource.ACTORS, - ) - ) - - -@summary_state_cli_group.command(name="objects") -@timeout_option -@address_option -@click.pass_context -@PublicAPI(stability="alpha") -def object_summary(ctx, timeout: float, address: str): - """Summarize the object state of the cluster. - - The API is recommended when debugging memory leaks. - See :ref:`Debugging with Ray Memory ` for more details. - (Note that this command is almost equivalent to `ray memory`, but it returns - easier-to-understand output). - - By default, the output contains the information grouped by - object callsite. Note that the callsite is not collected and - all data will be aggregated as "disable" callsite if the env var - `RAY_record_ref_creation_sites` is not configured. To enable the - callsite collection, set the following environment variable when - starting Ray. - - Example: - - ``` - RAY_record_ref_creation_sites=1 ray start --head - ``` - - ``` - RAY_record_ref_creation_sites=1 ray_script.py - ``` - - The output schema is - :class:`ray.experimental.state.common.ObjectSummaries - `. - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - """ # noqa: E501 - print( - format_object_summary_output( - summarize_objects( - address=address, - timeout=timeout, - raise_on_missing_output=False, - _explain=True, - ), - ) - ) - - -log_follow_option = click.option( - "--follow", - "-f", - required=False, - type=bool, - is_flag=True, - help="Streams the log file as it is updated instead of just tailing.", -) - -log_tail_option = click.option( - "--tail", - required=False, - type=int, - default=DEFAULT_LOG_LIMIT, - help="Number of lines to tail from log. 
Use -1 to fetch the whole file.", -) - -log_interval_option = click.option( - "--interval", - required=False, - type=float, - default=None, - help="The interval in secs to print new logs when `--follow` is specified.", - hidden=True, -) - -log_timeout_option = click.option( - "--timeout", - default=DEFAULT_RPC_TIMEOUT, - help=( - "Timeout in seconds for the API requests. " - f"Default is {DEFAULT_RPC_TIMEOUT}. If --follow is specified, " - "this option will be ignored." - ), -) - -log_node_ip_option = click.option( - "-ip", - "--node-ip", - required=False, - type=str, - default=None, - help="Filters the logs by this ip address", -) - -log_node_id_option = click.option( - "--node-id", - "-id", - required=False, - type=str, - default=None, - help="Filters the logs by this NodeID", -) - -log_suffix_option = click.option( - "--err", - is_flag=True, - default=False, - help=( - "If supplied, querying stderr files for workers/actors, " - "else defaults to stdout files." - ), -) - -log_encoding_option = click.option( - "--encoding", - required=False, - default="utf-8", - help=( - "The encoding use to decode the log file. Accepts any encoding " - "supported by Python's `codecs` module. Defaults to utf-8." - ), -) - -log_encoding_errors_option = click.option( - "--encoding-errors", - required=False, - default="strict", - help=( - "The error handling scheme to use for decoding errors. " - "Accepts any error handling scheme supported by Python's `codecs`" - "module. Defaults to strict." - ), -) - - -def _get_head_node_ip(address: Optional[str] = None): - """Get the head node ip from the ray address if possible - - Args: - address: ray cluster address, e.g. 
"auto", "localhost:6379" - - Raises: - click.UsageError if node ip could not be resolved - """ - try: - address = services.canonicalize_bootstrap_address_or_die(address) - return address.split(":")[0] - except (ConnectionError, ValueError) as e: - # Hide all the stack trace - raise click.UsageError(str(e)) - - -def _print_log( - address: Optional[str] = None, - node_id: Optional[str] = None, - node_ip: Optional[str] = None, - filename: Optional[str] = None, - actor_id: Optional[str] = None, - pid: Optional[int] = None, - follow: bool = False, - tail: int = DEFAULT_LOG_LIMIT, - timeout: int = DEFAULT_RPC_TIMEOUT, - interval: Optional[float] = None, - suffix: str = "out", - encoding: str = "utf-8", - encoding_errors: str = "strict", -): - """Wrapper around `get_log()` that prints the preamble and the log lines""" - if tail > 0: - print( - f"--- Log has been truncated to last {tail} lines." - " Use `--tail` flag to toggle. Set to -1 for getting the entire file. ---\n" - ) - - if node_id is None and node_ip is None: - # Auto detect node ip from the ray address when address neither is given - node_ip = _get_head_node_ip(address) - - for chunk in get_log( - address=address, - node_id=node_id, - node_ip=node_ip, - filename=filename, - actor_id=actor_id, - tail=tail, - pid=pid, - follow=follow, - _interval=interval, - timeout=timeout, - suffix=suffix, - encoding=encoding, - errors=encoding_errors, - ): - print(chunk, end="", flush=True) - - -LOG_CLI_HELP_MSG = """ -Get logs based on filename (cluster) or resource identifiers (actor) - -Example: - - Get all the log files available on a node (ray address could be - obtained from `ray start --head` or `ray.init()`). - - ``` - ray logs cluster - ``` - - [ray logs cluster] Print the last 500 lines of raylet.out on a head node. 
- - ``` - ray logs cluster raylet.out --tail 500 - ``` - - Or simply, using `ray logs` as an alias for `ray logs cluster`: - - ``` - ray logs raylet.out --tail 500 - ``` - - Print the last 500 lines of raylet.out on a worker node id A. - - ``` - ray logs raylet.out --tail 500 —-node-id A - ``` - - [ray logs actor] Follow the log file with an actor id ABC. - - ``` - ray logs actor --id ABC --follow - ``` -""" - - -class LogCommandGroup(click.Group): - def resolve_command(self, ctx, args): - """Try resolve the command line args assuming users omitted the subcommand. - - This overrides the default `resolve_command` for the parent class. - This will allow command alias of `ray ` to `ray cluster `. - """ - ctx.resilient_parsing = True - res = super().resolve_command(ctx, args) - cmd_name, cmd, parsed_args = res - if cmd is None: - # It could have been `ray logs ...`, forward to `ray logs cluster ...` - return super().resolve_command(ctx, ["cluster"] + args) - return cmd_name, cmd, parsed_args - - -logs_state_cli_group = LogCommandGroup(help=LOG_CLI_HELP_MSG) - - -@logs_state_cli_group.command(name="cluster") -@click.argument( - "glob_filter", - required=False, - default="*", -) -@address_option -@log_node_id_option -@log_node_ip_option -@log_follow_option -@log_tail_option -@log_interval_option -@log_timeout_option -@log_encoding_option -@log_encoding_errors_option -@click.pass_context -@PublicAPI(stability="alpha") -def log_cluster( - ctx, - glob_filter: str, - address: Optional[str], - node_id: Optional[str], - node_ip: Optional[str], - follow: bool, - tail: int, - interval: float, - timeout: int, - encoding: str, - encoding_errors: str, -): - """Get/List logs that matches the GLOB_FILTER in the cluster. - By default, it prints a list of log files that match the filter. - By default, it prints the head node logs. - If there's only 1 match, it will print the log file. - - Example: - - Print the last 500 lines of raylet.out on a head node. 
- - ``` - ray logs [cluster] raylet.out --tail 500 - ``` - - Print the last 500 lines of raylet.out on a worker node id A. - - ``` - ray logs [cluster] raylet.out --tail 500 —-node-id A - ``` - - Download the gcs_server.txt file to the local machine. - - ``` - ray logs [cluster] gcs_server.out --tail -1 > gcs_server.txt - ``` - - Follow the log files from the last 100 lines. - - ``` - ray logs [cluster] raylet.out --tail 100 -f - ``` - - Raises: - :class:`RayStateApiException ` if the CLI - is failed to query the data. - """ # noqa: E501 - - if node_id is None and node_ip is None: - node_ip = _get_head_node_ip(address) - - logs = list_logs( - address=address, - node_id=node_id, - node_ip=node_ip, - glob_filter=glob_filter, - timeout=timeout, - ) - - log_files_found = [] - for _, log_files in logs.items(): - for log_file in log_files: - log_files_found.append(log_file) - - if len(log_files_found) != 1: - # Print the list of log files found if no unique log found - if node_id: - print(f"Node ID: {node_id}") - elif node_ip: - print(f"Node IP: {node_ip}") - print(output_with_format(logs, schema=None, format=AvailableFormat.YAML)) - return - - # If there's only 1 file, that means there's a unique match. 
- filename = log_files_found[0] - - _print_log( - address=address, - node_id=node_id, - node_ip=node_ip, - filename=filename, - tail=tail, - follow=follow, - interval=interval, - timeout=timeout, - encoding=encoding, - encoding_errors=encoding_errors, - ) - - -@logs_state_cli_group.command(name="actor") -@click.option( - "--id", - "-a", - required=False, - type=str, - default=None, - help="Retrieves the logs corresponding to this ActorID.", -) -@click.option( - "--pid", - "-pid", - required=False, - type=str, - default=None, - help="Retrieves the logs from the actor with this pid.", -) -@address_option -@log_node_id_option -@log_node_ip_option -@log_follow_option -@log_tail_option -@log_interval_option -@log_timeout_option -@log_suffix_option -@click.pass_context -@PublicAPI(stability="alpha") -def log_actor( - ctx, - id: Optional[str], - pid: Optional[str], - address: Optional[str], - node_id: Optional[str], - node_ip: Optional[str], - follow: bool, - tail: int, - interval: float, - timeout: int, - err: bool, -): - """Get/List logs associated with an actor. - - Example: - - Follow the log file with an actor id ABC. - - ``` - ray logs actor --id ABC --follow - ``` - - Get the actor log from pid 123, ip ABC. - Note that this goes well with the driver log of Ray which prints - (ip=ABC, pid=123, class_name) logs. - - ``` - ray logs actor --pid=123 —ip=ABC - ``` - - Get the actor err log file. - - ``` - ray logs actor --id ABC --err - ``` - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - MissingParameter if inputs are missing. 
- """ # noqa: E501 - - if pid is None and id is None: - raise click.MissingParameter( - message="At least one of `--pid` and `--id` has to be set", - param_type="option", - ) - - _print_log( - address=address, - node_id=node_id, - node_ip=node_ip, - pid=pid, - actor_id=id, - tail=tail, - follow=follow, - interval=interval, - timeout=timeout, - suffix="err" if err else "out", - ) - - -@logs_state_cli_group.command(name="worker") -@click.option( - "--pid", - "-pid", - # The only identifier supported for now, TODO(rickyx): add worker id support - required=True, - type=str, - help="Retrieves the logs from the worker with this pid.", -) -@address_option -@log_node_id_option -@log_node_ip_option -@log_follow_option -@log_tail_option -@log_interval_option -@log_timeout_option -@log_suffix_option -@click.pass_context -@PublicAPI(stability="alpha") -def log_worker( - ctx, - pid: Optional[str], - address: Optional[str], - node_id: Optional[str], - node_ip: Optional[str], - follow: bool, - tail: int, - interval: float, - timeout: int, - err: bool, -): - """Get/List logs associated with a worker process. - - Example: - - Follow the log file from a worker process with pid=ABC. - - ``` - ray logs worker --pid ABC --follow - ``` - - Get the stderr logs from a worker process. - - ``` - ray logs worker --pid ABC --err - ``` - - Raises: - :class:`RayStateApiException ` - if the CLI is failed to query the data. - MissingParameter if inputs are missing. 
- """ # noqa: E501 - - _print_log( - address=address, - node_id=node_id, - node_ip=node_ip, - pid=pid, - tail=tail, - follow=follow, - interval=interval, - timeout=timeout, - suffix="err" if err else "out", - ) +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/state_manager.py b/python/ray/experimental/state/state_manager.py index 11ea98b89c4c..7720606a68b9 100644 --- a/python/ray/experimental/state/state_manager.py +++ b/python/ray/experimental/state/state_manager.py @@ -1,423 +1,4 @@ -import inspect -import logging -from collections import defaultdict -from functools import wraps -from typing import Dict, List, Optional, Tuple +from ray.util.state.state_manager import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import -import grpc -from grpc.aio._call import UnaryStreamCall - -import ray -import ray.dashboard.modules.log.log_consts as log_consts -from ray._private import ray_constants -from ray._private.gcs_utils import GcsAioClient -from ray._private.utils import hex_to_binary -from ray._raylet import ActorID, JobID -from ray.core.generated import gcs_service_pb2_grpc -from ray.core.generated.gcs_pb2 import ActorTableData -from ray.core.generated.gcs_service_pb2 import ( - GetAllActorInfoReply, - GetAllActorInfoRequest, - GetAllNodeInfoReply, - GetAllNodeInfoRequest, - GetAllPlacementGroupReply, - GetAllPlacementGroupRequest, - GetAllWorkerInfoReply, - GetAllWorkerInfoRequest, - GetTaskEventsReply, - GetTaskEventsRequest, -) -from ray.core.generated.node_manager_pb2 import ( - GetObjectsInfoReply, - GetObjectsInfoRequest, - GetTasksInfoReply, - GetTasksInfoRequest, -) -from ray.core.generated.node_manager_pb2_grpc import NodeManagerServiceStub -from ray.core.generated.reporter_pb2 import ( - ListLogsReply, - ListLogsRequest, - StreamLogRequest, -) -from ray.core.generated.reporter_pb2_grpc import LogServiceStub -from ray.core.generated.runtime_env_agent_pb2 import ( - GetRuntimeEnvsInfoReply, - 
GetRuntimeEnvsInfoRequest, -) -from ray.core.generated.runtime_env_agent_pb2_grpc import RuntimeEnvServiceStub -from ray.dashboard.datacenter import DataSource -from ray.dashboard.modules.job.common import JobInfo, JobInfoStorageClient -from ray.dashboard.utils import Dict as Dictionary -from ray.experimental.state.common import ( - RAY_MAX_LIMIT_FROM_DATA_SOURCE, - PredicateType, - SupportedFilterType, -) -from ray.experimental.state.exception import DataSourceUnavailable - -logger = logging.getLogger(__name__) - -_STATE_MANAGER_GRPC_OPTIONS = [ - *ray_constants.GLOBAL_GRPC_OPTIONS, - ("grpc.max_send_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE), - ("grpc.max_receive_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE), -] - - -def handle_grpc_network_errors(func): - """Decorator to add a network handling logic. - - It is a helper method for `StateDataSourceClient`. - The method can only be used for async methods. - """ - assert inspect.iscoroutinefunction(func) - - @wraps(func) - async def api_with_network_error_handler(*args, **kwargs): - """Apply the network error handling logic to each APIs, - such as retry or exception policies. - - Returns: - If RPC succeeds, it returns what the original function returns. - If RPC fails, it raises exceptions. - Exceptions: - DataSourceUnavailable: if the source is unavailable because it is down - or there's a slow network issue causing timeout. - Otherwise, the raw network exceptions (e.g., gRPC) will be raised. - """ - try: - return await func(*args, **kwargs) - except grpc.aio.AioRpcError as e: - if ( - e.code() == grpc.StatusCode.DEADLINE_EXCEEDED - or e.code() == grpc.StatusCode.UNAVAILABLE - ): - raise DataSourceUnavailable( - "Failed to query the data source. " - "It is either there's a network issue, or the source is down." - ) - else: - logger.exception(e) - raise e - - return api_with_network_error_handler - - -class IdToIpMap: - def __init__(self): - # Node IP to node ID mapping. 
- self._ip_to_node_id = defaultdict(str) - # Node ID to node IP mapping. - self._node_id_to_ip = defaultdict(str) - - def put(self, node_id: str, address: str): - self._ip_to_node_id[address] = node_id - self._node_id_to_ip[node_id] = address - - def get_ip(self, node_id: str): - return self._node_id_to_ip.get(node_id) - - def get_node_id(self, address: str): - return self._ip_to_node_id.get(address) - - def pop(self, node_id: str): - """Pop the given node id. - - Returns: - False if the corresponding node id doesn't exist. - True if it pops correctly. - """ - ip = self._node_id_to_ip.get(node_id) - if not ip: - return None - assert ip in self._ip_to_node_id - self._node_id_to_ip.pop(node_id) - self._ip_to_node_id.pop(ip) - return True - - -class StateDataSourceClient: - """The client to query states from various data sources such as Raylet, GCS, Agents. - - Note that it doesn't directly query core workers. They are proxied through raylets. - - The module is not in charge of service discovery. The caller is responsible for - finding services and register stubs through `register*` APIs. - - Non `register*` APIs - - Return the protobuf directly if it succeeds to query the source. - - Raises an exception if there's any network issue. - - throw a ValueError if it cannot find the source. 
- """ - - def __init__(self, gcs_channel: grpc.aio.Channel, gcs_aio_client: GcsAioClient): - self.register_gcs_client(gcs_channel) - self._raylet_stubs = {} - self._runtime_env_agent_stub = {} - self._log_agent_stub = {} - self._job_client = JobInfoStorageClient(gcs_aio_client) - self._id_id_map = IdToIpMap() - - def register_gcs_client(self, gcs_channel: grpc.aio.Channel): - self._gcs_actor_info_stub = gcs_service_pb2_grpc.ActorInfoGcsServiceStub( - gcs_channel - ) - self._gcs_pg_info_stub = gcs_service_pb2_grpc.PlacementGroupInfoGcsServiceStub( - gcs_channel - ) - self._gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub( - gcs_channel - ) - self._gcs_worker_info_stub = gcs_service_pb2_grpc.WorkerInfoGcsServiceStub( - gcs_channel - ) - self._gcs_task_info_stub = gcs_service_pb2_grpc.TaskInfoGcsServiceStub( - gcs_channel - ) - - def register_raylet_client(self, node_id: str, address: str, port: int): - full_addr = f"{address}:{port}" - options = _STATE_MANAGER_GRPC_OPTIONS - channel = ray._private.utils.init_grpc_channel( - full_addr, options, asynchronous=True - ) - self._raylet_stubs[node_id] = NodeManagerServiceStub(channel) - self._id_id_map.put(node_id, address) - - def unregister_raylet_client(self, node_id: str): - self._raylet_stubs.pop(node_id) - self._id_id_map.pop(node_id) - - def register_agent_client(self, node_id, address: str, port: int): - options = _STATE_MANAGER_GRPC_OPTIONS - channel = ray._private.utils.init_grpc_channel( - f"{address}:{port}", options=options, asynchronous=True - ) - self._runtime_env_agent_stub[node_id] = RuntimeEnvServiceStub(channel) - self._log_agent_stub[node_id] = LogServiceStub(channel) - self._id_id_map.put(node_id, address) - - def unregister_agent_client(self, node_id: str): - self._runtime_env_agent_stub.pop(node_id) - self._log_agent_stub.pop(node_id) - self._id_id_map.pop(node_id) - - def get_all_registered_raylet_ids(self) -> List[str]: - return self._raylet_stubs.keys() - - def 
get_all_registered_agent_ids(self) -> List[str]: - assert len(self._log_agent_stub) == len(self._runtime_env_agent_stub) - return self._runtime_env_agent_stub.keys() - - def ip_to_node_id(self, ip: Optional[str]) -> Optional[str]: - """Return the node id that corresponds to the given ip. - - Args: - ip: The ip address. - - Returns: - None if the corresponding id doesn't exist. - Node id otherwise. If None node_ip is given, - it will also return None. - """ - if not ip: - return None - return self._id_id_map.get_node_id(ip) - - @handle_grpc_network_errors - async def get_all_actor_info( - self, - timeout: int = None, - limit: int = None, - filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, - ) -> Optional[GetAllActorInfoReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - if filters is None: - filters = [] - - req_filters = GetAllActorInfoRequest.Filters() - for filter in filters: - key, predicate, value = filter - if predicate != "=": - # We only support EQUAL predicate for source side filtering. 
- continue - if key == "actor_id": - req_filters.actor_id = ActorID(hex_to_binary(value)).binary() - elif key == "state": - if value not in ActorTableData.ActorState.keys(): - raise ValueError(f"Invalid actor state for filtering: {value}") - req_filters.state = ActorTableData.ActorState.Value(value) - elif key == "job_id": - req_filters.job_id = JobID(hex_to_binary(value)).binary() - - request = GetAllActorInfoRequest(limit=limit, filters=req_filters) - reply = await self._gcs_actor_info_stub.GetAllActorInfo( - request, timeout=timeout - ) - return reply - - @handle_grpc_network_errors - async def get_all_task_info( - self, - timeout: int = None, - limit: int = None, - job_id: Optional[str] = None, - exclude_driver: bool = True, - ) -> Optional[GetTaskEventsReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - if job_id: - job_id = JobID(hex_to_binary(job_id)).binary() - request = GetTaskEventsRequest( - limit=limit, exclude_driver=exclude_driver, job_id=job_id - ) - reply = await self._gcs_task_info_stub.GetTaskEvents(request, timeout=timeout) - return reply - - @handle_grpc_network_errors - async def get_all_placement_group_info( - self, timeout: int = None, limit: int = None - ) -> Optional[GetAllPlacementGroupReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - - request = GetAllPlacementGroupRequest(limit=limit) - reply = await self._gcs_pg_info_stub.GetAllPlacementGroup( - request, timeout=timeout - ) - return reply - - @handle_grpc_network_errors - async def get_all_node_info( - self, timeout: int = None - ) -> Optional[GetAllNodeInfoReply]: - request = GetAllNodeInfoRequest() - reply = await self._gcs_node_info_stub.GetAllNodeInfo(request, timeout=timeout) - return reply - - @handle_grpc_network_errors - async def get_all_worker_info( - self, timeout: int = None, limit: int = None - ) -> Optional[GetAllWorkerInfoReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - - request = GetAllWorkerInfoRequest(limit=limit) - 
reply = await self._gcs_worker_info_stub.GetAllWorkerInfo( - request, timeout=timeout - ) - return reply - - async def get_job_info(self) -> Optional[Dict[str, JobInfo]]: - # Cannot use @handle_grpc_network_errors because async def is not supported yet. - # TODO(sang): Support timeout & make it async - try: - return await self._job_client.get_all_jobs() - except grpc.aio.AioRpcError as e: - if ( - e.code == grpc.StatusCode.DEADLINE_EXCEEDED - or e.code == grpc.StatusCode.UNAVAILABLE - ): - raise DataSourceUnavailable( - "Failed to query the data source. " - "It is either there's a network issue, or the source is down." - ) - else: - logger.exception(e) - raise e - - async def get_all_cluster_events(self) -> Dictionary: - return DataSource.events - - @handle_grpc_network_errors - async def get_task_info( - self, node_id: str, timeout: int = None, limit: int = None - ) -> Optional[GetTasksInfoReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - - stub = self._raylet_stubs.get(node_id) - if not stub: - raise ValueError(f"Raylet for a node id, {node_id} doesn't exist.") - - reply = await stub.GetTasksInfo( - GetTasksInfoRequest(limit=limit), timeout=timeout - ) - return reply - - @handle_grpc_network_errors - async def get_object_info( - self, node_id: str, timeout: int = None, limit: int = None - ) -> Optional[GetObjectsInfoReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - - stub = self._raylet_stubs.get(node_id) - if not stub: - raise ValueError(f"Raylet for a node id, {node_id} doesn't exist.") - - reply = await stub.GetObjectsInfo( - GetObjectsInfoRequest(limit=limit), - timeout=timeout, - ) - return reply - - @handle_grpc_network_errors - async def get_runtime_envs_info( - self, node_id: str, timeout: int = None, limit: int = None - ) -> Optional[GetRuntimeEnvsInfoReply]: - if not limit: - limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE - - stub = self._runtime_env_agent_stub.get(node_id) - if not stub: - raise ValueError(f"Agent for a node 
id, {node_id} doesn't exist.") - - reply = await stub.GetRuntimeEnvsInfo( - GetRuntimeEnvsInfoRequest(limit=limit), - timeout=timeout, - ) - return reply - - @handle_grpc_network_errors - async def list_logs( - self, node_id: str, glob_filter: str, timeout: int = None - ) -> ListLogsReply: - stub = self._log_agent_stub.get(node_id) - if not stub: - raise ValueError(f"Agent for node id: {node_id} doesn't exist.") - return await stub.ListLogs( - ListLogsRequest(glob_filter=glob_filter), timeout=timeout - ) - - @handle_grpc_network_errors - async def stream_log( - self, - node_id: str, - log_file_name: str, - keep_alive: bool, - lines: int, - interval: Optional[float], - timeout: int, - ) -> UnaryStreamCall: - stub = self._log_agent_stub.get(node_id) - if not stub: - raise ValueError(f"Agent for node id: {node_id} doesn't exist.") - stream = stub.StreamLog( - StreamLogRequest( - keep_alive=keep_alive, - log_file_name=log_file_name, - lines=lines, - interval=interval, - ), - timeout=timeout, - ) - await self._validate_stream(stream) - return stream - - @staticmethod - async def _validate_stream(stream): - metadata = await stream.initial_metadata() - if metadata.get(log_consts.LOG_GRPC_ERROR) == log_consts.FILE_NOT_FOUND: - raise ValueError('File "{log_file_name}" not found on node {node_id}') +record_deprecated_state_api_import() diff --git a/python/ray/experimental/state/util.py b/python/ray/experimental/state/util.py index f7ba1d599342..24a26dd72982 100644 --- a/python/ray/experimental/state/util.py +++ b/python/ray/experimental/state/util.py @@ -1,47 +1,4 @@ -from typing import Optional, Union +from ray.util.state.util import * # noqa: F401 F403 +from ray.util.state.util import record_deprecated_state_api_import - -def convert_string_to_type( - val: Optional[Union[str, int, float, bool]], convert_type: Union[int, float, bool] -) -> Union[int, float, bool]: - """Convert the given value to a convert type. 
- - If the given val is None, it will just return None without the conversion. - - It supports, - str -> int/float/bool - int -> int - bool -> bool - float -> float - """ - if val is None: - return None - elif type(val) is convert_type: - return val - elif convert_type is int: - try: - val = int(val) - except ValueError: - raise ValueError( - f"Failed to convert a value {val} of type {type(val)} to {convert_type}" - ) - elif convert_type is float: - try: - val = float(val) - except ValueError: - raise ValueError( - f"Failed to convert a value {val} of type {type(val)} to {convert_type}" - ) - elif convert_type is bool: - # Without this, "False" will become True. - if val == "False" or val == "false" or val == "0": - val = False - elif val == "True" or val == "true" or val == "1": - val = True - else: - raise ValueError( - f"Failed to convert a value {val} of type {type(val)} to {convert_type}" - ) - else: - assert False, f"Unsupported convert type {convert_type}" - return val +record_deprecated_state_api_import() diff --git a/python/ray/experimental/tqdm_ray.py b/python/ray/experimental/tqdm_ray.py index 59ceb4967c34..360e2a7e5c02 100644 --- a/python/ray/experimental/tqdm_ray.py +++ b/python/ray/experimental/tqdm_ray.py @@ -3,6 +3,7 @@ import json import logging import os +import sys import threading import uuid from typing import Any, Dict, Iterable, Optional @@ -37,6 +38,11 @@ def safe_print(*args, **kwargs): By default, the builtin print will be patched to this function when tqdm_ray is used. To disable this, set RAY_TQDM_PATCH_PRINT=0. """ + + # Ignore prints to StringIO objects, etc. 
+ if kwargs.get("file") not in [sys.stdout, sys.stderr, None]: + return _print(*args, **kwargs) + try: instance().hide_bars() _print(*args, **kwargs) diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 5d5f3ab593ea..a06630fd2132 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -117,6 +117,7 @@ cdef extern from "ray/common/status.h" namespace "ray" nogil: c_bool IsNotFound() c_bool IsObjectUnknownOwner() c_bool IsRpcError() + c_bool IsOutOfResource() c_string ToString() c_string CodeAsString() @@ -153,6 +154,8 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil: pass cdef cppclass CWorkerType "ray::core::WorkerType": pass + cdef cppclass CWorkerExitType "ray::rpc::WorkerExitType": + pass cdef cppclass CTaskType "ray::TaskType": pass cdef cppclass CPlacementStrategy "ray::core::PlacementStrategy": @@ -203,6 +206,8 @@ cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef CWorkerType WORKER_TYPE_SPILL_WORKER "ray::core::WorkerType::SPILL_WORKER" # noqa: E501 cdef CWorkerType WORKER_TYPE_RESTORE_WORKER "ray::core::WorkerType::RESTORE_WORKER" # noqa: E501 cdef CWorkerType WORKER_TYPE_UTIL_WORKER "ray::core::WorkerType::UTIL_WORKER" # noqa: E501 + cdef CWorkerExitType WORKER_EXIT_TYPE_USER_ERROR "ray::rpc::WorkerExitType::USER_ERROR" # noqa: E501 + cdef CWorkerExitType WORKER_EXIT_TYPE_SYSTEM_ERROR "ray::rpc::WorkerExitType::SYSTEM_ERROR" # noqa: E501 cdef extern from "src/ray/protobuf/common.pb.h" nogil: cdef CTaskType TASK_TYPE_NORMAL_TASK "ray::TaskType::NORMAL_TASK" @@ -341,6 +346,25 @@ cdef extern from "ray/gcs/gcs_client/gcs_client.h" nogil: CRayStatus GetAllJobInfo( int64_t timeout_ms, c_vector[CJobTableData]& result) +cdef extern from "ray/gcs/gcs_client/gcs_client.h" namespace "ray::gcs" nogil: + unordered_map[c_string, double] PythonGetResourcesTotal( + const CGcsNodeInfo& node_info) + +cdef extern from "ray/gcs/pubsub/gcs_pub_sub.h" nogil: + + cdef cppclass CPythonGcsPublisher 
"ray::gcs::PythonGcsPublisher": + + CPythonGcsPublisher(const c_string& gcs_address) + + CRayStatus Connect() + + CRayStatus PublishError( + const c_string &key_id, const CErrorTableData &data, int64_t num_retries) + + CRayStatus PublishLogs(const c_string &key_id, const CLogBatch &data) + + CRayStatus PublishFunctionKey(const CPythonFunction& python_function) + cdef extern from "src/ray/protobuf/gcs.pb.h" nogil: cdef cppclass CJobConfig "ray::rpc::JobConfig": c_string ray_namespace() const @@ -350,12 +374,46 @@ cdef extern from "src/ray/protobuf/gcs.pb.h" nogil: c_string node_id() const c_string node_name() const int state() const + c_string node_manager_address() const + c_string node_manager_hostname() const + int node_manager_port() const + int object_manager_port() const + c_string object_store_socket_name() const + c_string raylet_socket_name() const + int metrics_export_port() const + void ParseFromString(const c_string &serialized) + + cdef enum CGcsNodeState "ray::rpc::GcsNodeInfo_GcsNodeState": + ALIVE "ray::rpc::GcsNodeInfo_GcsNodeState_ALIVE", cdef cppclass CJobTableData "ray::rpc::JobTableData": c_string job_id() const c_bool is_dead() const CJobConfig config() const + cdef cppclass CPythonFunction "ray::rpc::PythonFunction": + void set_key(const c_string &key) + + cdef cppclass CErrorTableData "ray::rpc::ErrorTableData": + c_string job_id() const + c_string type() const + c_string error_message() const + double timestamp() const + + void set_job_id(const c_string &job_id) + void set_type(const c_string &type) + void set_error_message(const c_string &error_message) + void set_timestamp(double timestamp) + + cdef cppclass CLogBatch "ray::rpc::LogBatch": + void set_ip(const c_string &ip) + void set_pid(const c_string &pid) + void set_job_id(const c_string &job_id) + void set_is_error(c_bool is_error) + void add_lines(const c_string &line) + void set_actor_name(const c_string &actor_name) + void set_task_name(const c_string &task_name) + cdef extern from 
"ray/common/task/task_spec.h" nogil: cdef cppclass CConcurrencyGroup "ray::ConcurrencyGroup": @@ -367,3 +425,7 @@ cdef extern from "ray/common/task/task_spec.h" nogil: c_string GetName() const uint32_t GetMaxConcurrency() const c_vector[CFunctionDescriptor] GetFunctionDescriptors() const + +cdef extern from "ray/common/constants.h" nogil: + cdef const char[] kWorkerSetupHookKeyName + cdef int kResourceUnitScaling diff --git a/python/ray/includes/common.pxi b/python/ray/includes/common.pxi index 89983ff8808c..ea402ded009e 100644 --- a/python/ray/includes/common.pxi +++ b/python/ray/includes/common.pxi @@ -6,6 +6,9 @@ from ray.includes.common cimport ( CObjectLocation, CGcsClientOptions, CPythonGcsClient, + CPythonGcsPublisher, + kWorkerSetupHookKeyName, + kResourceUnitScaling, ) @@ -23,3 +26,7 @@ cdef class GcsClientOptions: cdef CGcsClientOptions* native(self): return (self.inner.get()) + + +WORKER_SETUP_HOOK_KEY_NAME_GCS = str(kWorkerSetupHookKeyName) +RESOURCE_UNIT_SCALING = kResourceUnitScaling diff --git a/python/ray/includes/global_state_accessor.pxi b/python/ray/includes/global_state_accessor.pxi index 8492ee56a89b..55c36f79c7a7 100644 --- a/python/ray/includes/global_state_accessor.pxi +++ b/python/ray/includes/global_state_accessor.pxi @@ -1,5 +1,7 @@ from ray.includes.common cimport ( - CGcsClientOptions + CGcsClientOptions, + CGcsNodeState, + PythonGetResourcesTotal ) from ray.includes.unique_ids cimport ( @@ -51,10 +53,38 @@ cdef class GlobalStateAccessor: return cjob_id.ToInt() def get_node_table(self): - cdef c_vector[c_string] result - with nogil: - result = self.inner.get().GetAllNodeInfo() - return result + cdef: + c_vector[c_string] items + c_string item + CGcsNodeInfo c_node_info + unordered_map[c_string, double] c_resources + with nogil: + items = self.inner.get().GetAllNodeInfo() + results = [] + for item in items: + c_node_info.ParseFromString(item) + node_info = { + "NodeID": ray._private.utils.binary_to_hex(c_node_info.node_id()), + "Alive": 
c_node_info.state() == CGcsNodeState.ALIVE, + "NodeManagerAddress": c_node_info.node_manager_address().decode(), + "NodeManagerHostname": c_node_info.node_manager_hostname().decode(), + "NodeManagerPort": c_node_info.node_manager_port(), + "ObjectManagerPort": c_node_info.object_manager_port(), + "ObjectStoreSocketName": + c_node_info.object_store_socket_name().decode(), + "RayletSocketName": c_node_info.raylet_socket_name().decode(), + "MetricsExportPort": c_node_info.metrics_export_port(), + "NodeName": c_node_info.node_name().decode(), + } + node_info["alive"] = node_info["Alive"] + c_resources = PythonGetResourcesTotal(c_node_info) + node_info["Resources"] = ( + {key.decode(): value for key, value in c_resources} + if node_info["Alive"] + else {} + ) + results.append(node_info) + return results def get_all_available_resources(self): cdef c_vector[c_string] result @@ -149,9 +179,15 @@ cdef class GlobalStateAccessor: cdef CRayStatus status cdef c_string cnode_ip_address = node_ip_address cdef c_string cnode_to_connect + cdef CGcsNodeInfo c_node_info with nogil: status = self.inner.get().GetNodeToConnectForDriver( cnode_ip_address, &cnode_to_connect) if not status.ok(): raise RuntimeError(status.message()) - return cnode_to_connect + c_node_info.ParseFromString(cnode_to_connect) + return { + "object_store_socket_name": c_node_info.object_store_socket_name().decode(), + "raylet_socket_name": c_node_info.raylet_socket_name().decode(), + "node_manager_port": c_node_info.node_manager_port(), + } diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 41e29f58012e..42c17b8572ca 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -42,6 +42,7 @@ from ray.includes.common cimport ( CJobConfig, CConcurrencyGroup, CSchedulingStrategy, + CWorkerExitType, ) from ray.includes.function_descriptor cimport ( CFunctionDescriptor, @@ -117,10 +118,11 @@ cdef extern from 
"ray/core_worker/core_worker.h" nogil: const CPlacementGroupID &placement_group_id) CRayStatus WaitPlacementGroupReady( const CPlacementGroupID &placement_group_id, int64_t timeout_seconds) - optional[c_vector[CObjectReference]] SubmitActorTask( + CRayStatus SubmitActorTask( const CActorID &actor_id, const CRayFunction &function, const c_vector[unique_ptr[CTaskArg]] &args, - const CTaskOptions &options) + const CTaskOptions &options, + c_vector[CObjectReference]&) CRayStatus KillActor( const CActorID &actor_id, c_bool force_kill, c_bool no_restart) @@ -150,6 +152,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CJobID GetCurrentJobId() CTaskID GetCurrentTaskId() + int64_t GetCurrentTaskAttemptNumber() CNodeID GetCurrentNodeId() int64_t GetTaskDepth() c_bool GetCurrentTaskRetryExceptions() @@ -253,8 +256,6 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CJobConfig GetJobConfig() - c_bool IsExiting() const - int64_t GetNumTasksSubmitted() const int64_t GetNumLeasesRequested() const @@ -270,6 +271,10 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: void RecordTaskLogEnd(int64_t stdout_end_offset, int64_t stderr_end_offset) const + void Exit(const CWorkerExitType exit_type, + const c_string &detail, + const shared_ptr[LocalMemoryBuffer] &creation_task_exception_pb_bytes) + cdef cppclass CCoreWorkerOptions "ray::core::CoreWorkerOptions": CWorkerType worker_type CLanguage language diff --git a/python/ray/job_config.py b/python/ray/job_config.py index 2772902fa098..5d5b818dc39e 100644 --- a/python/ray/job_config.py +++ b/python/ray/job_config.py @@ -1,69 +1,115 @@ import uuid -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import ray._private.gcs_utils as gcs_utils from ray.util.annotations import PublicAPI +if TYPE_CHECKING: + from ray.runtime_env import RuntimeEnv + @PublicAPI class JobConfig: """A class used to store the configurations of a job. 
- Attributes: + Examples: + .. testcode:: + :hide: + + import ray + ray.shutdown() + + .. testcode:: + + import ray + from ray.job_config import JobConfig + + ray.init(job_config=JobConfig(default_actor_lifetime="non_detached")) + + Args: jvm_options: The jvm options for java workers of the job. code_search_path: A list of directories or jar files that specify the search path for user code. This will be used as `CLASSPATH` in Java and `PYTHONPATH` in Python. - runtime_env: A runtime environment dictionary (see - ``runtime_env.py`` for detailed documentation). - client_job: A boolean represent the source of the job. - default_actor_lifetime: The default value of actor lifetime. - py_driver_sys_path: A list of directories that - specify the search path for python workers. + See :ref:`Ray cross-language programming ` for more details. + runtime_env: A :ref:`runtime environment ` dictionary. + metadata: An opaque metadata dictionary. + ray_namespace: A :ref:`namespace ` + is a logical grouping of jobs and named actors. + default_actor_lifetime: The default value of actor lifetime, + can be "detached" or "non_detached". + See :ref:`actor lifetimes ` for more details. """ def __init__( self, - jvm_options: List[str] = None, - code_search_path: List[str] = None, - runtime_env: dict = None, - client_job: bool = False, + jvm_options: Optional[List[str]] = None, + code_search_path: Optional[List[str]] = None, + runtime_env: Optional[dict] = None, + _client_job: bool = False, metadata: Optional[dict] = None, ray_namespace: Optional[str] = None, default_actor_lifetime: str = "non_detached", - py_driver_sys_path: List[str] = None, + _py_driver_sys_path: Optional[List[str]] = None, ): + #: The jvm options for java workers of the job. self.jvm_options = jvm_options or [] + #: A list of directories or jar files that + #: specify the search path for user code. 
self.code_search_path = code_search_path or [] # It's difficult to find the error that caused by the # code_search_path is a string. So we assert here. assert isinstance(self.code_search_path, (list, tuple)), ( f"The type of code search path is incorrect: " f"{type(code_search_path)}" ) - self.client_job = client_job + self._client_job = _client_job + #: An opaque metadata dictionary. self.metadata = metadata or {} + #: A namespace is a logical grouping of jobs and named actors. self.ray_namespace = ray_namespace self.set_runtime_env(runtime_env) self.set_default_actor_lifetime(default_actor_lifetime) - self.py_driver_sys_path = py_driver_sys_path or [] + # A list of directories that specify the search path for python workers. + self._py_driver_sys_path = _py_driver_sys_path or [] def set_metadata(self, key: str, value: str) -> None: + """Add key-value pair to the metadata dictionary. + + If the key already exists, the value is overwritten to the new value. + + Examples: + .. testcode:: + + import ray + from ray.job_config import JobConfig + + job_config = JobConfig() + job_config.set_metadata("submitter", "foo") + + Args: + key: The key of the metadata. + value: The value of the metadata. + """ self.metadata[key] = value - def serialize(self): + def _serialize(self) -> str: """Serialize the struct into protobuf string""" - return self.get_proto_job_config().SerializeToString() + return self._get_proto_job_config().SerializeToString() def set_runtime_env( self, - runtime_env: Optional[Union[Dict[str, Any], "RuntimeEnv"]], # noqa: F821 + runtime_env: Optional[Union[Dict[str, Any], "RuntimeEnv"]], validate: bool = False, ) -> None: """Modify the runtime_env of the JobConfig. We don't validate the runtime_env by default here because it may go through some translation before actually being passed to C++ (e.g., - working_dir translated from a local directory to a URI. + working_dir translated from a local directory to a URI). 
+ + Args: + runtime_env: A :ref:`runtime environment ` dictionary. + validate: Whether to validate the runtime env. """ self.runtime_env = runtime_env if runtime_env is not None else {} if validate: @@ -71,11 +117,24 @@ def set_runtime_env( self._cached_pb = None def set_ray_namespace(self, ray_namespace: str) -> None: + """Set Ray :ref:`namespace `. + + Args: + ray_namespace: The namespace to set. + """ + if ray_namespace != self.ray_namespace: self.ray_namespace = ray_namespace self._cached_pb = None def set_default_actor_lifetime(self, default_actor_lifetime: str) -> None: + """Set the default actor lifetime, which can be "detached" or "non_detached". + + See :ref:`actor lifetimes ` for more details. + + Args: + default_actor_lifetime: The default actor lifetime to set. + """ if default_actor_lifetime == "detached": self._default_actor_lifetime = gcs_utils.JobConfig.ActorLifetime.DETACHED elif default_actor_lifetime == "non_detached": @@ -97,7 +156,7 @@ def _validate_runtime_env(self): return self.runtime_env return RuntimeEnv(**self.runtime_env) - def get_proto_job_config(self): + def _get_proto_job_config(self): """Return the protobuf structure of JobConfig.""" # TODO(edoakes): this is really unfortunate, but JobConfig is imported # all over the place so this causes circular imports. 
We should remove @@ -112,7 +171,7 @@ def get_proto_job_config(self): pb.ray_namespace = self.ray_namespace pb.jvm_options.extend(self.jvm_options) pb.code_search_path.extend(self.code_search_path) - pb.py_driver_sys_path.extend(self.py_driver_sys_path) + pb.py_driver_sys_path.extend(self._py_driver_sys_path) for k, v in self.metadata.items(): pb.metadata[k] = v @@ -131,28 +190,38 @@ def get_proto_job_config(self): return self._cached_pb - def runtime_env_has_working_dir(self): + def _runtime_env_has_working_dir(self): return self._validate_runtime_env().has_working_dir() - def get_serialized_runtime_env(self) -> str: + def _get_serialized_runtime_env(self) -> str: """Return the JSON-serialized parsed runtime env dict""" return self._validate_runtime_env().serialize() - def get_proto_runtime_env_config(self) -> str: + def _get_proto_runtime_env_config(self) -> str: """Return the JSON-serialized parsed runtime env info""" - return self.get_proto_job_config().runtime_env_info.runtime_env_config + return self._get_proto_job_config().runtime_env_info.runtime_env_config @classmethod def from_json(cls, job_config_json): - """ - Generates a JobConfig object from json. + """Generates a JobConfig object from json. + + Examples: + .. testcode:: + + from ray.job_config import JobConfig + + job_config = JobConfig.from_json( + {"runtime_env": {"working_dir": "uri://abc"}}) + + Args: + job_config_json: The job config json dictionary. 
""" return cls( jvm_options=job_config_json.get("jvm_options", None), code_search_path=job_config_json.get("code_search_path", None), runtime_env=job_config_json.get("runtime_env", None), - client_job=job_config_json.get("client_job", False), metadata=job_config_json.get("metadata", None), ray_namespace=job_config_json.get("ray_namespace", None), - py_driver_sys_path=job_config_json.get("py_driver_sys_path", None), + _client_job=job_config_json.get("client_job", False), + _py_driver_sys_path=job_config_json.get("py_driver_sys_path", None), ) diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index 04dde9fbd4fc..2d1162b6ce33 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -7,6 +7,7 @@ import ray._private.signature from ray import Language, cross_language from ray._private import ray_option_utils +from ray._private.auto_init_hook import auto_init_ray from ray._private.client_mode_hook import ( client_mode_convert_function, client_mode_should_convert, @@ -242,7 +243,8 @@ def _remote(self, args=None, kwargs=None, **task_options): # We pop the "max_calls" coming from "@ray.remote" here. We no longer need # it in "_remote()". task_options.pop("max_calls", None) - if client_mode_should_convert(auto_init=True): + auto_init_ray() + if client_mode_should_convert(): return client_mode_convert_function(self, args, kwargs, **task_options) worker = ray._private.worker.global_worker diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index 16623af6f0a4..bf4ac88db39a 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -359,7 +359,7 @@ def _get_actor_call_stats(self): @PublicAPI -@client_mode_hook(auto_init=False) +@client_mode_hook def get_runtime_context(): """Get the runtime context of the current driver/worker. 
diff --git a/python/ray/runtime_env/runtime_env.py b/python/ray/runtime_env/runtime_env.py index 32378b910840..e99072585688 100644 --- a/python/ray/runtime_env/runtime_env.py +++ b/python/ray/runtime_env/runtime_env.py @@ -3,7 +3,7 @@ import os from copy import deepcopy from dataclasses import asdict, is_dataclass -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import ray from ray._private.ray_constants import DEFAULT_RUNTIME_ENV_TIMEOUT_SECONDS @@ -235,6 +235,11 @@ class MyClass: The `run_options` list spec is here: https://docs.docker.com/engine/reference/run/ env_vars: Environment variables to set. + worker_setup_hook: The setup hook that's called after workers + start and before tasks and actors are scheduled. + The value has to be a callable when passed to the job/task/actor. + The callable is then exported and this value is converted to + the setup hook's function name for the observability purpose. config: config for runtime environment. Either a dict or a RuntimeEnvConfig. Field: (1) setup_timeout_seconds, the timeout of runtime environment creation, timeout is in seconds. @@ -258,6 +263,7 @@ class MyClass: # field which is not supported. We should remove it # with the test. 
"docker", + "worker_setup_hook", } extensions_fields: Set[str] = { @@ -275,6 +281,7 @@ def __init__( conda: Optional[Union[Dict[str, str], str]] = None, container: Optional[Dict[str, str]] = None, env_vars: Optional[Dict[str, str]] = None, + worker_setup_hook: Optional[Union[Callable, str]] = None, config: Optional[Union[Dict, RuntimeEnvConfig]] = None, _validate: bool = True, **kwargs, @@ -296,6 +303,8 @@ def __init__( runtime_env["env_vars"] = env_vars if config is not None: runtime_env["config"] = config + if worker_setup_hook is not None: + runtime_env["worker_setup_hook"] = worker_setup_hook if runtime_env.get("java_jars"): runtime_env["java_jars"] = runtime_env.get("java_jars") diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 6cb85e53704d..8dc704ea84a8 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -45,12 +45,6 @@ from ray.autoscaler._private.fake_multi_node.node_provider import FAKE_HEAD_NODE_ID from ray.util.annotations import PublicAPI -from ray.experimental.state.state_cli import ( - ray_get, - ray_list, - logs_state_cli_group, - summary_state_cli_group, -) logger = logging.getLogger(__name__) @@ -404,10 +398,10 @@ def debug(address): @click.option( "--dashboard-host", required=False, - default="localhost", + default=ray_constants.DEFAULT_DASHBOARD_IP, help="the host to bind the dashboard server to, either localhost " "(127.0.0.1) or 0.0.0.0 (available from all interfaces). 
By default, this " - "is localhost.", + "is 127.0.0.1", ) @click.option( "--dashboard-port", @@ -430,6 +424,12 @@ def debug(address): default=None, help="the port for dashboard agents to listen for grpc on.", ) +@click.option( + "--dashboard-grpc-port", + type=int, + default=None, + help="The port for the dashboard head to listen for grpc on.", +) @click.option( "--block", is_flag=True, @@ -552,6 +552,7 @@ def start( dashboard_port, dashboard_agent_listen_port, dashboard_agent_grpc_port, + dashboard_grpc_port, block, plasma_directory, autoscaling_config, @@ -638,6 +639,7 @@ def start( dashboard_port=dashboard_port, dashboard_agent_listen_port=dashboard_agent_listen_port, metrics_agent_port=dashboard_agent_grpc_port, + dashboard_grpc_port=dashboard_grpc_port, _system_config=system_config, enable_object_reconstruction=enable_object_reconstruction, metrics_export_port=metrics_export_port, @@ -2423,10 +2425,22 @@ def add_command_alias(command, name, hidden): cli.add_command(cpp) cli.add_command(disable_usage_stats) cli.add_command(enable_usage_stats) -cli.add_command(ray_list, name="list") -cli.add_command(ray_get, name="get") -add_command_alias(summary_state_cli_group, name="summary", hidden=False) -add_command_alias(logs_state_cli_group, name="logs", hidden=False) + +try: + from ray.util.state.state_cli import ( + ray_get, + ray_list, + logs_state_cli_group, + summary_state_cli_group, + ) + + cli.add_command(ray_list, name="list") + cli.add_command(ray_get, name="get") + add_command_alias(summary_state_cli_group, name="summary", hidden=False) + add_command_alias(logs_state_cli_group, name="logs", hidden=False) +except ImportError as e: + logger.debug(f"Integrating ray state command line tool failed: {e}") + try: from ray.dashboard.modules.job.cli import job_cli_group diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD index deb0959afd6a..c4495b9cd4f7 100644 --- a/python/ray/serve/BUILD +++ b/python/ray/serve/BUILD @@ -26,7 +26,7 @@ py_test( ) py_test( - name 
= "test_application", + name = "test_built_application", size = "medium", srcs = serve_tests_srcs, tags = ["exclusive", "team:serve"], @@ -573,4 +573,12 @@ py_test( srcs = serve_tests_srcs, tags = ["exclusive", "team:serve"], deps = [":serve_lib"], +) + +py_test( + name = "test_multiplex", + size = "medium", + srcs = serve_tests_srcs, + tags = ["exclusive", "team:serve"], + deps = [":serve_lib"], ) \ No newline at end of file diff --git a/python/ray/serve/__init__.py b/python/ray/serve/__init__.py index 23c0eac007c4..061374e29671 100644 --- a/python/ray/serve/__init__.py +++ b/python/ray/serve/__init__.py @@ -2,6 +2,7 @@ try: from ray.serve.api import ( + build, deployment, get_deployment, get_replica_context, @@ -11,6 +12,11 @@ shutdown, start, delete, + Application, + BuiltApplication, + Deployment, + multiplexed, + get_multiplexed_model_id, ) from ray.serve.air_integrations import PredictorDeployment from ray.serve.batching import batch @@ -29,6 +35,7 @@ __all__ = [ "batch", + "build", "start", "HTTPOptions", "get_replica_context", @@ -40,4 +47,9 @@ "run", "PredictorDeployment", "delete", + "Application", + "BuiltApplication", + "Deployment", + "multiplexed", + "get_multiplexed_model_id", ] diff --git a/python/ray/serve/_private/api.py b/python/ray/serve/_private/api.py index 29e4d69a22ef..132a7ac30109 100644 --- a/python/ray/serve/_private/api.py +++ b/python/ray/serve/_private/api.py @@ -1,10 +1,14 @@ -from typing import Dict, Optional, Tuple, Union +import inspect import logging import os +from types import FunctionType +from typing import Any, Dict, Optional, Tuple, Union + +from pydantic.main import ModelMetaclass import ray from ray._private.usage import usage_lib -from ray.serve.deployment import Deployment +from ray.serve.deployment import Application, Deployment from ray.serve.exceptions import RayServeException from ray.serve.config import HTTPOptions from ray.serve._private.constants import ( @@ -134,7 +138,7 @@ def _start_controller( """ # Initialize 
ray if needed. - ray._private.worker.global_worker.filter_logs_by_job = False + ray._private.worker.global_worker._filter_logs_by_job = False if not ray.is_initialized(): ray.init(namespace=SERVE_NAMESPACE) @@ -329,3 +333,55 @@ def serve_start( f'namespace "{SERVE_NAMESPACE}".' ) return client + + +def call_app_builder_with_args_if_necessary( + builder: Union[Application, FunctionType], + args: Dict[str, Any], +) -> Application: + """Builds a Serve application from an application builder function. + + If a pre-built application is passed, this is a no-op. + + Else, we validate the signature of the builder, convert the args dictionary to + the user-annotated Pydantic model if provided, and call the builder function. + + The output of the function is returned (must be an Application). + """ + if isinstance(builder, Application): + if len(args) > 0: + raise ValueError( + "Arguments can only be passed to an application builder function, " + "not an already built application." + ) + return builder + elif not isinstance(builder, FunctionType): + raise TypeError( + "Expected a built Serve application or an application builder function " + f"but got: {type(builder)}." + ) + + # Check that the builder only takes a single argument. + # TODO(edoakes): we may want to loosen this to allow optional kwargs in the future. + signature = inspect.signature(builder) + if len(signature.parameters) != 1: + raise TypeError( + "Application builder functions should take exactly one parameter, " + "a dictionary containing the passed arguments." + ) + + # If the sole argument to the builder is a pydantic model, convert the args dict to + # that model. This will perform standard pydantic validation (e.g., raise an + # exception if required fields are missing). 
+ param = signature.parameters[list(signature.parameters.keys())[0]] + if issubclass(type(param.annotation), ModelMetaclass): + args = param.annotation.parse_obj(args) + + app = builder(args) + if not isinstance(app, Application): + raise TypeError( + "Application builder functions must return an `Application` returned " + f"`from `Deployment.bind()`, but got: {type(app)}." + ) + + return app diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py index a89a7db41959..23958d1b87bc 100644 --- a/python/ray/serve/_private/application_state.py +++ b/python/ray/serve/_private/application_state.py @@ -1,5 +1,5 @@ import traceback -from typing import Dict, List +from typing import Dict, List, Optional from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag from ray.serve._private.common import ApplicationStatus from ray.serve._private.deployment_state import DeploymentStateManager @@ -38,29 +38,61 @@ def __init__( deploy_obj_ref: Task ObjRef of deploying application. 
deployment_time: Deployment timestamp """ + + self._name = name + self._deploy_obj_ref = deploy_obj_ref + self._app_msg = "" + self._deployment_state_manager = deployment_state_manager + self._deployment_params: List[Dict] = [] + # This set tracks old deployments that are being deleted + self._deployments_to_delete = set() + self._ready_to_be_deleted = False + self._route_prefix = None + self._docs_path = None + if deploy_obj_ref: - self.status: ApplicationStatus = ApplicationStatus.DEPLOYING + self._status: ApplicationStatus = ApplicationStatus.DEPLOYING else: - self.status: ApplicationStatus = ApplicationStatus.NOT_STARTED - self.name = name - self.deployment_params: List[Dict] = [] - self.ready_to_be_deleted = False - self.deployment_state_manager = deployment_state_manager + self._status: ApplicationStatus = ApplicationStatus.NOT_STARTED if deployment_time: - self.deployment_timestamp = deployment_time + self._deployment_timestamp = deployment_time else: - self.deployment_timestamp = time.time() - self.deploy_obj_ref = deploy_obj_ref - self.app_msg = "" - self.route_prefix = None - self.docs_path = None + self._deployment_timestamp = time.time() - # This set tracks old deployments that are being deleted - self.deployments_to_delete = set() + @property + def ready_to_be_deleted(self) -> bool: + return self._ready_to_be_deleted + + @property + def route_prefix(self) -> Optional[str]: + return self._route_prefix + + @property + def docs_path(self) -> Optional[str]: + return self._docs_path + + @property + def status(self) -> ApplicationStatus: + return self._status + + @property + def deployment_timestamp(self) -> int: + return self._deployment_timestamp + + @property + def deploy_obj_ref(self) -> Optional[ObjectRef]: + return self._deploy_obj_ref + + @property + def deployments(self) -> List[str]: + """Return all deployments name from the application""" + if self._deployment_params is None: + return [] + return [params["name"] for params in 
self._deployment_params] def delete(self): """Delete the application""" - self.status = ApplicationStatus.DELETING + self._status = ApplicationStatus.DELETING def deploy(self, deployment_params: List[Dict]) -> List[str]: """Deploy the application. @@ -76,26 +108,26 @@ def deploy(self, deployment_params: List[Dict]) -> List[str]: # that are not used in the new deployment_params to_be_deployed_deployments = {params["name"] for params in deployment_params} cur_deployments_to_delete = [] - for deployment_name in self.get_all_deployments(): + for deployment_name in self.deployments: if deployment_name not in to_be_deployed_deployments: cur_deployments_to_delete.append(deployment_name) - self.deployments_to_delete.add(deployment_name) - self.deployment_params = deployment_params + self._deployments_to_delete.add(deployment_name) + self._deployment_params = deployment_params # Update route prefix for application num_route_prefixes = 0 num_docs_paths = 0 for deploy_param in deployment_params: if deploy_param.get("route_prefix") is not None: - self.route_prefix = deploy_param["route_prefix"] + self._route_prefix = deploy_param["route_prefix"] num_route_prefixes += 1 if deploy_param.get("docs_path") is not None: - self.docs_path = deploy_param["docs_path"] + self._docs_path = deploy_param["docs_path"] num_docs_paths += 1 if num_route_prefixes > 1: raise RayServeException( - f'Found multiple route prefix from application "{self.name}",' + f'Found multiple route prefix from application "{self._name}",' " Please specify only one route prefix for the application " "to avoid this issue." ) @@ -103,19 +135,19 @@ def deploy(self, deployment_params: List[Dict]) -> List[str]: # if user sets the docs path to None in their FastAPI app. if num_docs_paths > 1: raise RayServeException( - f'Found multiple deployments in application "{self.name}" that have ' + f'Found multiple deployments in application "{self._name}" that have ' "a docs path. 
This may be due to using multiple FastAPI deployments " "in your application. Please only include one deployment with a docs " "path in your application to avoid this issue." ) - self.status = ApplicationStatus.DEPLOYING + self._status = ApplicationStatus.DEPLOYING return cur_deployments_to_delete def update_obj_ref(self, deploy_obj_ref: ObjectRef, deployment_time: int): - self.deploy_obj_ref = deploy_obj_ref - self.deployment_timestamp = deployment_time - self.status = ApplicationStatus.DEPLOYING + self._deploy_obj_ref = deploy_obj_ref + self._deployment_timestamp = deployment_time + self._status = ApplicationStatus.DEPLOYING def _process_terminating_deployments(self): """Update the tracking for all deployments being deleted @@ -123,13 +155,13 @@ def _process_terminating_deployments(self): When a deployment's status is None, the deployment will be removed from application. """ - for name in list(self.deployments_to_delete): - if self.deployment_state_manager.get_deployment(name): + for name in list(self._deployments_to_delete): + if self._deployment_state_manager.get_deployment(name): logger.warning( - f"Deleting deployment {name} from application {self.name}." + f"Deleting deployment {name} from application {self._name}." ) else: - self.deployments_to_delete.remove(name) + self._deployments_to_delete.remove(name) def update(self): """Update the application status, maintain the ApplicationStatus. @@ -141,87 +173,84 @@ def update(self): DELETING: Mark ready_to_be_deleted as True when all deployments are gone. 
""" - if self.ready_to_be_deleted: + if self._ready_to_be_deleted: return - if self.status == ApplicationStatus.DELETING: + if self._status == ApplicationStatus.DELETING: mark_delete = True # Application won't be deleted until all deployments get cleaned up - for name in self.get_all_deployments(): - if self.deployment_state_manager.get_deployment(name): + for name in self.deployments: + if self._deployment_state_manager.get_deployment(name): logger.debug( - f"Deleting deployment {name} from application {self.name}." + f"Deleting deployment {name} from application {self._name}." ) mark_delete = False break - if self.deployments_to_delete: + if self._deployments_to_delete: mark_delete = False - self.ready_to_be_deleted = mark_delete + self._ready_to_be_deleted = mark_delete self._process_terminating_deployments() return - if self.status == ApplicationStatus.DEPLOYING: - if self.deploy_obj_ref: - finished, pending = ray.wait([self.deploy_obj_ref], timeout=0) + if self._status == ApplicationStatus.DEPLOYING: + if self._deploy_obj_ref: + finished, pending = ray.wait([self._deploy_obj_ref], timeout=0) if pending: return + self._deploy_obj_ref = None try: ray.get(finished[0]) - logger.info(f"Deploy task for app '{self.name}' ran successfully.") + logger.info(f"Deploy task for app '{self._name}' ran successfully.") except RayTaskError as e: - self.status = ApplicationStatus.DEPLOY_FAILED + self._status = ApplicationStatus.DEPLOY_FAILED # NOTE(zcin): we should use str(e) instead of traceback.format_exc() # here because the full details of the error is not displayed # properly with traceback.format_exc(). RayTaskError has its own # custom __str__ function. 
- self.app_msg = f"Deploying app '{self.name}' failed:\n{str(e)}" - self.deploy_obj_ref = None - logger.warning(self.app_msg) + self._app_msg = f"Deploying app '{self._name}' failed:\n{str(e)}" + logger.warning(self._app_msg) return except RuntimeEnvSetupError: - self.status = ApplicationStatus.DEPLOY_FAILED - self.app_msg = ( - f"Runtime env setup for app '{self.name}' " + self._status = ApplicationStatus.DEPLOY_FAILED + self._app_msg = ( + f"Runtime env setup for app '{self._name}' " f"failed:\n{traceback.format_exc()}" ) - self.deploy_obj_ref = None - logger.warning(self.app_msg) + logger.warning(self._app_msg) + return + except Exception: + self._status = ApplicationStatus.DEPLOY_FAILED + self._app_msg = ( + "Unexpected error occured while deploying application " + f"'{self._name}':\n{traceback.format_exc()}" + ) + logger.warning(self._app_msg) return deployments_statuses = ( - self.deployment_state_manager.get_deployment_statuses( - self.get_all_deployments() - ) + self._deployment_state_manager.get_deployment_statuses(self.deployments) ) num_health_deployments = 0 for deployment_status in deployments_statuses: if deployment_status.status == DeploymentStatus.UNHEALTHY: - self.status = ApplicationStatus.DEPLOY_FAILED + self._status = ApplicationStatus.DEPLOY_FAILED return if deployment_status.status == DeploymentStatus.HEALTHY: num_health_deployments += 1 if num_health_deployments == len(deployments_statuses): - self.status = ApplicationStatus.RUNNING + self._status = ApplicationStatus.RUNNING self._process_terminating_deployments() - def get_all_deployments(self) -> List[str]: - """Return all deployments name from the application""" - if self.deployment_params is None: - return [] - return [params["name"] for params in self.deployment_params] - def get_deployments_statuses(self) -> List[DeploymentStatusInfo]: """Return all deployment status information""" - return self.deployment_state_manager.get_deployment_statuses( - self.get_all_deployments() - ) + return 
self._deployment_state_manager.get_deployment_statuses(self.deployments) def get_application_status_info(self) -> ApplicationStatusInfo: """Return the application status information""" return ApplicationStatusInfo( - self.status, - message=self.app_msg, - deployment_timestamp=self.deployment_timestamp, + self._status, + message=self._app_msg, + deployment_timestamp=self._deployment_timestamp, ) def list_deployment_details(self) -> Dict[str, DeploymentDetails]: @@ -235,15 +264,15 @@ def list_deployment_details(self) -> Dict[str, DeploymentDetails]: been deleted. """ details = { - name: self.deployment_state_manager.get_deployment_details(name) - for name in self.get_all_deployments() + name: self._deployment_state_manager.get_deployment_details(name) + for name in self.deployments } return {k: v for k, v in details.items() if v is not None} class ApplicationStateManager: def __init__(self, deployment_state_manager): - self.deployment_state_manager = deployment_state_manager + self._deployment_state_manager = deployment_state_manager self._application_states: Dict[str, ApplicationState] = {} def delete_application(self, name: str): @@ -287,7 +316,7 @@ def deploy_application(self, name: str, deployment_args: List[Dict]): if name not in self._application_states: self._application_states[name] = ApplicationState( name, - self.deployment_state_manager, + self._deployment_state_manager, ) record_extra_usage_tag( TagKey.SERVE_NUM_APPS, str(len(self._application_states)) @@ -298,7 +327,7 @@ def get_deployments(self, app_name: str) -> List[str]: """Return all deployment names by app name""" if app_name not in self._application_states: return [] - return self._application_states[app_name].get_all_deployments() + return self._application_states[app_name].deployments def get_deployments_statuses(self, app_name: str) -> List[DeploymentStatusInfo]: """Return all deployment statuses by app name""" @@ -315,6 +344,11 @@ def get_app_status(self, name: str) -> ApplicationStatusInfo: ) 
return self._application_states[name].get_application_status_info() + def get_deployment_timestamp(self, name: str) -> float: + if name not in self._application_states: + return -1 + return self._application_states[name].deployment_timestamp + def get_docs_path(self, app_name: str): return self._application_states[app_name].docs_path @@ -360,16 +394,11 @@ def create_application_state( else: self._application_states[name] = ApplicationState( name, - self.deployment_state_manager, + self._deployment_state_manager, deploy_obj_ref=deploy_obj_ref, deployment_time=deployment_time, ) - def get_deployment_timestamp(self, name: str) -> float: - if name not in self._application_states: - return -1 - return self._application_states[name].deployment_timestamp - def update(self): """Update each application state""" apps_to_be_deleted = [] diff --git a/python/ray/serve/_private/client.py b/python/ray/serve/_private/client.py index 12d07b1e6840..fd4b9630e9aa 100644 --- a/python/ray/serve/_private/client.py +++ b/python/ray/serve/_private/client.py @@ -13,6 +13,7 @@ StatusOverview, ApplicationStatus, DeploymentStatusInfo, + MultiplexedReplicaInfo, ) from ray.serve.config import DeploymentConfig, HTTPOptions from ray.serve._private.constants import ( @@ -102,7 +103,6 @@ def shutdown(self) -> None: # Shut down handles for k in list(self.handle_cache): - self.handle_cache[k].stop_metrics_pusher() del self.handle_cache[k] if ray.is_initialized() and not self._shutdown: @@ -275,6 +275,7 @@ def deploy_application( route_prefix=deployment["route_prefix"], is_driver_deployment=deployment["is_driver_deployment"], docs_path=deployment["docs_path"], + app_name=name, ) ) @@ -456,7 +457,7 @@ def get_handle( cache_key = (deployment_name, missing_ok, sync) if cache_key in self.handle_cache: cached_handle = self.handle_cache[cache_key] - if cached_handle.is_polling and cached_handle.is_same_loop: + if cached_handle._is_polling and cached_handle._is_same_loop: return cached_handle all_endpoints = 
ray.get(self._controller.get_all_endpoints.remote()) @@ -526,3 +527,13 @@ def log_deployment_ready(self, name: str, version: str, url: str, tag: str) -> N f"Deployment '{name}{':'+version if version else ''}' is ready" f"{url_part}. {tag}" ) + + @_ensure_connected + def record_multiplexed_replica_info(self, info: MultiplexedReplicaInfo): + """Record multiplexed replica information for replica. + + Args: + info: MultiplexedReplicaInfo including deployment name, replica tag and + model ids. + """ + self._controller.record_multiplexed_replica_info.remote(info) diff --git a/python/ray/serve/_private/common.py b/python/ray/serve/_private/common.py index 7ccfbce34fa8..b4eefd485528 100644 --- a/python/ray/serve/_private/common.py +++ b/python/ray/serve/_private/common.py @@ -6,7 +6,6 @@ import ray from ray.actor import ActorHandle from ray.serve.config import DeploymentConfig, ReplicaConfig -from ray.serve._private.autoscaling_policy import AutoscalingPolicy from ray.serve.generated.serve_pb2 import ( DeploymentInfo as DeploymentInfoProto, DeploymentStatusInfo as DeploymentStatusInfoProto, @@ -16,16 +15,19 @@ ApplicationStatusInfo as ApplicationStatusInfoProto, StatusOverview as StatusOverviewProto, ) +from ray.serve._private.autoscaling_policy import BasicAutoscalingPolicy EndpointTag = str ReplicaTag = str NodeId = str Duration = float +ApplicationName = str @dataclass class EndpointInfo: route: str + app_name: str # Keep in sync with ServeReplicaState in dashboard/client/src/type/serve.ts @@ -56,15 +58,16 @@ def debug_string(self): def to_proto(self): return ApplicationStatusInfoProto( - status=self.status, + status=f"APPLICATION_STATUS_{self.status}", message=self.message, deployment_timestamp=self.deployment_timestamp, ) @classmethod def from_proto(cls, proto: ApplicationStatusInfoProto): + status = ApplicationStatusProto.Name(proto.status)[len("APPLICATION_STATUS_") :] return cls( - status=ApplicationStatus(ApplicationStatusProto.Name(proto.status)), + 
status=ApplicationStatus(status), message=proto.message, deployment_timestamp=proto.deployment_timestamp, ) @@ -87,14 +90,17 @@ def debug_string(self): def to_proto(self): return DeploymentStatusInfoProto( - name=self.name, status=self.status, message=self.message + name=self.name, + status=f"DEPLOYMENT_STATUS_{self.status}", + message=self.message, ) @classmethod def from_proto(cls, proto: DeploymentStatusInfoProto): + status = DeploymentStatusProto.Name(proto.status)[len("DEPLOYMENT_STATUS_") :] return cls( name=proto.name, - status=DeploymentStatus(DeploymentStatusProto.Name(proto.status)), + status=DeploymentStatus(status), message=proto.message, ) @@ -182,8 +188,9 @@ def __init__( actor_name: Optional[str] = None, version: Optional[str] = None, end_time_ms: Optional[int] = None, - autoscaling_policy: Optional[AutoscalingPolicy] = None, is_driver_deployment: Optional[bool] = False, + app_name: Optional[str] = None, + route_prefix: str = None, ): self.deployment_config = deployment_config self.replica_config = replica_config @@ -194,13 +201,25 @@ def __init__( self.deployer_job_id = deployer_job_id # The time when this deployment was deleted. self.end_time_ms = end_time_ms - self.autoscaling_policy = autoscaling_policy # ephermal state self._cached_actor_def = None self.is_driver_deployment = is_driver_deployment + self.app_name = app_name + self.route_prefix = route_prefix + if deployment_config.autoscaling_config is not None: + self.autoscaling_policy = BasicAutoscalingPolicy( + deployment_config.autoscaling_config + ) + else: + self.autoscaling_policy = None + # Num replicas decided by the autoscaling policy. This is mutually exclusive + # from deployment_config.num_replicas. 
This value is updated through + # set_autoscaled_num_replicas() + self.autoscaled_num_replicas = None + def __getstate__(self) -> Dict[Any, Any]: clean_dict = self.__dict__.copy() del clean_dict["_cached_actor_def"] @@ -210,6 +229,9 @@ def __setstate__(self, d: Dict[Any, Any]) -> None: self.__dict__ = d self._cached_actor_def = None + def set_autoscaled_num_replicas(self, autoscaled_num_replicas): + self.autoscaled_num_replicas = autoscaled_num_replicas + @property def actor_def(self): # Delayed import as replica depends on this file. @@ -242,6 +264,7 @@ def from_proto(cls, proto: DeploymentInfoProto): "version": proto.version if proto.version != "" else None, "end_time_ms": proto.end_time_ms if proto.end_time_ms != 0 else None, "deployer_job_id": ray.get_runtime_context().get_job_id(), + "app_name": proto.app_name, } return cls(**data) @@ -252,6 +275,7 @@ def to_proto(self): "actor_name": self.actor_name, "version": self.version, "end_time_ms": self.end_time_ms, + "app_name": self.app_name, } if self.deployment_config: data["deployment_config"] = self.deployment_config.to_proto() @@ -301,6 +325,7 @@ class RunningReplicaInfo: actor_handle: ActorHandle max_concurrent_queries: int is_cross_language: bool = False + multiplexed_model_ids: List[str] = field(default_factory=list) def __post_init__(self): # Set hash value when object is constructed. 
@@ -317,6 +342,7 @@ def __post_init__(self): str(self.actor_handle._actor_id), str(self.max_concurrent_queries), str(self.is_cross_language), + str(self.multiplexed_model_ids), ] ) ) @@ -341,3 +367,22 @@ class ServeDeployMode(str, Enum): UNSET = "UNSET" SINGLE_APP = "SINGLE_APP" MULTI_APP = "MULTI_APP" + + +# Keep in sync with ServeHTTPProxyStatus in +# python/ray/dashboard/client/src/type/serve.ts +class HTTPProxyStatus(str, Enum): + STARTING = "STARTING" + HEALTHY = "HEALTHY" + UNHEALTHY = "UNHEALTHY" + + +class ServeComponentType(str, Enum): + DEPLOYMENT = "deployment" + + +@dataclass +class MultiplexedReplicaInfo: + deployment_name: str + replica_tag: str + model_ids: List[str] diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py index 651c4e12c698..5c21943b0e3b 100644 --- a/python/ray/serve/_private/constants.py +++ b/python/ray/serve/_private/constants.py @@ -29,7 +29,7 @@ DEFAULT_GRPC_PORT = 9000 #: Default Serve application name -SERVE_DEFAULT_APP_NAME = "" +SERVE_DEFAULT_APP_NAME = "default" #: Separator between app name and deployment name when we prepend #: the app name to each deployment name. This prepending is currently @@ -62,10 +62,23 @@ 50, 100, 200, + 300, + 400, 500, 1000, 2000, + # 5 seconds 5000, + # 10 seconds + 10000, + # 60 seconds + 60000, + # 2min + 120000, + # 5 min + 300000, + # 10 min + 600000, ] #: Name of deployment health check method implemented by user. @@ -92,6 +105,9 @@ DEFAULT_HEALTH_CHECK_PERIOD_S = 10 DEFAULT_HEALTH_CHECK_TIMEOUT_S = 30 +# HTTP Proxy health check period +PROXY_HEALTH_CHECK_PERIOD_S = 10 + #: Number of times in a row that a replica must fail the health check before #: being marked unhealthy. REPLICA_HEALTH_CHECK_UNHEALTHY_THRESHOLD = 3 @@ -120,6 +136,13 @@ # Env var to control legacy sync deployment handle behavior in DAG. 
SYNC_HANDLE_IN_DAG_FEATURE_FLAG_ENV_KEY = "SERVE_DEPLOYMENT_HANDLE_IS_SYNC" +# Maximum duration to wait until broadcasting a long poll update if there are +# still replicas in the RECOVERING state. +RECOVERING_LONG_POLL_BROADCAST_TIMEOUT_S = 10.0 + +# Minimum duration to wait until broadcasting model IDs. +PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S = 1.0 + class ServeHandleType(str, Enum): SYNC = "SYNC" @@ -140,3 +163,28 @@ class ServeHandleType(str, Enum): "Please see the documentation for ServeDeploySchema for more details on multi-app " "config files." ) + +# Jsonify the log messages +RAY_SERVE_ENABLE_JSON_LOGGING = os.environ.get("RAY_SERVE_ENABLE_JSON_LOGGING") == "1" +# Logging format attributes +SERVE_LOG_REQUEST_ID = "request_id" +SERVE_LOG_ROUTE = "route" +SERVE_LOG_APPLICATION = "application" +SERVE_LOG_DEPLOYMENT = "deployment" +SERVE_LOG_REPLICA = "replica" +SERVE_LOG_COMPONENT = "component_name" +SERVE_LOG_COMPONENT_ID = "component_id" +SERVE_LOG_MESSAGE = "message" +# This is a reserved for python logging module attribute, it should not be changed. 
+SERVE_LOG_LEVEL_NAME = "levelname" +SERVE_LOG_TIME = "asctime" + +# Logging format with record key to format string dict +SERVE_LOG_RECORD_FORMAT = { + SERVE_LOG_REQUEST_ID: "%(request_id)s", + SERVE_LOG_ROUTE: "%(route)s", + SERVE_LOG_APPLICATION: "%(application)s", + SERVE_LOG_MESSAGE: "%(filename)s:%(lineno)d - %(message)s", + SERVE_LOG_LEVEL_NAME: "%(levelname)s", + SERVE_LOG_TIME: "%(asctime)s", +} diff --git a/python/ray/serve/_private/deploy_utils.py b/python/ray/serve/_private/deploy_utils.py index 19b096a63f14..b36b6395a3fb 100644 --- a/python/ray/serve/_private/deploy_utils.py +++ b/python/ray/serve/_private/deploy_utils.py @@ -1,10 +1,12 @@ from typing import Dict, Tuple, Union, Callable, Type, Optional, Any +import hashlib +import json import logging import time from ray.serve.config import ReplicaConfig, DeploymentConfig +from ray.serve.schema import ServeApplicationSchema from ray.serve._private.constants import SERVE_LOGGER_NAME -from ray.serve._private.autoscaling_policy import BasicAutoscalingPolicy from ray.serve._private.common import DeploymentInfo import ray @@ -24,6 +26,7 @@ def get_deploy_args( route_prefix: Optional[str] = None, is_driver_deployment: Optional[str] = None, docs_path: Optional[str] = None, + app_name: Optional[str] = None, ) -> Dict: """ Takes a deployment's configuration, and returns the arguments needed @@ -80,6 +83,7 @@ def get_deploy_args( "deployer_job_id": ray.get_runtime_context().get_job_id(), "is_driver_deployment": is_driver_deployment, "docs_path": docs_path, + "app_name": app_name, } return controller_deploy_args @@ -90,8 +94,9 @@ def deploy_args_to_deployment_info( deployment_config_proto_bytes: bytes, replica_config_proto_bytes: bytes, deployer_job_id: Union[str, bytes], - previous_deployment: DeploymentInfo, + route_prefix: Optional[str], is_driver_deployment: Optional[bool] = False, + app_name: Optional[str] = None, ) -> DeploymentInfo: """Takes deployment args passed to the controller after building an 
application and constructs a DeploymentInfo object. @@ -103,22 +108,6 @@ def deploy_args_to_deployment_info( replica_config_proto_bytes, deployment_config.needs_pickle() ) - autoscaling_config = deployment_config.autoscaling_config - if autoscaling_config is not None: - if autoscaling_config.initial_replicas is not None: - deployment_config.num_replicas = autoscaling_config.initial_replicas - else: - if previous_deployment is None: - deployment_config.num_replicas = autoscaling_config.min_replicas - else: - deployment_config.num_replicas = ( - previous_deployment.deployment_config.num_replicas - ) - - autoscaling_policy = BasicAutoscalingPolicy(autoscaling_config) - else: - autoscaling_policy = None - # Java API passes in JobID as bytes if isinstance(deployer_job_id, bytes): deployer_job_id = ray.JobID.from_int( @@ -132,6 +121,26 @@ def deploy_args_to_deployment_info( replica_config=replica_config, deployer_job_id=deployer_job_id, start_time_ms=int(time.time() * 1000), - autoscaling_policy=autoscaling_policy, is_driver_deployment=is_driver_deployment, + app_name=app_name, + route_prefix=route_prefix, ) + + +def get_app_code_version(app_config: ServeApplicationSchema) -> str: + """Returns the code version of an application. + + Args: + app_config: The application config. + + Returns: a hash of the import path and (application level) runtime env representing + the code version of the application. 
+ """ + encoded = json.dumps( + { + "import_path": app_config.import_path, + "runtime_env": app_config.runtime_env, + }, + sort_keys=True, + ).encode("utf-8") + return hashlib.md5(encoded).hexdigest() diff --git a/python/ray/serve/_private/deployment_function_node.py b/python/ray/serve/_private/deployment_function_node.py index be47328d80c1..c53f816e7211 100644 --- a/python/ray/serve/_private/deployment_function_node.py +++ b/python/ray/serve/_private/deployment_function_node.py @@ -1,4 +1,3 @@ -import inspect from typing import Any, Callable, Dict, List, Union from ray.dag.dag_node import DAGNode @@ -35,13 +34,6 @@ def __init__( ] deployment_shell = schema_to_deployment(deployment_schema) - # Prefer user specified name to override the generated one. - if ( - inspect.isfunction(func_body) - and deployment_shell.name != func_body.__name__ - ): - self._deployment_name = deployment_shell.name - # Set the route prefix, prefer the one user supplied, # otherwise set it to /deployment_name if ( diff --git a/python/ray/serve/_private/deployment_graph_build.py b/python/ray/serve/_private/deployment_graph_build.py index 6ce515936c15..d06c4a05442f 100644 --- a/python/ray/serve/_private/deployment_graph_build.py +++ b/python/ray/serve/_private/deployment_graph_build.py @@ -66,7 +66,8 @@ def build(ray_dag_root_node: DAGNode, name: str = None) -> List[Deployment]: should be executable via `ray_dag_root_node.execute(user_input)` and should have `InputNode` in it. name: Application name,. If provided, formatting all the deployment name to - {name}_{deployment_name} + {name}_{deployment_name}, if not provided, the deployment name won't be + updated. Returns: deployments: All deployments needed for an e2e runnable serve pipeline, @@ -93,6 +94,15 @@ def build(ray_dag_root_node: DAGNode, name: str = None) -> List[Deployment]: ) deployments = extract_deployments_from_serve_dag(serve_root_dag) + # If the ingress deployment is a function and it is bound to other deployments, + # reject. 
+ if isinstance(serve_root_dag, DeploymentFunctionNode) and len(deployments) != 1: + raise ValueError( + "The ingress deployment to your application cannot be a function if there " + "are multiple deployments. If you want to compose them, use a class. If " + "you're using the DAG API, the function should be bound to a DAGDriver." + ) + # After Ray DAG is transformed to Serve DAG with deployments and their init # args filled, generate a minimal weight executor serve dag for perf serve_executor_root_dag = serve_root_dag.apply_recursive( @@ -264,6 +274,16 @@ def replace_with_handle(node): dag_node._body.__annotations__["return"] ) + # Set the deployment name if the user provides. + if "deployment_schema" in dag_node._bound_other_args_to_resolve: + schema = dag_node._bound_other_args_to_resolve["deployment_schema"] + if ( + inspect.isfunction(dag_node._body) + and schema.name != dag_node._body.__name__ + ): + deployment_name = schema.name + + # Update the deployment name if the application name provided. 
if name: deployment_name = name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment_name diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index 0a39eabf0363..f6eea3b4992c 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -19,7 +19,8 @@ record_extra_usage_tag, ) from ray.actor import ActorHandle -from ray.exceptions import RayActorError, RayError +from ray.exceptions import RayActorError, RayError, RayTaskError + from ray.serve._private.autoscaling_metrics import InMemoryMetricsStore from ray.serve._private.common import ( DeploymentInfo, @@ -30,6 +31,7 @@ ReplicaTag, RunningReplicaInfo, ReplicaState, + MultiplexedReplicaInfo, ) from ray.serve.schema import ( DeploymentDetails, @@ -57,7 +59,7 @@ ) from ray.serve._private.version import DeploymentVersion, VersionedReplica -from ray.util import metrics +from ray.serve import metrics from ray._raylet import GcsClient from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy @@ -97,10 +99,16 @@ def from_deployment_info( num_replicas = 0 version = None else: - num_replicas = info.deployment_config.num_replicas + # If autoscaling config is not none, num replicas should be decided based on + # the autoscaling policy and passed in as autoscaled_num_replicas + if info.autoscaled_num_replicas is not None: + num_replicas = info.autoscaled_num_replicas + else: + num_replicas = info.deployment_config.num_replicas version = DeploymentVersion( info.version, - user_config=info.deployment_config.user_config, + deployment_config=info.deployment_config, + ray_actor_options=info.replica_config.ray_actor_options, ) return cls(info, num_replicas, version, deleting) @@ -199,11 +207,8 @@ def __init__( self._ready_obj_ref: ObjectRef = None self._actor_resources: Dict[str, float] = None - self._max_concurrent_queries: int = None - self._graceful_shutdown_timeout_s: float = 0.0 + self._version: 
DeploymentVersion = None self._healthy: bool = True - self._health_check_period_s: float = 0.0 - self._health_check_timeout_s: float = 0.0 self._health_check_ref: Optional[ObjectRef] = None self._last_health_check_time: float = 0.0 self._consecutive_health_check_failures = 0 @@ -213,12 +218,14 @@ def __init__( self._pid: int = None self._actor_id: str = None + self._worker_id: str = None if isinstance(scheduling_strategy, NodeAffinitySchedulingStrategy): self._node_id = scheduling_strategy.node_id else: # Populated after replica is allocated. self._node_id: str = None self._node_ip: str = None + self._log_file_path: str = None # Populated in self.stop(). self._graceful_shutdown_ref: ObjectRef = None @@ -258,8 +265,33 @@ def actor_handle(self) -> Optional[ActorHandle]: return self._actor_handle @property - def max_concurrent_queries(self) -> int: - return self._max_concurrent_queries + def version(self) -> Optional[DeploymentVersion]: + return self._version + + @property + def deployment_config(self) -> Optional[DeploymentConfig]: + if self._version: + return self._version.deployment_config + + @property + def max_concurrent_queries(self) -> Optional[int]: + if self.deployment_config: + return self.deployment_config.max_concurrent_queries + + @property + def graceful_shutdown_timeout_s(self) -> Optional[float]: + if self.deployment_config: + return self.deployment_config.graceful_shutdown_timeout_s + + @property + def health_check_period_s(self) -> Optional[float]: + if self.deployment_config: + return self.deployment_config.health_check_period_s + + @property + def health_check_timeout_s(self) -> Optional[float]: + if self.deployment_config: + return self.deployment_config.health_check_timeout_s @property def pid(self) -> Optional[int]: @@ -271,6 +303,11 @@ def actor_id(self) -> Optional[str]: """Returns the actor id, None if not started.""" return self._actor_id + @property + def worker_id(self) -> Optional[str]: + """Returns the worker id, None if not started.""" 
+ return self._worker_id + @property def node_id(self) -> Optional[str]: """Returns the node id of the actor, None if not placed.""" @@ -281,6 +318,11 @@ def node_ip(self) -> Optional[str]: """Returns the node ip of the actor, None if not placed.""" return self._node_ip + @property + def log_file_path(self) -> Optional[str]: + """Returns the relative log file path of the actor, None if not placed.""" + return self._log_file_path + def _check_obj_ref_ready(self, obj_ref: ObjectRef) -> bool: ready, _ = ray.wait([obj_ref], timeout=0) return len(ready) == 1 @@ -289,18 +331,7 @@ def start(self, deployment_info: DeploymentInfo, version: DeploymentVersion): """ Start a new actor for current DeploymentReplica instance. """ - self._max_concurrent_queries = ( - deployment_info.deployment_config.max_concurrent_queries - ) - self._graceful_shutdown_timeout_s = ( - deployment_info.deployment_config.graceful_shutdown_timeout_s - ) - self._health_check_period_s = ( - deployment_info.deployment_config.health_check_period_s - ) - self._health_check_timeout_s = ( - deployment_info.deployment_config.health_check_timeout_s - ) + self._version = version self._actor_resources = deployment_info.replica_config.resource_dict # it is currently not possible to create a placement group @@ -346,6 +377,7 @@ def start(self, deployment_info: DeploymentInfo, version: DeploymentVersion): version, self._controller_name, self._detached, + deployment_info.app_name, ) # TODO(simon): unify the constructor arguments across language elif ( @@ -389,17 +421,20 @@ def start(self, deployment_info: DeploymentInfo, version: DeploymentVersion): # Perform auto method name translation for java handles. 
# See https://github.com/ray-project/ray/issues/21474 - user_config = self._format_user_config( - deployment_info.deployment_config.user_config + deployment_config = copy(deployment_info.deployment_config) + deployment_config.user_config = self._format_user_config( + deployment_config.user_config ) if self._is_cross_language: self._actor_handle = JavaActorHandleProxy(self._actor_handle) self._allocated_obj_ref = self._actor_handle.is_allocated.remote() - self._ready_obj_ref = self._actor_handle.is_initialized.remote(user_config) + self._ready_obj_ref = self._actor_handle.is_initialized.remote( + deployment_config.to_proto_bytes() + ) else: self._allocated_obj_ref = self._actor_handle.is_allocated.remote() self._ready_obj_ref = self._actor_handle.is_initialized.remote( - user_config, + deployment_config, # Ensure that `is_allocated` will execute before `reconfigure`, # because `reconfigure` runs user code that could block the replica # asyncio loop. If that happens before `is_allocated` is executed, @@ -416,14 +451,28 @@ def _format_user_config(self, user_config: Any): temp = msgpack_deserialize(temp) return temp - def update_user_config(self, user_config: Any): + def reconfigure(self, version: DeploymentVersion) -> bool: """ - Update user config of existing actor behind current - DeploymentReplica instance. + Update replica version. Also, updates the deployment config on the actor + behind this DeploymentReplica instance if necessary. + + Returns: whether the actor is being updated. 
""" - self._ready_obj_ref = self._actor_handle.reconfigure.remote( - self._format_user_config(user_config) - ) + updating = False + if self._version.requires_actor_reconfigure(version): + # Call into replica actor reconfigure() with updated user config and + # graceful_shutdown_wait_loop_s + updating = True + deployment_config = copy(version.deployment_config) + deployment_config.user_config = self._format_user_config( + deployment_config.user_config + ) + self._ready_obj_ref = self._actor_handle.reconfigure.remote( + deployment_config + ) + + self._version = version + return updating def recover(self): """ @@ -446,7 +495,7 @@ def recover(self): else: self._ready_obj_ref = self._actor_handle.get_metadata.remote() - def check_ready(self) -> Tuple[ReplicaStartupStatus, Optional[DeploymentVersion]]: + def check_ready(self) -> Tuple[ReplicaStartupStatus, Optional[str]]: """ Check if current replica has started by making ray API calls on relevant actor / object ref. @@ -463,11 +512,11 @@ def check_ready(self) -> Tuple[ReplicaStartupStatus, Optional[DeploymentVersion] - replica initialization failed. SUCCEEDED: - replica initialization succeeded. - version: + error_msg: None: - - for PENDING_ALLOCATION, PENDING_INITIALIZATION, or FAILED states - version: - - for SUCCEEDED state + - for PENDING_ALLOCATION, PENDING_INITIALIZATION or SUCCEEDED states + str: + - for FAILED state """ # Check whether the replica has been allocated. 
@@ -486,28 +535,36 @@ def check_ready(self) -> Tuple[ReplicaStartupStatus, Optional[DeploymentVersion] if self._is_cross_language: return ReplicaStartupStatus.SUCCEEDED, None - deployment_config, version = ray.get(self._ready_obj_ref) - self._max_concurrent_queries = deployment_config.max_concurrent_queries - self._graceful_shutdown_timeout_s = ( - deployment_config.graceful_shutdown_timeout_s - ) - self._health_check_period_s = deployment_config.health_check_period_s - self._health_check_timeout_s = deployment_config.health_check_timeout_s - self._pid, self._actor_id, self._node_id, self._node_ip = ray.get( - self._allocated_obj_ref + # todo: The replica's userconfig whitch java client created + # is different from the controller's userconfig + if not self._deployment_is_cross_language: + _, self._version = ray.get(self._ready_obj_ref) + + ( + self._pid, + self._actor_id, + self._worker_id, + self._node_id, + self._node_ip, + self._log_file_path, + ) = ray.get(self._allocated_obj_ref) + except RayTaskError as e: + logger.exception( + f"Exception in replica '{self._replica_tag}', " + "the replica will be stopped." ) - except Exception: + # NOTE(zcin): we should use str(e) instead of traceback.format_exc() + # here because the full details of the error is not displayed properly + # with traceback.format_exc(). + return ReplicaStartupStatus.FAILED, str(e.as_instanceof_cause()) + except Exception as e: logger.exception( f"Exception in replica '{self._replica_tag}', " "the replica will be stopped." 
) - return ReplicaStartupStatus.FAILED, None - if self._deployment_is_cross_language: - # todo: The replica's userconfig whitch java client created - # is different from the controller's userconfig - return ReplicaStartupStatus.SUCCEEDED, None - else: - return ReplicaStartupStatus.SUCCEEDED, version + return ReplicaStartupStatus.FAILED, repr(e) + + return ReplicaStartupStatus.SUCCEEDED, None @property def actor_resources(self) -> Optional[Dict[str, float]]: @@ -528,7 +585,7 @@ def graceful_stop(self) -> Duration: except ValueError: pass - return self._graceful_shutdown_timeout_s + return self.graceful_shutdown_timeout_s def check_stopped(self) -> bool: """Check if the actor has exited.""" @@ -575,12 +632,12 @@ def _check_active_health_check(self) -> ReplicaHealthCheckResponse: f"Health check for replica {self._replica_tag} failed: {e}" ) response = ReplicaHealthCheckResponse.APP_FAILURE - elif time.time() - self._last_health_check_time > self._health_check_timeout_s: + elif time.time() - self._last_health_check_time > self.health_check_timeout_s: # Health check hasn't returned and the timeout is up, consider it failed. logger.warning( "Didn't receive health check response for replica " f"{self._replica_tag} after " - f"{self._health_check_timeout_s}s, marking it unhealthy." + f"{self.health_check_timeout_s}s, marking it unhealthy." ) response = ReplicaHealthCheckResponse.APP_FAILURE else: @@ -597,7 +654,7 @@ def _should_start_new_health_check(self) -> bool: A health check will be started if: 1) There is not already an active health check. - 2) It has been more than self._health_check_period_s since the + 2) It has been more than health_check_period_s since the previous health check was *started*. This assumes that self._health_check_ref is reset to `None` when an @@ -612,7 +669,7 @@ def _should_start_new_health_check(self) -> bool: # check. Add some randomness to avoid synchronizing across all # replicas. 
time_since_last = time.time() - self._last_health_check_time - randomized_period = self._health_check_period_s * random.uniform(0.9, 1.1) + randomized_period = self.health_check_period_s * random.uniform(0.9, 1.1) return time_since_last > randomized_period def check_health(self) -> bool: @@ -644,7 +701,7 @@ def check_health(self) -> bool: ): logger.warning( f"Replica {self._replica_tag} failed the health " - f"check {self._consecutive_health_check_failures}" + f"check {self._consecutive_health_check_failures} " "times in a row, marking it unhealthy." ) self._healthy = False @@ -701,9 +758,15 @@ def __init__( self._controller_name = controller_name self._deployment_name = deployment_name self._replica_tag = replica_tag - self._version = version self._start_time = None self._prev_slow_startup_warning_time = None + self._actor_details = ReplicaDetails( + actor_name=self._actor._actor_name, + replica_id=self._replica_tag, + state=ReplicaState.STARTING, + start_time_s=0, + ) + self._multiplexed_model_ids: List = [] def get_running_replica_info(self) -> RunningReplicaInfo: return RunningReplicaInfo( @@ -712,25 +775,20 @@ def get_running_replica_info(self) -> RunningReplicaInfo: actor_handle=self._actor.actor_handle, max_concurrent_queries=self._actor.max_concurrent_queries, is_cross_language=self._actor.is_cross_language, + multiplexed_model_ids=self.multiplexed_model_ids, ) - def get_replica_details(self, state: ReplicaState) -> ReplicaDetails: - """Get replica details. 
+ def record_multiplexed_model_ids(self, multiplexed_model_ids: List[str]): + """Record the multiplexed model ids for this replica.""" + self._multiplexed_model_ids = multiplexed_model_ids - Args: - state: The state of the replica, which is not stored within a - DeploymentReplica object - """ - return ReplicaDetails( - replica_id=self.replica_tag, - state=state, - pid=self._actor.pid, - actor_name=self._actor._actor_name, - actor_id=self._actor.actor_id, - node_id=self._actor.node_id, - node_ip=self._actor.node_ip, - start_time_s=self._start_time, - ) + @property + def multiplexed_model_ids(self) -> List[str]: + return self._multiplexed_model_ids + + @property + def actor_details(self) -> ReplicaDetails: + return self._actor_details @property def replica_tag(self) -> ReplicaTag: @@ -742,7 +800,7 @@ def deployment_name(self) -> str: @property def version(self): - return self._version + return self._actor.version @property def actor_handle(self) -> ActorHandle: @@ -760,17 +818,16 @@ def start(self, deployment_info: DeploymentInfo, version: DeploymentVersion): self._actor.start(deployment_info, version) self._start_time = time.time() self._prev_slow_startup_warning_time = time.time() - self._version = version + self.update_actor_details(start_time_s=self._start_time) - def update_user_config(self, user_config: Any): + def reconfigure(self, version: DeploymentVersion) -> bool: """ - Update user config of existing actor behind current - DeploymentReplica instance. + Update replica version. Also, updates the deployment config on the actor + behind this DeploymentReplica instance if necessary. + + Returns: whether the actor is being updated. 
""" - self._actor.update_user_config(user_config) - self._version = DeploymentVersion( - self._version.code_version, user_config=user_config - ) + return self._actor.reconfigure(version) def recover(self): """ @@ -779,10 +836,9 @@ def recover(self): """ self._actor.recover() self._start_time = time.time() - # Replica version is fetched from recovered replica dynamically in - # check_started() below + self.update_actor_details(start_time_s=self._start_time) - def check_started(self) -> ReplicaStartupStatus: + def check_started(self) -> Tuple[ReplicaStartupStatus, Optional[str]]: """Check if the replica has started. If so, transition to RUNNING. Should handle the case where the replica has already stopped. @@ -791,15 +847,16 @@ def check_started(self) -> ReplicaStartupStatus: status: Most recent state of replica by querying actor obj ref """ - status, version = self._actor.check_ready() - - if status == ReplicaStartupStatus.SUCCEEDED: - # Re-assign DeploymentVersion if start / update / recover succeeded - # by reading re-computed version in RayServeReplica - if version is not None: - self._version = version - - return status + is_ready = self._actor.check_ready() + self.update_actor_details( + pid=self._actor.pid, + node_id=self._actor.node_id, + node_ip=self._actor.node_ip, + actor_id=self._actor.actor_id, + worker_id=self._actor.worker_id, + log_file_path=self._actor.log_file_path, + ) + return is_ready def stop(self, graceful: bool = True) -> None: """Stop the replica. 
@@ -840,6 +897,15 @@ def check_health(self) -> bool: """ return self._actor.check_health() + def update_state(self, state: ReplicaState) -> None: + """Updates state in actor details.""" + self.update_actor_details(state=state) + + def update_actor_details(self, **kwargs) -> None: + details_kwargs = self._actor_details.dict() + details_kwargs.update(kwargs) + self._actor_details = ReplicaDetails(**details_kwargs) + def resource_requirements(self) -> Tuple[str, str]: """Returns required and currently available resources. @@ -881,6 +947,7 @@ def add(self, state: ReplicaState, replica: VersionedReplica): """ assert isinstance(state, ReplicaState) assert isinstance(replica, VersionedReplica) + replica.update_state(state) self._replicas[state].append(replica) def get( @@ -1032,6 +1099,7 @@ def __init__( self._last_retry: float = 0.0 self._backoff_time_s: int = 1 self._replica_constructor_retry_counter: int = 0 + self._replica_constructor_error_msg: Optional[str] = None self._replicas: ReplicaStateContainer = ReplicaStateContainer() self._curr_status_info: DeploymentStatusInfo = DeploymentStatusInfo( self._name, DeploymentStatus.UPDATING @@ -1043,9 +1111,13 @@ def __init__( "Tracks whether this deployment replica is healthy. 1 means " "healthy, 0 means unhealthy." ), - tag_keys=("deployment", "replica"), + tag_keys=("deployment", "replica", "application"), ) + # Whether the multiplexed model ids have been updated since the last + # time we checked. 
+ self._multiplexed_model_ids_updated = False + def should_autoscale(self) -> bool: """ Check if the deployment is under autoscaling @@ -1115,6 +1187,12 @@ def target_info(self) -> DeploymentInfo: def curr_status_info(self) -> DeploymentStatusInfo: return self._curr_status_info + @property + def app_name(self) -> str: + if self.target_info.app_name: + return self.target_info.app_name + return "" + def get_running_replica_infos(self) -> List[RunningReplicaInfo]: return [ replica.get_running_replica_info() @@ -1122,11 +1200,7 @@ def get_running_replica_infos(self) -> List[RunningReplicaInfo]: ] def list_replica_details(self) -> List[ReplicaDetails]: - return [ - replica.get_replica_details(state) - for state in ReplicaState - for replica in self._replicas.get([state]) - ] + return [replica.actor_details for replica in self._replicas.get()] def _notify_running_replicas_changed(self): self._long_poll_host.notify_changed( @@ -1158,6 +1232,23 @@ def _set_target_state(self, target_info: DeploymentInfo) -> None: target_state = DeploymentTargetState.from_deployment_info(target_info) self._save_checkpoint_func(writeahead_checkpoints={self._name: target_state}) + if self._target_state.version == target_state.version: + # Record either num replica or autoscaling config lightweight update + if ( + self._target_state.version.deployment_config.autoscaling_config + != target_state.version.deployment_config.autoscaling_config + ): + record_extra_usage_tag( + TagKey.SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED, "True" + ) + elif ( + self._target_state.version.deployment_config.num_replicas + != target_state.version.deployment_config.num_replicas + ): + record_extra_usage_tag( + TagKey.SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED, "True" + ) + self._target_state = target_state self._curr_status_info = DeploymentStatusInfo( self._name, DeploymentStatus.UPDATING @@ -1180,15 +1271,31 @@ def deploy(self, deployment_info: DeploymentInfo) -> bool: existing_info = self._target_state.info if 
existing_info is not None: # Redeploying should not reset the deployment's start time. - deployment_info.start_time_ms = existing_info.start_time_ms + if not self._target_state.deleting: + deployment_info.start_time_ms = existing_info.start_time_ms if ( - existing_info.deployment_config == deployment_info.deployment_config + not self._target_state.deleting + and existing_info.deployment_config == deployment_info.deployment_config + and existing_info.replica_config.ray_actor_options + == deployment_info.replica_config.ray_actor_options and deployment_info.version is not None and existing_info.version == deployment_info.version ): return False + # If autoscaling config is not none, decide initial num replicas + autoscaling_config = deployment_info.deployment_config.autoscaling_config + if autoscaling_config is not None: + if autoscaling_config.initial_replicas is not None: + autoscaled_num_replicas = autoscaling_config.initial_replicas + else: + if existing_info is not None: + autoscaled_num_replicas = self._target_state.num_replicas + else: + autoscaled_num_replicas = autoscaling_config.min_replicas + deployment_info.set_autoscaled_num_replicas(autoscaled_num_replicas) + self._set_target_state(deployment_info) return True @@ -1213,15 +1320,15 @@ def autoscale( curr_info = self._target_state.info autoscaling_policy = self._target_state.info.autoscaling_policy decision_num_replicas = autoscaling_policy.get_decision_num_replicas( - curr_target_num_replicas=curr_info.deployment_config.num_replicas, + curr_target_num_replicas=self._target_state.num_replicas, current_num_ongoing_requests=current_num_ongoing_requests, current_handle_queued_queries=current_handle_queued_queries, ) - if decision_num_replicas == curr_info.deployment_config.num_replicas: + if decision_num_replicas == self._target_state.num_replicas: return new_config = copy(curr_info) - new_config.deployment_config.num_replicas = decision_num_replicas + 
new_config.set_autoscaled_num_replicas(decision_num_replicas) if new_config.version is None: new_config.version = self._target_state.version.code_version @@ -1230,8 +1337,11 @@ def autoscale( def delete(self) -> None: self._set_target_state_deleting() - def _stop_wrong_version_replicas(self, max_to_stop=math.inf) -> int: - """Stop the replicas with outdated versions + def _stop_or_update_outdated_version_replicas(self, max_to_stop=math.inf) -> int: + """Stop or update replicas with outdated versions. + + Stop replicas with versions that require the actor to be restarted, and + reconfigure replicas that require refreshing deployment config values. Args: max_to_stop: max number of replicas to stop, by default, @@ -1243,33 +1353,34 @@ def _stop_wrong_version_replicas(self, max_to_stop=math.inf) -> int: max_replicas=max_to_stop, ranking_function=rank_replicas_for_stopping, ) - replicas_stopped = False + replicas_changed = False code_version_changes = 0 - user_config_changes = 0 + reconfigure_changes = 0 for replica in replicas_to_update: - # If the code version is a mismatch, we stop the replica. A new one - # with the correct version will be started later as part of the + # If the new version requires the actors to be restarted, stop the replica. + # A new one with the correct version will be started later as part of the # normal scale-up process. - if replica.version.code_version != self._target_state.version.code_version: + if replica.version.requires_actor_restart(self._target_state.version): code_version_changes += 1 - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) - replicas_stopped = True - # If only the user_config is a mismatch, we update it dynamically - # without restarting the replica. 
- elif ( - replica.version.user_config_hash - != self._target_state.version.user_config_hash - ): - user_config_changes += 1 - replica.update_user_config(self._target_state.version.user_config) - self._replicas.add(ReplicaState.UPDATING, replica) + self._stop_replica(replica) + replicas_changed = True + # Otherwise, only lightweight options in deployment config is a mismatch, so + # we update it dynamically without restarting the replica. + else: + reconfigure_changes += 1 + if replica.version.requires_long_poll_broadcast( + self._target_state.version + ): + replicas_changed = True + actor_updating = replica.reconfigure(self._target_state.version) + if actor_updating: + self._replicas.add(ReplicaState.UPDATING, replica) + else: + self._replicas.add(ReplicaState.RUNNING, replica) logger.debug( "Adding UPDATING to replica_tag: " f"{replica.replica_tag}, deployment_name: {self._name}" ) - else: - assert False, "Update must be code version or user config." if code_version_changes > 0: logger.info( @@ -1277,14 +1388,15 @@ def _stop_wrong_version_replicas(self, max_to_stop=math.inf) -> int: f"deployment '{self._name}' with outdated versions." ) - if user_config_changes > 0: + if reconfigure_changes > 0: logger.info( - f"Updating {user_config_changes} replicas of " - f"deployment '{self._name}' with outdated " - f"user_configs." + f"Updating {reconfigure_changes} replicas of deployment '{self._name}' " + "with outdated deployment configs." ) + # Record user config lightweight update + record_extra_usage_tag(TagKey.SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED, "True") - return replicas_stopped + return replicas_changed def _check_and_stop_wrong_version_replicas(self) -> bool: """Stops replicas with outdated versions to implement rolling updates. 
@@ -1337,7 +1449,7 @@ def _check_and_stop_wrong_version_replicas(self) -> bool: rollout_size = max(int(0.2 * self._target_state.num_replicas), 1) max_to_stop = max(rollout_size - pending_replicas, 0) - return self._stop_wrong_version_replicas(max_to_stop) + return self._stop_or_update_outdated_version_replicas(max_to_stop) def _scale_deployment_replicas(self) -> bool: """Scale the given deployment to the number of replicas.""" @@ -1346,7 +1458,7 @@ def _scale_deployment_replicas(self) -> bool: self._target_state.num_replicas >= 0 ), "Number of replicas must be greater than or equal to 0." - replicas_stopped = self._check_and_stop_wrong_version_replicas() + replicas_changed = self._check_and_stop_wrong_version_replicas() current_replicas = self._replicas.count( states=[ReplicaState.STARTING, ReplicaState.UPDATING, ReplicaState.RUNNING] @@ -1357,7 +1469,7 @@ def _scale_deployment_replicas(self) -> bool: self._target_state.num_replicas - current_replicas - recovering_replicas ) if delta_replicas == 0: - return False + return replicas_changed elif delta_replicas > 0: # Don't ever exceed self._target_state.num_replicas. 
@@ -1380,7 +1492,7 @@ def _scale_deployment_replicas(self) -> bool: time.time() - self._last_retry < self._backoff_time_s + random.uniform(0, 3) ): - return replicas_stopped + return replicas_changed self._last_retry = time.time() logger.info( @@ -1407,7 +1519,7 @@ def _scale_deployment_replicas(self) -> bool: ) elif delta_replicas < 0: - replicas_stopped = True + replicas_changed = True to_remove = -delta_replicas logger.info( f"Removing {to_remove} replica{'s' if to_remove > 1 else ''} " @@ -1429,12 +1541,11 @@ def _scale_deployment_replicas(self) -> bool: f"Adding STOPPING to replica_tag: {replica}, " f"deployment_name: {self._name}" ) - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) - return replicas_stopped + return replicas_changed - def _check_curr_status(self) -> bool: + def _check_curr_status(self) -> Tuple[bool, bool]: """Check the current deployment status. Checks the difference between the target vs. running replica count for @@ -1443,8 +1554,7 @@ def _check_curr_status(self) -> bool: This will update the current deployment status depending on the state of the replicas. - Returns: - was_deleted + Returns (deleted, any_replicas_recovering). """ # TODO(edoakes): we could make this more efficient in steady-state by # having a "healthy" flag that gets flipped if an update or replica @@ -1453,6 +1563,9 @@ def _check_curr_status(self) -> bool: target_version = self._target_state.version target_replica_count = self._target_state.num_replicas + any_replicas_recovering = ( + self._replicas.count(states=[ReplicaState.RECOVERING]) > 0 + ) all_running_replica_cnt = self._replicas.count(states=[ReplicaState.RUNNING]) running_at_target_version_replica_cnt = self._replicas.count( states=[ReplicaState.RUNNING], version=target_version @@ -1484,11 +1597,13 @@ def _check_curr_status(self) -> bool: message=( f"The Deployment failed to start {failed_to_start_count} times " "in a row. 
This may be due to a problem with the deployment " - "constructor or the initial health check failing. See logs for " - f"details. Retrying after {self._backoff_time_s} seconds." + "constructor or the initial health check failing. See " + "controller logs for details. Retrying after " + f"{self._backoff_time_s} seconds. Error:\n" + f"{self._replica_constructor_error_msg}" ), ) - return False + return False, any_replicas_recovering # If we have pending ops, the current goal is *not* ready. if ( @@ -1504,16 +1619,16 @@ def _check_curr_status(self) -> bool: ): # Check for deleting. if self._target_state.deleting and all_running_replica_cnt == 0: - return True + return True, any_replicas_recovering # Check for a non-zero number of deployments. if target_replica_count == running_at_target_version_replica_cnt: self._curr_status_info = DeploymentStatusInfo( self._name, DeploymentStatus.HEALTHY ) - return False + return False, any_replicas_recovering - return False + return False, any_replicas_recovering def _check_startup_replicas( self, original_state: ReplicaState, stop_on_slow=False @@ -1530,7 +1645,7 @@ def _check_startup_replicas( transitioned_to_running = False replicas_failed = False for replica in self._replicas.pop(states=[original_state]): - start_status = replica.check_started() + start_status, error_msg = replica.check_started() if start_status == ReplicaStartupStatus.SUCCEEDED: # This replica should be now be added to handle's replica # set. 
@@ -1545,10 +1660,10 @@ def _check_startup_replicas( if self._replica_constructor_retry_counter >= 0: # Increase startup failure counter if we're tracking it self._replica_constructor_retry_counter += 1 + self._replica_constructor_error_msg = error_msg replicas_failed = True - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) elif start_status in [ ReplicaStartupStatus.PENDING_ALLOCATION, ReplicaStartupStatus.PENDING_INITIALIZATION, @@ -1561,8 +1676,7 @@ def _check_startup_replicas( # Does it make sense to stop replicas in PENDING_ALLOCATION # state? if is_slow and stop_on_slow: - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica, graceful_stop=False) else: self._replicas.add(original_state, replica) @@ -1583,6 +1697,23 @@ def _check_startup_replicas( return slow_replicas, transitioned_to_running + def _stop_replica(self, replica, graceful_stop=True): + """Stop replica + 1. Stop the replica. + 2. Change the replica into stopping state. + 3. Set the health replica stats to 0. + """ + replica.stop(graceful=graceful_stop) + self._replicas.add(ReplicaState.STOPPING, replica) + self.health_check_gauge.set( + 0, + tags={ + "deployment": self._name, + "replica": replica.replica_tag, + "application": self.app_name, + }, + ) + def _check_and_update_replicas(self) -> bool: """ Check current state of all DeploymentReplica being tracked, and compare @@ -1597,7 +1728,12 @@ def _check_and_update_replicas(self) -> bool: if replica.check_health(): self._replicas.add(ReplicaState.RUNNING, replica) self.health_check_gauge.set( - 1, tags={"deployment": self._name, "replica": replica.replica_tag} + 1, + tags={ + "deployment": self._name, + "replica": replica.replica_tag, + "application": self.app_name, + }, ) else: running_replicas_changed = True @@ -1606,10 +1742,14 @@ def _check_and_update_replicas(self) -> bool: f"{self._name} failed health check, stopping it." 
) self.health_check_gauge.set( - 0, tags={"deployment": self._name, "replica": replica.replica_tag} + 0, + tags={ + "deployment": self._name, + "replica": replica.replica_tag, + "application": self.app_name, + }, ) - replica.stop(graceful=False) - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica, graceful_stop=False) # If this is a replica of the target version, the deployment # enters the "UNHEALTHY" status until the replica is # recovered or a new deploy happens. @@ -1707,7 +1847,7 @@ def _check_and_update_replicas(self) -> bool: return running_replicas_changed - def update(self) -> bool: + def update(self) -> Tuple[bool, bool]: """Attempts to reconcile this deployment to match its goal state. This is an asynchronous call; it's expected to be called repeatedly. @@ -1715,8 +1855,9 @@ def update(self) -> bool: Also updates the internal DeploymentStatusInfo based on the current state of the system. - Returns true if this deployment was successfully deleted. + Returns (deleted, any_replicas_recovering). """ + deleted, any_replicas_recovering = False, False try: # Add or remove DeploymentReplica instances in self._replicas. # This should be the only place we adjust total number of replicas @@ -1727,19 +1868,39 @@ def update(self) -> bool: # Check the state of existing replicas and transition if necessary. running_replicas_changed |= self._check_and_update_replicas() + # Check if the model_id has changed. 
+ running_replicas_changed |= self._multiplexed_model_ids_updated + if running_replicas_changed: self._notify_running_replicas_changed() + self._multiplexed_model_ids_updated = False - deleted = self._check_curr_status() + deleted, any_replicas_recovering = self._check_curr_status() except Exception: self._curr_status_info = DeploymentStatusInfo( name=self._name, status=DeploymentStatus.UNHEALTHY, message="Failed to update deployment:" f"\n{traceback.format_exc()}", ) - deleted = False - return deleted + return deleted, any_replicas_recovering + + def record_multiplexed_model_ids( + self, replica_name: str, multiplexed_model_ids: List[str] + ) -> None: + """Records the multiplexed model IDs of a replica. + + Args: + replica_name: Name of the replica. + multiplexed_model_ids: List of model IDs that replica is serving. + """ + # Find the replica + for replica in self._replicas.get(): + if replica.replica_tag == replica_name: + replica.record_multiplexed_model_ids(multiplexed_model_ids) + self._multiplexed_model_ids_updated = True + break + logger.warn(f"Replia {replica_name} not found in deployment {self._name}") def _stop_one_running_replica_for_testing(self): running_replicas = self._replicas.pop(states=[ReplicaState.RUNNING]) @@ -1819,8 +1980,7 @@ def _stop_all_replicas(self) -> bool: ReplicaState.RECOVERING, ] ): - replica.stop() - self._replicas.add(ReplicaState.STOPPING, replica) + self._stop_replica(replica) replica_changed = True return replica_changed @@ -1837,7 +1997,8 @@ def _calculate_max_replicas_to_stop(self) -> int: pending_replicas = nums_nodes - new_running_replicas - old_running_replicas return max(rollout_size - pending_replicas, 0) - def update(self) -> bool: + def update(self) -> Tuple[bool, bool]: + """Returns (deleted, any_replicas_recovering).""" try: if self._target_state.deleting: self._stop_all_replicas() @@ -1854,7 +2015,7 @@ def update(self) -> bool: new_config.version = self._target_state.version.code_version 
self._set_target_state(new_config) max_to_stop = self._calculate_max_replicas_to_stop() - self._stop_wrong_version_replicas(max_to_stop) + self._stop_or_update_outdated_version_replicas(max_to_stop) self._deploy_driver() self._check_and_update_replicas() return self._check_curr_status() @@ -1864,7 +2025,7 @@ def update(self) -> bool: status=DeploymentStatus.UNHEALTHY, message="Failed to update deployment:" f"\n{traceback.format_exc()}", ) - return False + return False, False def should_autoscale(self) -> bool: return False @@ -2139,6 +2300,15 @@ def deploy(self, deployment_name: str, deployment_info: DeploymentInfo) -> bool: return self._deployment_states[deployment_name].deploy(deployment_info) + def get_deployments_in_application(self, app_name: str) -> List[str]: + """Return list of deployment names in application.""" + states = [] + for name, deployment_state in self._deployment_states.items(): + if deployment_state.target_info.app_name == app_name: + states.append(name) + + return states + def delete_deployment(self, deployment_name: str): # This method must be idempotent. We should validate that the # specified deployment exists on the client. @@ -2194,9 +2364,13 @@ def get_handle_queueing_metrics( current_handle_queued_queries = 0 return current_handle_queued_queries - def update(self): - """Updates the state of all deployments to match their goal state.""" + def update(self) -> bool: + """Updates the state of all deployments to match their goal state. + + Returns True if any of the deployments have replicas in the RECOVERING state. 
+ """ deleted_tags = [] + any_recovering = False for deployment_name, deployment_state in self._deployment_states.items(): if deployment_state.should_autoscale(): current_num_ongoing_requests = self.get_replica_ongoing_request_metrics( @@ -2210,7 +2384,7 @@ def update(self): deployment_state.autoscale( current_num_ongoing_requests, current_handle_queued_queries ) - deleted = deployment_state.update() + deleted, recovering = deployment_state.update() if deleted: deleted_tags.append(deployment_name) deployment_info = deployment_state.target_info @@ -2219,12 +2393,16 @@ def update(self): self._deleted_deployment_metadata.popitem(last=False) self._deleted_deployment_metadata[deployment_name] = deployment_info + any_recovering |= recovering + for tag in deleted_tags: del self._deployment_states[tag] if len(deleted_tags): self._record_deployment_usage() + return any_recovering + def _record_deployment_usage(self): record_extra_usage_tag( TagKey.SERVE_NUM_DEPLOYMENTS, str(len(self._deployment_states)) @@ -2248,3 +2426,20 @@ def _record_deployment_usage(self): record_extra_usage_tag( TagKey.SERVE_NUM_GPU_DEPLOYMENTS, str(num_gpu_deployments) ) + + def record_multiplexed_replica_info(self, info: MultiplexedReplicaInfo): + """ + Record multiplexed model ids for a multiplexed replica. + + Args: + info: Multiplexed replica info including deployment name, + replica tag and model ids. + """ + if info.deployment_name not in self._deployment_states: + logger.error( + f"Deployment {info.deployment_name} not found in state manager." 
+ ) + return + self._deployment_states[info.deployment_name].record_multiplexed_model_ids( + info.replica_tag, info.model_ids + ) diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py index a4283006bd98..e2f16d1a458b 100644 --- a/python/ray/serve/_private/http_proxy.py +++ b/python/ray/serve/_private/http_proxy.py @@ -1,5 +1,6 @@ import asyncio from asyncio.tasks import FIRST_COMPLETED +import json import os import logging import pickle @@ -25,14 +26,18 @@ Response, set_socket_reuse_port, ) -from ray.serve._private.common import EndpointInfo, EndpointTag +from ray.serve._private.common import EndpointInfo, EndpointTag, ApplicationName from ray.serve._private.constants import ( SERVE_LOGGER_NAME, SERVE_NAMESPACE, DEFAULT_LATENCY_BUCKET_MS, ) from ray.serve._private.long_poll import LongPollClient, LongPollNamespace -from ray.serve._private.logging_utils import access_log_msg, configure_component_logger +from ray.serve._private.logging_utils import ( + access_log_msg, + configure_component_logger, + get_component_logger_file_path, +) from ray.serve._private.utils import get_random_letters @@ -174,7 +179,7 @@ def __init__(self, get_handle: Callable): # Routes sorted in order of decreasing length. self.sorted_routes: List[str] = list() # Endpoints associated with the routes. - self.route_info: Dict[str, EndpointTag] = dict() + self.route_info: Dict[str, Tuple[EndpointTag, ApplicationName]] = dict() # Contains a ServeHandle for each endpoint. 
self.handles: Dict[str, RayServeHandle] = dict() @@ -191,7 +196,7 @@ def update_routes(self, endpoints: Dict[EndpointTag, EndpointInfo]) -> None: route_info = {} for endpoint, info in endpoints.items(): routes.append(info.route) - route_info[info.route] = endpoint + route_info[info.route] = (endpoint, info.app_name) if endpoint in self.handles: existing_handles.remove(endpoint) else: @@ -241,10 +246,10 @@ def match_route( matched = True if matched: - endpoint = self.route_info[route] - return route, self.handles[endpoint] + endpoint, app_name = self.route_info[route] + return route, self.handles[endpoint], app_name - return None, None + return None, None, None class HTTPProxy: @@ -259,7 +264,7 @@ def __init__(self, controller_name: str): # Set the controller name so that serve will connect to the # controller instance this proxy is running in. ray.serve.context._set_internal_replica_context( - None, None, controller_name, None + None, None, controller_name, None, None ) # Used only for displaying the route table. @@ -284,10 +289,7 @@ def get_handle(name): self.request_counter = metrics.Counter( "serve_num_http_requests", description="The number of HTTP requests processed.", - tag_keys=( - "route", - "method", - ), + tag_keys=("route", "method", "application", "status_code"), ) self.request_error_counter = metrics.Counter( @@ -310,6 +312,7 @@ def get_handle(name): "error_code", "method", "route", + "application", ), ) self.processing_latency_tracker = metrics.Histogram( @@ -319,7 +322,11 @@ def get_handle(name): "(measured from the Serve HTTP proxy)." 
), boundaries=DEFAULT_LATENCY_BUCKET_MS, - tag_keys=("route",), + tag_keys=( + "route", + "application", + "status_code", + ), ) def _update_routes(self, endpoints: Dict[EndpointTag, EndpointInfo]) -> None: @@ -364,21 +371,33 @@ async def __call__(self, scope, receive, send): root_path = scope["root_path"] route_path = scope["path"][len(root_path) :] - self.request_counter.inc( - tags={"route": route_path, "method": scope["method"].upper()} - ) - if route_path == "/-/routes": + self.request_counter.inc( + tags={ + "route": route_path, + "method": scope["method"].upper(), + "application": "", + "status_code": "200", + } + ) return await starlette.responses.JSONResponse(self.route_info)( scope, receive, send ) if route_path == "/-/healthz": + self.request_counter.inc( + tags={ + "route": route_path, + "method": scope["method"].upper(), + "application": "", + "status_code": "200", + } + ) return await starlette.responses.PlainTextResponse("success")( scope, receive, send ) - route_prefix, handle = self.prefix_router.match_route(route_path) + route_prefix, handle, app_name = self.prefix_router.match_route(route_path) if route_prefix is None: self.request_error_counter.inc( tags={ @@ -387,6 +406,14 @@ async def __call__(self, scope, receive, send): "method": scope["method"].upper(), } ) + self.request_counter.inc( + tags={ + "route": route_path, + "method": scope["method"].upper(), + "application": "", + "status_code": "404", + } + ) return await self._not_found(scope, receive, send) # Modify the path and root path so that reverse lookups and redirection @@ -399,11 +426,30 @@ async def __call__(self, scope, receive, send): start_time = time.time() ray.serve.context._serve_request_context.set( - ray.serve.context.RequestContext(route_path, get_random_letters(10)) + ray.serve.context.RequestContext( + route_path, get_random_letters(10), app_name + ) ) status_code = await _send_request_to_handle(handle, scope, receive, send) + + self.request_counter.inc( + tags={ + 
"route": route_path, + "method": scope["method"].upper(), + "application": app_name, + "status_code": status_code, + } + ) + latency_ms = (time.time() - start_time) * 1000.0 - self.processing_latency_tracker.observe(latency_ms, tags={"route": route_path}) + self.processing_latency_tracker.observe( + latency_ms, + tags={ + "route": route_path, + "application": app_name, + "status_code": status_code, + }, + ) logger.info( access_log_msg( method=scope["method"], @@ -426,6 +472,7 @@ async def __call__(self, scope, receive, send): "error_code": status_code, "method": scope["method"].upper(), "route": route_path, + "application": app_name, } ) @@ -480,7 +527,17 @@ async def ready(self): return_when=asyncio.FIRST_COMPLETED, ) - # Return None, or re-throw the exception from self.running_task. + # Return metadata, or re-throw the exception from self.running_task. + if self.setup_complete.is_set(): + # NOTE(zcin): We need to convert the metadata to a json string because + # of cross-language scenarios. Java can't deserialize a Python tuple. + return json.dumps( + [ + ray._private.worker.global_worker.worker_id.hex(), + get_component_logger_file_path(), + ] + ) + return await done_set.pop() async def block_until_endpoint_exists( @@ -520,3 +577,9 @@ async def run(self): self.setup_complete.set() await server.serve(sockets=[sock]) + + async def check_health(self): + """No-op method to check on the health of the HTTP Proxy. + Make sure the async event loop is not blocked. 
+ """ + pass diff --git a/python/ray/serve/_private/http_state.py b/python/ray/serve/_private/http_state.py index a31b0c95dd2a..3b52a931b017 100644 --- a/python/ray/serve/_private/http_state.py +++ b/python/ray/serve/_private/http_state.py @@ -1,6 +1,9 @@ import asyncio +import json import logging import random +import time +import traceback from typing import Dict, List, Tuple import ray @@ -14,17 +17,118 @@ SERVE_LOGGER_NAME, SERVE_PROXY_NAME, SERVE_NAMESPACE, + PROXY_HEALTH_CHECK_PERIOD_S, ) from ray.serve._private.http_proxy import HTTPProxyActor from ray.serve._private.utils import ( format_actor_name, get_all_node_ids, ) -from ray.serve._private.common import EndpointTag, NodeId +from ray.serve._private.common import EndpointTag, NodeId, HTTPProxyStatus +from ray.serve.schema import HTTPProxyDetails logger = logging.getLogger(SERVE_LOGGER_NAME) +class HTTPProxyState: + def __init__( + self, actor_handle: ActorHandle, actor_name: str, node_id: str, node_ip: str + ): + self._actor_handle = actor_handle + self._actor_name = actor_name + self._node_id = node_id + self._ready_obj_ref = self._actor_handle.ready.remote() + self._status = HTTPProxyStatus.STARTING + self._health_check_obj_ref = None + self._last_health_check_time: float = 0 + + self._actor_details = HTTPProxyDetails( + node_id=node_id, + node_ip=node_ip, + actor_id=self._actor_handle._actor_id.hex(), + actor_name=self._actor_name, + status=self._status, + ) + + @property + def actor_handle(self) -> ActorHandle: + return self._actor_handle + + @property + def actor_name(self) -> str: + return self._actor_name + + @property + def status(self) -> HTTPProxyStatus: + return self._status + + @property + def actor_details(self) -> HTTPProxyDetails: + return self._actor_details + + def set_status(self, status: HTTPProxyStatus) -> None: + """Sets _status and updates _actor_details with the new status.""" + self._status = status + self.update_actor_details(status=self._status) + + def update_actor_details(self, 
**kwargs) -> None: + """Updates _actor_details with passed in kwargs.""" + details_kwargs = self._actor_details.dict() + details_kwargs.update(kwargs) + self._actor_details = HTTPProxyDetails(**details_kwargs) + + def update(self): + if self._status == HTTPProxyStatus.STARTING: + finished, _ = ray.wait([self._ready_obj_ref], timeout=0) + if finished: + self._ready_obj_ref = None + try: + worker_id, log_file_path = json.loads(ray.get(finished[0])) + self.set_status(HTTPProxyStatus.HEALTHY) + self.update_actor_details( + worker_id=worker_id, + log_file_path=log_file_path, + status=self._status, + ) + except Exception: + self.set_status(HTTPProxyStatus.UNHEALTHY) + logger.warning( + "Unexpected error occured when checking readiness of HTTP " + f"Proxy on node {self._node_id}:\n{traceback.format_exc()}" + ) + return + + # Perform periodic health checks + if self._health_check_obj_ref: + finished, _ = ray.wait([self._health_check_obj_ref], timeout=0) + if finished: + try: + ray.get(finished[0]) + self.set_status(HTTPProxyStatus.HEALTHY) + except Exception as e: + logger.warning( + f"Health check for HTTP proxy {self._actor_name} failed: {e}" + ) + self.set_status(HTTPProxyStatus.UNHEALTHY) + + self._health_check_obj_ref = None + + # If there's no active in-progress health check and it has been more than 10 + # seconds since the last health check, perform another health check + randomized_period_s = PROXY_HEALTH_CHECK_PERIOD_S * random.uniform(0.9, 1.1) + if time.time() - self._last_health_check_time > randomized_period_s: + # If the HTTP Proxy is still blocked, mark unhealthy + if self._health_check_obj_ref: + self.set_status(HTTPProxyStatus.UNHEALTHY) + logger.warning( + f"Health check for HTTP Proxy {self._actor_name} took more than " + f"{PROXY_HEALTH_CHECK_PERIOD_S} seconds." 
+ ) + + self._health_check_obj_ref = self._actor_handle.check_health.remote() + self._last_health_check_time = time.time() + + class HTTPState: """Manages all state for HTTP proxies in the system. @@ -48,8 +152,7 @@ def __init__( self._config = config else: self._config = HTTPOptions() - self._proxy_actors: Dict[NodeId, ActorHandle] = dict() - self._proxy_actor_names: Dict[NodeId, str] = dict() + self._proxy_states: Dict[NodeId, HTTPProxyState] = dict() self._head_node_id: str = head_node_id self._gcs_client = gcs_client @@ -68,14 +171,26 @@ def get_config(self): return self._config def get_http_proxy_handles(self) -> Dict[NodeId, ActorHandle]: - return self._proxy_actors + return { + node_id: state.actor_handle for node_id, state in self._proxy_states.items() + } def get_http_proxy_names(self) -> Dict[NodeId, str]: - return self._proxy_actor_names + return { + node_id: state.actor_name for node_id, state in self._proxy_states.items() + } + + def get_http_proxy_details(self) -> Dict[NodeId, HTTPProxyDetails]: + return { + node_id: state.actor_details + for node_id, state in self._proxy_states.items() + } def update(self): self._start_proxies_if_needed() self._stop_proxies_if_needed() + for proxy_state in self._proxy_states.values(): + proxy_state.update() def _get_target_nodes(self) -> List[Tuple[str, str]]: """Return the list of (node_id, ip_address) to deploy HTTP servers on.""" @@ -119,7 +234,7 @@ def _start_proxies_if_needed(self) -> None: """Start a proxy on every node if it doesn't already exist.""" for node_id, node_ip_address in self._get_target_nodes(): - if node_id in self._proxy_actors: + if node_id in self._proxy_states: continue name = format_actor_name(SERVE_PROXY_NAME, self._controller_name, node_id) @@ -153,22 +268,22 @@ def _start_proxies_if_needed(self) -> None: http_middlewares=self._config.middlewares, ) - self._proxy_actors[node_id] = proxy - self._proxy_actor_names[node_id] = name + self._proxy_states[node_id] = HTTPProxyState( + proxy, name, 
node_id, node_ip_address + ) def _stop_proxies_if_needed(self) -> bool: """Removes proxy actors from any nodes that no longer exist.""" all_node_ids = {node_id for node_id, _ in get_all_node_ids(self._gcs_client)} to_stop = [] - for node_id in self._proxy_actors: + for node_id in self._proxy_states: if node_id not in all_node_ids: logger.info("Removing HTTP proxy on removed node '{}'.".format(node_id)) to_stop.append(node_id) for node_id in to_stop: - proxy = self._proxy_actors.pop(node_id) - del self._proxy_actor_names[node_id] - ray.kill(proxy, no_restart=True) + proxy = self._proxy_states.pop(node_id) + ray.kill(proxy.actor_handle, no_restart=True) async def ensure_http_route_exists(self, endpoint: EndpointTag, timeout_s: float): """Block until the route has been propagated to all HTTP proxies. @@ -177,7 +292,9 @@ async def ensure_http_route_exists(self, endpoint: EndpointTag, timeout_s: float """ await asyncio.gather( *[ - proxy.block_until_endpoint_exists.remote(endpoint, timeout_s=timeout_s) - for proxy in self._proxy_actors.values() + proxy.actor_handle.block_until_endpoint_exists.remote( + endpoint, timeout_s=timeout_s + ) + for proxy in self._proxy_states.values() ] ) diff --git a/python/ray/serve/_private/logging_utils.py b/python/ray/serve/_private/logging_utils.py index 97e9e6222873..9a96fcbf47e1 100644 --- a/python/ray/serve/_private/logging_utils.py +++ b/python/ray/serve/_private/logging_utils.py @@ -1,18 +1,84 @@ import logging import os from typing import Optional +import json +import copy import ray -from ray.serve._private.constants import DEBUG_LOG_ENV_VAR, SERVE_LOGGER_NAME +from ray.serve._private.constants import ( + DEBUG_LOG_ENV_VAR, + SERVE_LOGGER_NAME, + RAY_SERVE_ENABLE_JSON_LOGGING, + SERVE_LOG_RECORD_FORMAT, + SERVE_LOG_REQUEST_ID, + SERVE_LOG_ROUTE, + SERVE_LOG_APPLICATION, + SERVE_LOG_MESSAGE, + SERVE_LOG_DEPLOYMENT, + SERVE_LOG_COMPONENT, + SERVE_LOG_COMPONENT_ID, + SERVE_LOG_TIME, + SERVE_LOG_LEVEL_NAME, + SERVE_LOG_REPLICA, +) 
+from ray.serve._private.common import ServeComponentType LOG_FILE_FMT = "{component_name}_{component_id}.log" -COMPONENT_LOG_FMT = ( - "%(levelname)s %(asctime)s {component_name} {component_id} " # noqa:E501 -) -MESSAGE_FMT = "%(filename)s:%(lineno)d - %(message)s" -REQUEST_ID_FMT = "%(request_id)s " -ROUTE_FMT = "%(route)s " + + +class ServeJSONFormatter(logging.Formatter): + """Serve Logging Json Formatter + + The formatter will generate the json log format on the fly + based on the field of record. + """ + + def __init__( + self, + component_name: str, + component_id: str, + component_type: Optional[ServeComponentType] = None, + ): + self.component_log_fmt = { + SERVE_LOG_LEVEL_NAME: SERVE_LOG_RECORD_FORMAT[SERVE_LOG_LEVEL_NAME], + SERVE_LOG_TIME: SERVE_LOG_RECORD_FORMAT[SERVE_LOG_TIME], + } + if component_type and component_type == ServeComponentType.DEPLOYMENT: + self.component_log_fmt[SERVE_LOG_DEPLOYMENT] = component_name + self.component_log_fmt[SERVE_LOG_REPLICA] = component_id + else: + self.component_log_fmt[SERVE_LOG_COMPONENT] = component_name + self.component_log_fmt[SERVE_LOG_COMPONENT_ID] = component_id + + def format(self, record: logging.LogRecord) -> str: + """Format the log record into json format. + + Args: + record: The log record to be formatted. + + Returns: + The formatted log record in json format. 
+ """ + record_format = copy.deepcopy(self.component_log_fmt) + if SERVE_LOG_REQUEST_ID in record.__dict__: + record_format[SERVE_LOG_REQUEST_ID] = SERVE_LOG_RECORD_FORMAT[ + SERVE_LOG_REQUEST_ID + ] + if SERVE_LOG_ROUTE in record.__dict__: + record_format[SERVE_LOG_ROUTE] = SERVE_LOG_RECORD_FORMAT[SERVE_LOG_ROUTE] + if SERVE_LOG_APPLICATION in record.__dict__: + record_format[SERVE_LOG_APPLICATION] = SERVE_LOG_RECORD_FORMAT[ + SERVE_LOG_APPLICATION + ] + + record_format[SERVE_LOG_MESSAGE] = SERVE_LOG_RECORD_FORMAT[SERVE_LOG_MESSAGE] + + # create a formatter using the format string + formatter = logging.Formatter(json.dumps(record_format)) + + # format the log record using the formatter + return formatter.format(record) class ServeFormatter(logging.Formatter): @@ -21,22 +87,39 @@ class ServeFormatter(logging.Formatter): The formatter will generate the log format on the fly based on the field of record. """ - def __init__(self, component_name: str, component_id: str): - self.component_log_fmt = COMPONENT_LOG_FMT.format( + COMPONENT_LOG_FMT = f"%({SERVE_LOG_LEVEL_NAME})s %({SERVE_LOG_TIME})s {{{SERVE_LOG_COMPONENT}}} {{{SERVE_LOG_COMPONENT_ID}}} " # noqa:E501 + + def __init__( + self, + component_name: str, + component_id: str, + ): + self.component_log_fmt = ServeFormatter.COMPONENT_LOG_FMT.format( component_name=component_name, component_id=component_id ) - def format(self, record): - # generate a format string based on the record field. - cur_format = self.component_log_fmt - if "request_id" in record.__dict__: - cur_format += REQUEST_ID_FMT - if "route" in record.__dict__: - cur_format += ROUTE_FMT - cur_format += MESSAGE_FMT + def format(self, record: logging.LogRecord) -> str: + """Format the log record into the format string. + + Args: + record: The log record to be formatted. + + Returns: + The formatted log record in string format. 
+ """ + record_format = self.component_log_fmt + record_formats_attrs = [] + if SERVE_LOG_REQUEST_ID in record.__dict__: + record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_REQUEST_ID]) + if SERVE_LOG_ROUTE in record.__dict__: + record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_ROUTE]) + if SERVE_LOG_APPLICATION in record.__dict__: + record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_APPLICATION]) + record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_MESSAGE]) + record_format += " ".join(record_formats_attrs) # create a formatter using the format string - formatter = logging.Formatter(cur_format) + formatter = logging.Formatter(record_format) # format the log record using the formatter return formatter.format(record) @@ -55,11 +138,27 @@ def log_to_stderr_filter(record: logging.LogRecord) -> bool: return record.log_to_stderr +def get_component_logger_file_path() -> Optional[str]: + """Returns the relative file path for the Serve logger, if it exists. + + If a logger was configured through configure_component_logger() for the Serve + component that's calling this function, this returns the location of the log file + relative to the ray logs directory. 
+ """ + logger = logging.getLogger(SERVE_LOGGER_NAME) + for handler in logger.handlers: + if isinstance(handler, logging.handlers.RotatingFileHandler): + absolute_path = handler.baseFilename + ray_logs_dir = ray._private.worker._global_node.get_logs_dir_path() + if absolute_path.startswith(ray_logs_dir): + return absolute_path[len(ray_logs_dir) :] + + def configure_component_logger( *, component_name: str, component_id: str, - component_type: Optional[str] = None, + component_type: Optional[ServeComponentType] = None, log_level: int = logging.INFO, max_bytes: Optional[int] = None, backup_count: Optional[int] = None, @@ -83,9 +182,11 @@ def record_factory(*args, **kwargs): request_context = ray.serve.context._serve_request_context.get() record = factory(*args, **kwargs) if request_context.route: - record.route = request_context.route + setattr(record, SERVE_LOG_ROUTE, request_context.route) if request_context.request_id: - record.request_id = request_context.request_id + setattr(record, SERVE_LOG_REQUEST_ID, request_context.request_id) + if request_context.app_name: + setattr(record, SERVE_LOG_APPLICATION, request_context.app_name) return record logging.setLogRecordFactory(record_factory) @@ -103,17 +204,28 @@ def record_factory(*args, **kwargs): max_bytes = ray._private.worker._global_node.max_bytes if backup_count is None: backup_count = ray._private.worker._global_node.backup_count + + # For DEPLOYMENT component type, we want to log the deployment name + # instead of adding the component type to the component name. 
+ component_log_file_name = component_name if component_type is not None: - component_name = f"{component_type}_{component_name}" + component_log_file_name = f"{component_type}_{component_name}" + if component_type != ServeComponentType.DEPLOYMENT: + component_name = f"{component_type}_{component_name}" log_file_name = LOG_FILE_FMT.format( - component_name=component_name, component_id=component_id + component_name=component_log_file_name, component_id=component_id ) file_handler = logging.handlers.RotatingFileHandler( os.path.join(logs_dir, log_file_name), maxBytes=max_bytes, backupCount=backup_count, ) - file_handler.setFormatter(ServeFormatter(component_name, component_id)) + if RAY_SERVE_ENABLE_JSON_LOGGING: + file_handler.setFormatter( + ServeJSONFormatter(component_name, component_id, component_type) + ) + else: + file_handler.setFormatter(ServeFormatter(component_name, component_id)) logger.addHandler(file_handler) diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index 206c873f39e9..6717b0ee75ae 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -7,6 +7,7 @@ import pickle import time from typing import Any, Callable, Optional, Tuple, Dict +import traceback import starlette.responses @@ -14,11 +15,15 @@ from ray import cloudpickle from ray.actor import ActorClass, ActorHandle from ray.remote_function import RemoteFunction -from ray.util import metrics +from ray.serve import metrics from ray._private.async_compat import sync_to_async from ray.serve._private.autoscaling_metrics import start_metrics_pusher -from ray.serve._private.common import HEALTH_CHECK_CONCURRENCY_GROUP, ReplicaTag +from ray.serve._private.common import ( + HEALTH_CHECK_CONCURRENCY_GROUP, + ReplicaTag, + ServeComponentType, +) from ray.serve.config import DeploymentConfig from ray.serve._private.constants import ( HEALTH_CHECK_METHOD, @@ -30,7 +35,11 @@ from ray.serve.deployment import Deployment from 
ray.serve.exceptions import RayServeException from ray.serve._private.http_util import ASGIHTTPSender -from ray.serve._private.logging_utils import access_log_msg, configure_component_logger +from ray.serve._private.logging_utils import ( + access_log_msg, + configure_component_logger, + get_component_logger_file_path, +) from ray.serve._private.router import Query, RequestMetadata from ray.serve._private.utils import ( parse_import_path, @@ -68,9 +77,10 @@ async def __init__( version: DeploymentVersion, controller_name: str, detached: bool, + app_name: str = None, ): configure_component_logger( - component_type="deployment", + component_type=ServeComponentType.DEPLOYMENT, component_name=deployment_name, component_id=replica_tag, ) @@ -121,6 +131,7 @@ async def __init__( replica_tag, controller_name, servable_object=None, + app_name=app_name, ) assert controller_name, "Must provide a valid controller_name" @@ -155,17 +166,18 @@ async def initialize_replica(): replica_tag, controller_name, servable_object=_callable, + app_name=app_name, ) self.replica = RayServeReplica( _callable, deployment_name, replica_tag, - deployment_config, - deployment_config.user_config, + deployment_config.autoscaling_config, version, is_function, controller_handle, + app_name, ) self._init_finish_event.set() @@ -217,44 +229,52 @@ async def is_allocated(self) -> str: to PENDING_INITIALIZATION startup state. Returns: - The PID, actor ID, node ID, node IP of the replica. + The PID, actor ID, node ID, node IP, and log filepath id of the replica. 
""" + return ( os.getpid(), ray.get_runtime_context().get_actor_id(), + ray._private.worker.global_worker.worker_id.hex(), ray.get_runtime_context().get_node_id(), ray.util.get_node_ip_address(), + get_component_logger_file_path(), ) async def is_initialized( - self, user_config: Optional[Any] = None, _after: Optional[Any] = None + self, + deployment_config: DeploymentConfig = None, + _after: Optional[Any] = None, ): # Unused `_after` argument is for scheduling: passing an ObjectRef # allows delaying reconfiguration until after this call has returned. - await self._initialize_replica() - - metadata = await self.reconfigure(user_config) - - # A new replica should not be considered healthy until it passes an - # initial health check. If an initial health check fails, consider - # it an initialization failure. - await self.check_health() - return metadata + try: + await self._initialize_replica() + metadata = await self.reconfigure(deployment_config) + + # A new replica should not be considered healthy until it passes an + # initial health check. If an initial health check fails, consider + # it an initialization failure. 
+ await self.check_health() + return metadata + except Exception: + raise RuntimeError(traceback.format_exc()) from None async def reconfigure( - self, user_config: Optional[Any] = None + self, deployment_config: DeploymentConfig ) -> Tuple[DeploymentConfig, DeploymentVersion]: - if user_config is not None: - await self.replica.reconfigure(user_config) - - return await self.get_metadata() + try: + await self.replica.reconfigure(deployment_config) + return await self.get_metadata() + except Exception: + raise RuntimeError(traceback.format_exc()) from None async def get_metadata( self, ) -> Tuple[DeploymentConfig, DeploymentVersion]: # Wait for replica initialization to finish await self._init_finish_event.wait() - return self.replica.deployment_config, self.replica.version + return self.replica.version.deployment_config, self.replica.version async def prepare_for_shutdown(self): if self.replica is not None: @@ -281,20 +301,20 @@ def __init__( _callable: Callable, deployment_name: str, replica_tag: ReplicaTag, - deployment_config: DeploymentConfig, - user_config: Any, + autoscaling_config: Any, version: DeploymentVersion, is_function: bool, controller_handle: ActorHandle, + app_name: str, ) -> None: - self.deployment_config = deployment_config self.deployment_name = deployment_name self.replica_tag = replica_tag self.callable = _callable self.is_function = is_function - self.user_config = user_config self.version = version + self.deployment_config = None self.rwlock = aiorwlock.RWLock() + self.app_name = app_name user_health_check = getattr(_callable, HEALTH_CHECK_METHOD, None) if not callable(user_health_check): @@ -311,10 +331,7 @@ def user_health_check(): description=( "The number of queries that have been processed in this replica." 
), - tag_keys=("deployment", "replica", "route"), - ) - self.request_counter.set_default_tags( - {"deployment": self.deployment_name, "replica": self.replica_tag} + tag_keys=("route",), ) self.error_counter = metrics.Counter( @@ -322,10 +339,7 @@ def user_health_check(): description=( "The number of exceptions that have occurred in this replica." ), - tag_keys=("deployment", "replica", "route"), - ) - self.error_counter.set_default_tags( - {"deployment": self.deployment_name, "replica": self.replica_tag} + tag_keys=("route",), ) self.restart_counter = metrics.Counter( @@ -333,38 +347,25 @@ def user_health_check(): description=( "The number of times this replica has been restarted due to failure." ), - tag_keys=("deployment", "replica"), - ) - self.restart_counter.set_default_tags( - {"deployment": self.deployment_name, "replica": self.replica_tag} ) self.processing_latency_tracker = metrics.Histogram( "serve_deployment_processing_latency_ms", description="The latency for queries to be processed.", boundaries=DEFAULT_LATENCY_BUCKET_MS, - tag_keys=("deployment", "replica", "route"), - ) - self.processing_latency_tracker.set_default_tags( - {"deployment": self.deployment_name, "replica": self.replica_tag} + tag_keys=("route",), ) self.num_processing_items = metrics.Gauge( "serve_replica_processing_queries", description="The current number of queries being processed.", - tag_keys=("deployment", "replica"), - ) - self.num_processing_items.set_default_tags( - {"deployment": self.deployment_name, "replica": self.replica_tag} ) self.restart_counter.inc() - self._shutdown_wait_loop_s = deployment_config.graceful_shutdown_wait_loop_s - - if deployment_config.autoscaling_config: + if autoscaling_config: process_remote_func = controller_handle.record_autoscaling_metrics.remote - config = deployment_config.autoscaling_config + config = autoscaling_config start_metrics_pusher( interval_s=config.metrics_interval_s, collection_callback=self._collect_autoscaling_metrics, @@ -484,26 
+485,36 @@ async def invoke_single(self, request_item: Query) -> Tuple[Any, bool]: return result, success - async def reconfigure(self, user_config: Any): + async def reconfigure(self, deployment_config: DeploymentConfig): async with self.rwlock.writer_lock: - self.user_config = user_config - self.version = DeploymentVersion( - self.version.code_version, user_config=user_config + user_config_changed = False + if ( + self.deployment_config is None + or self.deployment_config.user_config != deployment_config.user_config + ): + user_config_changed = True + self.deployment_config = deployment_config + self.version = DeploymentVersion.from_deployment_version( + self.version, self.deployment_config ) - if self.is_function: - raise ValueError("deployment_def must be a class to use user_config") - elif not hasattr(self.callable, RECONFIGURE_METHOD): - raise RayServeException( - "user_config specified but deployment " - + self.deployment_name - + " missing " - + RECONFIGURE_METHOD - + " method" + + if self.deployment_config.user_config is not None and user_config_changed: + if self.is_function: + raise ValueError( + "deployment_def must be a class to use user_config" + ) + elif not hasattr(self.callable, RECONFIGURE_METHOD): + raise RayServeException( + "user_config specified but deployment " + + self.deployment_name + + " missing " + + RECONFIGURE_METHOD + + " method" + ) + reconfigure_method = sync_to_async( + getattr(self.callable, RECONFIGURE_METHOD) ) - reconfigure_method = sync_to_async( - getattr(self.callable, RECONFIGURE_METHOD) - ) - await reconfigure_method(user_config) + await reconfigure_method(self.deployment_config.user_config) async def handle_request(self, request: Query) -> asyncio.Future: async with self.rwlock.reader_lock: @@ -514,7 +525,7 @@ async def handle_request(self, request: Query) -> asyncio.Future: # handle can pass the correct request context to subsequent replicas. 
ray.serve.context._serve_request_context.set( ray.serve.context.RequestContext( - request.metadata.route, request.metadata.request_id + request.metadata.route, request.metadata.request_id, self.app_name ) ) @@ -546,7 +557,7 @@ async def prepare_for_shutdown(self): while True: # Sleep first because we want to make sure all the routers receive # the notification to remove this replica first. - await asyncio.sleep(self._shutdown_wait_loop_s) + await asyncio.sleep(self.deployment_config.graceful_shutdown_wait_loop_s) method_stat = self._get_handle_request_stats() # The handle_request method wasn't even invoked. if method_stat is None: @@ -557,8 +568,9 @@ async def prepare_for_shutdown(self): else: logger.info( "Waiting for an additional " - f"{self._shutdown_wait_loop_s}s to shut down because " - f"there are {self.num_ongoing_requests} ongoing requests." + f"{self.deployment_config.graceful_shutdown_wait_loop_s}s to shut " + f"down because there are {self.num_ongoing_requests} ongoing " + "requests." ) # Explicitly call the del method to trigger clean up. diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 5974a94ec457..49c2622c37b4 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -40,6 +40,9 @@ class RequestMetadata: # HTTP route path of the request. route: str = "" + # Application Name + app_name: str = "" + @dataclass class Query: @@ -98,7 +101,7 @@ def __init__( "The current number of queries to this deployment waiting" " to be assigned to a replica." 
), - tag_keys=("deployment", "route"), + tag_keys=("deployment", "route", "application"), ) self.num_queued_queries_gauge.set_default_tags( {"deployment": self.deployment_name} @@ -229,7 +232,11 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: """ self.num_queued_queries += 1 self.num_queued_queries_gauge.set( - self.num_queued_queries, tags={"route": query.metadata.route} + self.num_queued_queries, + tags={ + "route": query.metadata.route, + "application": query.metadata.app_name, + }, ) await query.resolve_async_tasks() assigned_ref = self._try_assign_replica(query) @@ -255,7 +262,11 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: assigned_ref = self._try_assign_replica(query) self.num_queued_queries -= 1 self.num_queued_queries_gauge.set( - self.num_queued_queries, tags={"route": query.metadata.route} + self.num_queued_queries, + tags={ + "route": query.metadata.route, + "application": query.metadata.app_name, + }, ) return assigned_ref @@ -279,7 +290,7 @@ def __init__( self.num_router_requests = metrics.Counter( "serve_num_router_requests", description="The number of requests processed by the router.", - tag_keys=("deployment", "route"), + tag_keys=("deployment", "route", "application"), ) self.num_router_requests.set_default_tags({"deployment": deployment_name}) @@ -305,7 +316,9 @@ async def assign_request( ): """Assign a query and returns an object ref represent the result""" - self.num_router_requests.inc(tags={"route": request_meta.route}) + self.num_router_requests.inc( + tags={"route": request_meta.route, "application": request_meta.app_name} + ) return await self._replica_set.assign_replica( Query( args=list(request_args), diff --git a/python/ray/serve/_private/utils.py b/python/ray/serve/_private/utils.py index 038d3f3efd30..c6d4a6797092 100644 --- a/python/ray/serve/_private/utils.py +++ b/python/ray/serve/_private/utils.py @@ -9,7 +9,17 @@ import traceback from enum import Enum from functools import wraps -from 
typing import Dict, Iterable, List, Tuple, TypeVar, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Tuple, + TypeVar, + Union, + Optional, +) import fastapi.encoders import numpy as np @@ -44,6 +54,19 @@ class DEFAULT(Enum): VALUE = 1 +class DeploymentOptionUpdateType(str, Enum): + # Nothing needs to be done other than setting the target state. + LightWeight = "LightWeight" + # Each DeploymentReplica instance (tracked in DeploymentState) uses certain options + # from the deployment config. These values need to be updated in DeploymentReplica. + NeedsReconfigure = "NeedsReconfigure" + # Options that are sent to the replica actor. If changed, reconfigure() on the actor + # needs to be called to update these values. + NeedsActorReconfigure = "NeedsActorReconfigure" + # If changed, restart all replicas. + HeavyWeight = "HeavyWeight" + + # Type alias: objects that can be DEFAULT.VALUE have type Default[T] T = TypeVar("T") Default = Union[DEFAULT, T] @@ -161,7 +184,7 @@ def get_all_node_ids(gcs_client) -> List[Tuple[str, str]]: """ nodes = gcs_client.get_all_node_info(timeout=RAY_GCS_RPC_TIMEOUT_S) node_ids = [ - (ray.NodeID.from_binary(node_id).hex(), node["node_name"]) + (ray.NodeID.from_binary(node_id).hex(), node["node_name"].decode("utf-8")) for (node_id, node) in nodes.items() if node["state"] == ray.core.generated.gcs_pb2.GcsNodeInfo.ALIVE ] @@ -522,3 +545,27 @@ def record_serve_tag(key: str, value: str): ) record_extra_usage_tag(serve_telemetry_tag_map[key], value) + + +def extract_self_if_method_call(args: List[Any], func: Callable) -> Optional[object]: + """Check if this is a method rather than a function. + + Does this by checking to see if `func` is the attribute of the first + (`self`) argument under `func.__name__`. Unfortunately, this is the most + robust solution to this I was able to find. It would also be preferable + to do this check when the decorator runs, rather than when the method is. 
+ + Returns the `self` object if it's a method call, else None. + + Arguments: + args: arguments to the function/method call. + func: the unbound function that was called. + """ + if len(args) > 0: + method = getattr(args[0], func.__name__, False) + if method: + wrapped = getattr(method, "__wrapped__", False) + if wrapped and wrapped == func: + return args[0] + + return None diff --git a/python/ray/serve/_private/version.py b/python/ray/serve/_private/version.py index 1e43baeed6f9..f6f51b532e8f 100644 --- a/python/ray/serve/_private/version.py +++ b/python/ray/serve/_private/version.py @@ -1,14 +1,25 @@ from abc import ABC +from copy import deepcopy import json -from typing import Any, Optional +from typing import Any, Optional, Dict, List from zlib import crc32 -from ray.serve._private.utils import get_random_letters +from ray.serve._private.utils import get_random_letters, DeploymentOptionUpdateType from ray.serve.generated.serve_pb2 import DeploymentVersion as DeploymentVersionProto +from ray.serve.config import DeploymentConfig + +import logging + +logger = logging.getLogger("ray.serve") class DeploymentVersion: - def __init__(self, code_version: Optional[str], user_config: Optional[Any] = None): + def __init__( + self, + code_version: Optional[str], + deployment_config: DeploymentConfig, + ray_actor_options: Optional[Dict], + ): if code_version is not None and not isinstance(code_version, str): raise TypeError(f"code_version must be str, got {type(code_version)}.") if code_version is None: @@ -18,14 +29,18 @@ def __init__(self, code_version: Optional[str], user_config: Optional[Any] = Non self.unversioned = False self.code_version = code_version - self.user_config = user_config - # TODO(simon): make this xlang compatible - if isinstance(user_config, bytes): - serialized_user_config = user_config - else: - serialized_user_config = str.encode(json.dumps(user_config, sort_keys=True)) - self.user_config_hash = crc32(serialized_user_config) - self._hash = 
crc32(serialized_user_config + self.code_version.encode("utf-8")) + # Options for this field may be mutated over time, so any logic that uses this + # should access this field directly + self.deployment_config: DeploymentConfig = deployment_config + self.ray_actor_options: Dict = ray_actor_options + self.compute_hashes() + + @classmethod + def from_deployment_version(cls, deployment_version, deployment_config): + version_copy = deepcopy(deployment_version) + version_copy.deployment_config = deployment_config + version_copy.compute_hashes() + return version_copy def __hash__(self) -> int: return self._hash @@ -35,12 +50,105 @@ def __eq__(self, other: Any) -> bool: return False return self._hash == other._hash + def requires_actor_restart(self, new_version): + """Determines whether the new version requires actors of the current version to + be restarted. + """ + return ( + self.code_version != new_version.code_version + or self.ray_actor_options_hash != new_version.ray_actor_options_hash + ) + + def requires_actor_reconfigure(self, new_version): + """Determines whether the new version requires calling reconfigure() on the + replica actor. + """ + return self.reconfigure_actor_hash != new_version.reconfigure_actor_hash + + def requires_long_poll_broadcast(self, new_version): + """Determines whether lightweightly updating an existing replica to the new + version requires broadcasting through long poll that the running replicas has + changed. + """ + return ( + self.deployment_config.max_concurrent_queries + != new_version.deployment_config.max_concurrent_queries + ) + + def compute_hashes(self): + # If this changes, the controller will directly restart all existing replicas. 
+ serialized_ray_actor_options = _serialize(self.ray_actor_options or {}) + self.ray_actor_options_hash = crc32(serialized_ray_actor_options) + + # If this changes, DeploymentReplica.reconfigure() will call reconfigure on the + # actual replica actor + self.reconfigure_actor_hash = crc32( + self._get_serialized_options( + [DeploymentOptionUpdateType.NeedsActorReconfigure] + ) + ) + + # Used by __eq__ in deployment state to either reconfigure the replicas or + # stop and restart them + self._hash = crc32( + self.code_version.encode("utf-8") + + serialized_ray_actor_options + + self._get_serialized_options( + [ + DeploymentOptionUpdateType.NeedsReconfigure, + DeploymentOptionUpdateType.NeedsActorReconfigure, + ] + ) + ) + def to_proto(self) -> bytes: # TODO(simon): enable cross language user config - return DeploymentVersionProto(code_version=self.code_version, user_config=b"") + return DeploymentVersionProto( + code_version=self.code_version, + deployment_config=self.deployment_config.to_proto(), + ray_actor_options=json.dumps(self.ray_actor_options), + ) + + @classmethod + def from_proto(cls, proto: DeploymentVersionProto): + return DeploymentVersion( + proto.code_version, + DeploymentConfig.from_proto(proto.deployment_config), + json.loads(proto.ray_actor_options), + ) + + def _get_serialized_options( + self, update_types: List[DeploymentOptionUpdateType] + ) -> bytes: + """Returns a serialized dictionary containing fields of a deployment config that + should prompt a deployment version update. 
+ """ + reconfigure_dict = {} + for option_name, field in self.deployment_config.__fields__.items(): + option_weight = field.field_info.extra.get("update_type") + if option_weight in update_types: + reconfigure_dict[option_name] = getattr( + self.deployment_config, option_name + ) + + if ( + isinstance(self.deployment_config.user_config, bytes) + and "user_config" in reconfigure_dict + ): + del reconfigure_dict["user_config"] + return self.deployment_config.user_config + _serialize(reconfigure_dict) + + return _serialize(reconfigure_dict) + + +def _serialize(json_object): + return str.encode(json.dumps(json_object, sort_keys=True)) class VersionedReplica(ABC): @property def version(self) -> DeploymentVersion: pass + + def update_state(self, state): + pass diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 8bb34bf9fc57..c86d8b749f36 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -1,7 +1,8 @@ import collections import inspect import logging -from typing import Any, Callable, Dict, Optional, Tuple, Union, overload +from typing import Any, Callable, Dict, Optional, Tuple, Union +from functools import wraps from fastapi import APIRouter, FastAPI from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag @@ -9,11 +10,12 @@ from uvicorn.config import Config from uvicorn.lifespan.on import LifespanOn +import ray from ray import cloudpickle from ray.dag import DAGNode from ray.util.annotations import Deprecated, PublicAPI -from ray.serve.application import Application +from ray.serve.built_application import BuiltApplication from ray.serve._private.client import ServeControllerClient from ray.serve.config import AutoscalingConfig, DeploymentConfig, HTTPOptions from ray.serve._private.constants import ( @@ -28,14 +30,14 @@ get_internal_replica_context, _set_global_client, ) -from ray.serve.deployment import Deployment -from ray.serve.deployment_graph import ClassNode, FunctionNode +from ray.serve.deployment import 
Application, Deployment +from ray.serve.multiplex import _ModelMultiplexWrapper from ray.serve._private.deployment_graph_build import build as pipeline_build from ray.serve._private.deployment_graph_build import ( get_and_validate_ingress_deployment, ) from ray.serve.exceptions import RayServeException -from ray.serve.handle import RayServeHandle +from ray.serve.handle import RayServeSyncHandle from ray.serve._private.http_util import ASGIHTTPSender, make_fastapi_class_based_view from ray.serve._private.logging_utils import LoggingContext from ray.serve._private.utils import ( @@ -46,10 +48,12 @@ install_serve_encoders_to_fastapi, guarded_deprecation_warning, record_serve_tag, + extract_self_if_method_call, ) from ray.serve._private import api as _private_api + logger = logging.getLogger(__file__) @@ -61,7 +65,7 @@ def start( dedicated_cpu: bool = False, **kwargs, ) -> ServeControllerClient: - """Initialize a serve instance. + """Start Serve on the cluster. By default, the instance will be scoped to the lifetime of the returned Client object (or when the script exits). If detached is set to True, the @@ -73,7 +77,7 @@ def start( detached: Whether not the instance should be detached from this script. If set, the instance will live on the Ray cluster until it is explicitly stopped with serve.shutdown(). - http_options (Optional[Dict, serve.HTTPOptions]): Configuration options + http_options: Configuration options for HTTP proxy. You can pass in a dictionary or HTTPOptions object with fields: @@ -109,10 +113,9 @@ def start( @PublicAPI(stability="stable") def shutdown() -> None: - """Completely shut down the connected Serve instance. + """Completely shut down Serve on the cluster. - Shuts down all processes and deletes all state associated with the - instance. + Deletes all applications and shuts down Serve system actors. 
""" try: @@ -130,21 +133,28 @@ def shutdown() -> None: @PublicAPI(stability="beta") def get_replica_context() -> ReplicaContext: - """If called from a deployment, returns the deployment and replica tag. + """Returns the deployment and replica tag from within a replica at runtime. A replica tag uniquely identifies a single replica for a Ray Serve - deployment at runtime. Replica tags are of the form - `#`. + deployment. Raises: RayServeException: if not called from within a Ray Serve deployment. Example: - >>> from ray import serve - >>> # deployment_name - >>> serve.get_replica_context().deployment # doctest: +SKIP - >>> # deployment_name#krcwoa - >>> serve.get_replica_context().replica_tag # doctest: +SKIP + + .. code-block:: python + + from ray import serve + @serve.deployment + class MyDeployment: + def __init__(self): + # Prints "MyDeployment" + print(serve.get_replica_context().deployment) + + # Prints "MyDeployment#" + print(serve.get_replica_context().replica_tag) + """ internal_replica_context = get_internal_replica_context() if internal_replica_context is None: @@ -157,24 +167,30 @@ def get_replica_context() -> ReplicaContext: @PublicAPI(stability="beta") -def ingress(app: Union["FastAPI", "APIRouter", Callable]): - """Mark an ASGI application ingress for Serve. - - Args: - app (FastAPI,APIRouter,Starlette,etc): the app or router object serve - as ingress for this deployment. It can be any ASGI compatible - object. +def ingress(app: Union["FastAPI", "APIRouter", Callable]) -> Callable: + """Wrap a deployment class with a FastAPI application for HTTP request parsing. Example: - >>> from fastapi import FastAPI - >>> from ray import serve - >>> app = FastAPI() # doctest: +SKIP - >>> app = FastAPI() # doctest: +SKIP - >>> @serve.deployment # doctest: +SKIP - ... @serve.ingress(app) # doctest: +SKIP - ... class App: # doctest: +SKIP - ... pass # doctest: +SKIP - >>> App.deploy() # doctest: +SKIP + + .. 
code-block:: python + + from ray import serve + from fastapi import FastAPI + + app = FastAPI() + + @serve.deployment + @serve.ingress(app) + class MyFastAPIDeployment: + @app.get("/hi") + def say_hi(self) -> str: + return "Hello world!" + + app = MyFastAPIDeployment.bind() + + Args: + app: the FastAPI app or router object to wrap this class with. + Can be any ASGI-compatible callable. """ def decorator(cls): @@ -252,32 +268,6 @@ async def __del__(self): return decorator -@overload -def deployment(func_or_class: Callable) -> Deployment: - pass - - -@overload -def deployment( - name: Default[str] = DEFAULT.VALUE, - version: Default[str] = DEFAULT.VALUE, - num_replicas: Default[int] = DEFAULT.VALUE, - init_args: Default[Tuple[Any]] = DEFAULT.VALUE, - init_kwargs: Default[Dict[Any, Any]] = DEFAULT.VALUE, - route_prefix: Default[Union[str, None]] = DEFAULT.VALUE, - ray_actor_options: Default[Dict] = DEFAULT.VALUE, - user_config: Default[Any] = DEFAULT.VALUE, - max_concurrent_queries: Default[int] = DEFAULT.VALUE, - autoscaling_config: Default[Union[Dict, AutoscalingConfig]] = DEFAULT.VALUE, - graceful_shutdown_wait_loop_s: Default[float] = DEFAULT.VALUE, - graceful_shutdown_timeout_s: Default[float] = DEFAULT.VALUE, - health_check_period_s: Default[float] = DEFAULT.VALUE, - health_check_timeout_s: Default[float] = DEFAULT.VALUE, - is_driver_deployment: Optional[bool] = DEFAULT.VALUE, -) -> Callable[[Callable], Deployment]: - pass - - @PublicAPI(stability="beta") def deployment( _func_or_class: Optional[Callable] = None, @@ -297,62 +287,56 @@ def deployment( health_check_timeout_s: Default[float] = DEFAULT.VALUE, is_driver_deployment: Optional[bool] = DEFAULT.VALUE, ) -> Callable[[Callable], Deployment]: - """Define a Serve deployment. - - Args: - name (Default[str]): Globally-unique name identifying this - deployment. If not provided, the name of the class or function will - be used. - version [DEPRECATED] (Default[str]): Version of the deployment. 
- This is used to indicate a code change for the deployment; when it - is re-deployed with a version change, a rolling update of the - replicas will be performed. If not provided, every deployment will - be treated as a new version. - num_replicas (Default[Optional[int]]): The number of processes to start up that - will handle requests to this deployment. Defaults to 1. - init_args (Default[Tuple[Any]]): Positional args to be passed to the - class constructor when starting up deployment replicas. These can - also be passed when you call `.deploy()` on the returned Deployment. - init_kwargs (Default[Dict[Any, Any]]): Keyword args to be passed to the - class constructor when starting up deployment replicas. These can - also be passed when you call `.deploy()` on the returned Deployment. - route_prefix (Default[Union[str, None]]): Requests to paths under this - HTTP path prefix will be routed to this deployment. Defaults to - '/{name}'. When set to 'None', no HTTP endpoint will be created. - Routing is done based on longest-prefix match, so if you have - deployment A with a prefix of '/a' and deployment B with a prefix - of '/a/b', requests to '/a', '/a/', and '/a/c' go to A and requests - to '/a/b', '/a/b/', and '/a/b/c' go to B. Routes must not end with - a '/' unless they're the root (just '/'), which acts as a - catch-all. - ray_actor_options (Default[Dict]): Options to be passed to the Ray - actor constructor such as resource requirements. Valid options are - `accelerator_type`, `memory`, `num_cpus`, `num_gpus`, - `object_store_memory`, `resources`, and `runtime_env`. - user_config (Default[Optional[Any]]): Config to pass to the - reconfigure method of the deployment. This can be updated - dynamically without changing the version of the deployment and - restarting its replicas. The user_config must be json-serializable - to keep track of updates, so it must only contain json-serializable - types, or json-serializable types nested in lists and dictionaries. 
- max_concurrent_queries (Default[int]): The maximum number of queries - that will be sent to a replica of this deployment without receiving - a response. Defaults to 100. - is_driver_deployment (Optional[bool]): [Experiment] when set it as True, serve - will deploy exact one deployment to every node. + """Decorator that converts a Python class to a `Deployment`. Example: - >>> from ray import serve - >>> @serve.deployment(name="deployment1") # doctest: +SKIP - ... class MyDeployment: # doctest: +SKIP - ... pass # doctest: +SKIP - >>> MyDeployment.bind(*init_args) # doctest: +SKIP - >>> MyDeployment.options( # doctest: +SKIP - ... num_replicas=2, init_args=init_args).bind() + .. code-block:: python + + from ray import serve + + @serve.deployment(num_replicas=2) + class MyDeployment: + pass + + app = MyDeployment.bind() + + Args: + name: Name uniquely identifying this deployment within the application. + If not provided, the name of the class or function is used. + num_replicas: The number of replicas to run that handle requests to + this deployment. Defaults to 1. + autoscaling_config: Parameters to configure autoscaling behavior. If this + is set, `num_replicas` cannot be set. + init_args: [DEPRECATED] These should be passed to `.bind()` instead. + init_kwargs: [DEPRECATED] These should be passed to `.bind()` instead. + route_prefix: Requests to paths under this HTTP path prefix are routed + to this deployment. Defaults to '/{name}'. This can only be set for the + ingress (top-level) deployment of an application. + ray_actor_options: Options to be passed to the Ray actor decorator, such as + resource requirements. Valid options are `accelerator_type`, `memory`, + `num_cpus`, `num_gpus`, `object_store_memory`, `resources`, + and `runtime_env`. + user_config: Config to pass to the reconfigure method of the deployment. This + can be updated dynamically without restarting the replicas of the + deployment. The user_config must be fully JSON-serializable. 
+ max_concurrent_queries: The maximum number of queries that are sent to a + replica of this deployment without receiving a response. Defaults to 100. + health_check_period_s: How often the health check is called on the replica. + Defaults to 10s. The health check is by default a no-op actor call to the + replica, but you can define your own as a "check_health" method that raises + an exception when unhealthy. + health_check_timeout_s: How long to wait for a health check method to return + before considering it failed. Defaults to 30s. + graceful_shutdown_wait_loop_s: Duration that replicas wait until there is + no more work to be done before shutting down. + graceful_shutdown_timeout_s: Duration that a replica can be gracefully shutting + down before being forcefully killed. + is_driver_deployment: [EXPERIMENTAL] when set, exactly one replica of this + deployment runs on every node (like a daemon set). Returns: - Deployment + `Deployment` """ # NOTE: The user_configured_option_names should be the first thing that's @@ -458,24 +442,25 @@ def list_deployments() -> Dict[str, Deployment]: @PublicAPI(stability="beta") def run( - target: Union[ClassNode, FunctionNode], + target: Application, _blocking: bool = True, host: str = DEFAULT_HTTP_HOST, port: int = DEFAULT_HTTP_PORT, name: str = SERVE_DEFAULT_APP_NAME, route_prefix: str = DEFAULT.VALUE, -) -> Optional[RayServeHandle]: - """Run a Serve application and return a ServeHandle to the ingress. +) -> Optional[RayServeSyncHandle]: + """Run an application and return a handle to its ingress deployment. - Either a ClassNode, FunctionNode, or a pre-built application - can be passed in. If a node is passed in, all of the deployments it depends - on will be deployed. If there is an ingress, its handle will be returned. + The application is returned by `Deployment.bind()`. Example: + + .. 
code-block:: python + + handle = serve.run(MyDeployment.bind()) + ray.get(handle.remote()) Args: - target (Union[ClassNode, FunctionNode, Application]): - A user-built Serve Application or a ClassNode that acts as the - root node of DAG. By default ClassNode is the Driver - deployment unless user provides a customized one. + target: + A Serve application returned by `Deployment.bind()`. host: Host for HTTP servers to listen on. Defaults to "127.0.0.1". To expose Serve publicly, you probably want to set this to "0.0.0.0". @@ -487,8 +472,7 @@ def run( nor in the ingress deployment, the route prefix will default to '/'. Returns: - RayServeHandle: A regular ray serve handle that can be called by user - to execute the serve DAG. + RayServeSyncHandle: A handle that can be used to call the application. """ client = _private_api.serve_start( detached=True, @@ -499,35 +483,20 @@ def run( record_extra_usage_tag(TagKey.SERVE_API_VERSION, "v2") if isinstance(target, Application): + deployments = pipeline_build(target._get_internal_dag_node(), name) + ingress = get_and_validate_ingress_deployment(deployments) + elif isinstance(target, BuiltApplication): deployments = list(target.deployments.values()) ingress = target.ingress - # Each DAG should always provide a valid Driver ClassNode - elif isinstance(target, ClassNode): - deployments = pipeline_build(target, name) - ingress = get_and_validate_ingress_deployment(deployments) - # Special case where user is doing single function serve.run(func.bind()) - elif isinstance(target, FunctionNode): - deployments = pipeline_build(target, name) - ingress = get_and_validate_ingress_deployment(deployments) - if len(deployments) != 1: - raise ValueError( - "We only support single function node in serve.run, ex: " - "serve.run(func.bind()). For more than one nodes in your DAG, " - "Please provide a driver class and bind it as entrypoint to " - "your Serve DAG." 
- ) - elif isinstance(target, DAGNode): - raise ValueError( - "Invalid DAGNode type as entry to serve.run(), " - f"type: {type(target)}, accepted: ClassNode, " - "FunctionNode please provide a driver class and bind it " - "as entrypoint to your Serve DAG." - ) else: - raise TypeError( - "Expected a ClassNode, FunctionNode, or Application as target. " - f"Got unexpected type {type(target)} instead." + msg = ( + "`serve.run` expects an `Application` returned by `Deployment.bind()` " + "or a static `BuiltApplication` returned by `serve.build`." ) + if isinstance(target, DAGNode): + msg += " If you are using the DAG API, you must bind the DAG node to a " + "deployment like: `app = Deployment.bind(my_dag_output)`. " + raise TypeError(msg) # when name provided, keep all existing applications # otherwise, delete all of them. @@ -567,30 +536,25 @@ def run( @PublicAPI(stability="alpha") -def build( - target: Union[ClassNode, FunctionNode], name: str = SERVE_DEFAULT_APP_NAME -) -> Application: - """Builds a Serve application into a static application. - - Takes in a ClassNode or FunctionNode and converts it to a - Serve application consisting of one or more deployments. This is intended - to be used for production scenarios and deployed via the Serve REST API or - CLI, so there are some restrictions placed on the deployments: - 1) All of the deployments must be importable. That is, they cannot be - defined in __main__ or inline defined. The deployments will be - imported in production using the same import path they were here. - 2) All arguments bound to the deployment must be JSON-serializable. +def build(target: Application, name: str = None) -> BuiltApplication: + """Builds a Serve application into a static, built application. + + Resolves the provided Application object into a list of deployments. + This can be converted to a Serve config file that can be deployed via + the Serve REST API or CLI. 
- The returned Application object can be exported to a dictionary or YAML - config. + All of the deployments must be importable. That is, they cannot be + defined in __main__ or inline defined. The deployments will be + imported in the config file using the same import path they were here. Args: - target (Union[ClassNode, FunctionNode]): A ClassNode or FunctionNode - that acts as the top level node of the DAG. - name: The name of the Serve application. + target: The Serve application to run consisting of one or more + deployments. + name: The name of the Serve application. When name is not provided, the + deployment name won't be updated. (SINGLE_APP use case.) Returns: - The static built Serve application + The static built Serve application. """ if in_interactive_shell(): raise RuntimeError( @@ -601,17 +565,182 @@ def build( # TODO(edoakes): this should accept host and port, but we don't # currently support them in the REST API. - return Application(pipeline_build(target, name)) + return BuiltApplication(pipeline_build(target._get_internal_dag_node(), name)) @PublicAPI(stability="alpha") def delete(name: str, _blocking: bool = True): - """Delete an app by its name + """Delete an application by its name. Deletes the app with all corresponding deployments. - - Args: - name: the name of app to delete. """ client = get_global_client() client.delete_apps([name], blocking=_blocking) + + +@PublicAPI(stability="alpha") +def multiplexed( + func: Optional[Callable[..., Any]] = None, max_num_models_per_replica: int = 3 +): + """[EXPERIMENTAL] Defines a function or method used to load multiplexed + models in a replica. + + The function can be standalone function or a method of a class. The + function must have exactly one argument, the model id of type `str` for the + model to be loaded. + + It is required to define the function with `async def` and the function must be + an async function. 
It is recommended to define coroutines for long running + IO tasks in the function to avoid blocking the event loop. + + The multiplexed function is called to load a model with the given model ID when + necessary. + + When the number of models in one replica is larger than max_num_models_per_replica, + the models will be unloaded using an LRU policy. + + If you want to release resources after the model is loaded, you can define + a `__del__` method in your model class. The `__del__` method will be called when + the model is unloaded. + + Example: + + .. code-block:: python + from ray import serve + + @serve.deployment + class MultiplexedDeployment: + + def __init__(self): + # Define s3 base path to load models. + self.s3_base_path = "s3://my_bucket/my_models" + + @serve.multiplexed(max_num_models_per_replica=5) + async def load_model(self, model_id: str) -> Any: + # Load model with the given tag + # You can use any model loading library here + # and return the loaded model. load_from_s3 is + # a placeholder function. + return load_from_s3(model_id) + + async def __call__(self, request): + # Get the model_id from the request context. + model_id = serve.get_multiplexed_model_id() + # Load the model for the requested model_id. + # If the model is already cached locally, + # this will just be a dictionary lookup. + model = await self.load_model(model_id) + return model(request) + + + Args: + max_num_models_per_replica: the maximum number of models + to be loaded on each replica. By default, it is 3, which + means that each replica can cache up to 3 models. You can + set it to a larger number if you have enough memory on + the node resource, in opposite, you can set it to a smaller + number if you want to save memory on the node resource. + """ + + if func is not None: + if not callable(func): + raise TypeError( + "The `multiplexed` decorator must be used with a function or method." + ) + + # TODO(Sihan): Make the API accept the sync function as well. 
+ # https://github.com/ray-project/ray/issues/35356 + if not inspect.iscoroutinefunction(func): + raise TypeError( + "@serve.multiplexed can only be used to decorate async " + "functions or methods." + ) + signature = inspect.signature(func) + if len(signature.parameters) == 0 or len(signature.parameters) > 2: + raise TypeError( + "@serve.multiplexed can only be used to decorate functions or methods " + "with at least one 'model_id: str' argument." + ) + + if type(max_num_models_per_replica) is not int: + raise TypeError("max_num_models_per_replica must be an integer.") + + if max_num_models_per_replica != -1 and max_num_models_per_replica <= 0: + raise ValueError("max_num_models_per_replica must be positive.") + + def _multiplex_decorator(func: Callable): + @wraps(func) + async def _multiplex_wrapper(*args): + args_check_error_msg = ( + "Functions decorated with `@serve.multiplexed` must take exactly one" + "the multiplexed model ID (str), but got {}" + ) + if not args: + raise TypeError( + args_check_error_msg.format("no arguments are provided.") + ) + self = extract_self_if_method_call(args, func) + + # User defined multiplexed function can be a standalone function or a + # method of a class. If it is a method of a class, the first argument + # is self. + if self is None: + if len(args) != 1: + raise TypeError( + args_check_error_msg.format("more than one arguments.") + ) + multiplex_object = func + model_id = args[0] + else: + # count self as an argument + if len(args) != 2: + raise TypeError( + args_check_error_msg.format("more than one arguments.") + ) + multiplex_object = self + model_id = args[1] + multiplex_attr = f"__serve_multiplex_{func.__name__}" + # If the multiplexed function is called for the first time, + # create a model multiplex wrapper and cache it in the multiplex object. 
+ if not hasattr(multiplex_object, multiplex_attr): + model_multiplex_wrapper = _ModelMultiplexWrapper( + func, self, max_num_models_per_replica + ) + setattr(multiplex_object, multiplex_attr, model_multiplex_wrapper) + else: + model_multiplex_wrapper = getattr(multiplex_object, multiplex_attr) + return await model_multiplex_wrapper.load_model(model_id) + + return _multiplex_wrapper + + return _multiplex_decorator(func) if callable(func) else _multiplex_decorator + + +@PublicAPI(stability="alpha") +def get_multiplexed_model_id() -> str: + """[EXPERIMENTAL] Get the multiplexed model ID for the current request. + + This is used with a function decorated with `@serve.multiplexed` + to retrieve the model ID for the current request. + + .. code-block:: python + import ray + from ray import serve + import requests + + # Set the multiplexed model id with the key + # "ray_serve_multiplexed_model_id" in the request + # headers when sending requests to the http proxy. + requests.get("http://localhost:8000", + headers={"ray_serve_multiplexed_model_id": "model_1"}) + # This can also be set when using `RayServeHandle`. + handle.options(multiplexed_model_id="model_1").remote("blablabla") + + # In your deployment code, you can retrieve the model id from + # `get_multiplexed_model_id()`. 
+ @serve.deployment + def my_deployment_function(request): + assert serve.get_multiplexed_model_id() == "model_1" + """ + _request_context = ray.serve.context._serve_request_context.get() + return _request_context.multiplexed_model_id diff --git a/python/ray/serve/batching.py b/python/ray/serve/batching.py index 43a354b56769..219e954a6831 100644 --- a/python/ray/serve/batching.py +++ b/python/ray/serve/batching.py @@ -9,6 +9,7 @@ from ray._private.signature import extract_signature, flatten_args, recover_args from ray._private.utils import get_or_create_event_loop from ray.serve.exceptions import RayServeException +from ray.serve._private.utils import extract_self_if_method_call from ray.util.annotations import PublicAPI @@ -171,30 +172,6 @@ def __del__(self): self._handle_batch_task.cancel() -def _extract_self_if_method_call(args: List[Any], func: Callable) -> Optional[object]: - """Check if this is a method rather than a function. - - Does this by checking to see if `func` is the attribute of the first - (`self`) argument under `func.__name__`. Unfortunately, this is the most - robust solution to this I was able to find. It would also be preferable - to do this check when the decorator runs, rather than when the method is. - - Returns the `self` object if it's a method call, else None. - - Arguments: - args (List[Any]): arguments to the function/method call. - func: the unbound function that was called. - """ - if len(args) > 0: - method = getattr(args[0], func.__name__, False) - if method: - wrapped = getattr(method, "__wrapped__", False) - if wrapped and wrapped == func: - return args[0] - - return None - - T = TypeVar("T") R = TypeVar("R") F = TypeVar("F", bound=Callable[[List[T]], List[R]]) @@ -210,13 +187,17 @@ def batch(func: F) -> G: # "Decorator factory" use case (called with arguments). 
@overload def batch( - max_batch_size: Optional[int] = 10, batch_wait_timeout_s: Optional[float] = 0.0 + max_batch_size: int = 10, batch_wait_timeout_s: float = 0.0 ) -> Callable[[F], G]: pass @PublicAPI(stability="beta") -def batch(_func=None, max_batch_size=10, batch_wait_timeout_s=0.0): +def batch( + _func: Optional[Callable] = None, + max_batch_size: int = 10, + batch_wait_timeout_s: float = 0.0, +): """Converts a function to asynchronously handle batches. The function can be a standalone function or a class method. In both @@ -228,19 +209,33 @@ def batch(_func=None, max_batch_size=10, batch_wait_timeout_s=0.0): or `batch_wait_timeout_s` has elapsed, whichever occurs first. Example: - >>> from ray import serve - >>> @serve.batch(max_batch_size=50, batch_wait_timeout_s=0.5) # doctest: +SKIP - ... async def handle_batch(batch: List[str]): # doctest: +SKIP - ... return [s.lower() for s in batch] # doctest: +SKIP - >>> async def handle_single(s: str): # doctest: +SKIP - ... # Returns s.lower(). - ... return await handle_batch(s) # doctest: +SKIP + + .. code-block:: python + + from ray import serve + from starlette.requests import Request + + @serve.deployment + class BatchedDeployment: + @serve.batch(max_batch_size=10, batch_wait_timeout_s=0.1) + async def batch_handler(self, requests: List[Request]) -> List[str]: + response_batch = [] + for r in requests: + name = (await requests.json())["name"] + response_batch.append(f"Hello {name}!") + + return response_batch + + async def __call__(self, request: Request): + return await self.batch_handler(request) + + app = BatchedDeployment.bind() Arguments: max_batch_size: the maximum batch size that will be executed in one call to the underlying function. batch_wait_timeout_s: the maximum duration to wait for - `max_batch_size` elements before running the underlying function. + `max_batch_size` elements before running the current batch. """ # `_func` will be None in the case when the decorator is parametrized. 
# See the comment at the end of this function for a detailed explanation. @@ -271,7 +266,7 @@ def batch(_func=None, max_batch_size=10, batch_wait_timeout_s=0.0): def _batch_decorator(_func): @wraps(_func) async def batch_wrapper(*args, **kwargs): - self = _extract_self_if_method_call(args, _func) + self = extract_self_if_method_call(args, _func) flattened_args: List = flatten_args(extract_signature(_func), args, kwargs) if self is None: diff --git a/python/ray/serve/application.py b/python/ray/serve/built_application.py similarity index 95% rename from python/ray/serve/application.py rename to python/ray/serve/built_application.py index 942cfdec1f94..0f450d8094ec 100644 --- a/python/ray/serve/application.py +++ b/python/ray/serve/built_application.py @@ -5,10 +5,10 @@ ) from ray.serve.deployment import Deployment -from ray.util.annotations import DeveloperAPI +from ray.util.annotations import PublicAPI -@DeveloperAPI +@PublicAPI(stability="alpha") class ImmutableDeploymentDict(dict): def __init__(self, deployments: Dict[str, Deployment]): super().__init__() @@ -22,8 +22,8 @@ def __setitem__(self, *args): ) -@DeveloperAPI -class Application: +@PublicAPI(stability="alpha") +class BuiltApplication: """A static, pre-built Serve application. An application consists of a number of Serve deployments that can send diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 6c5b4c891747..b0a6299a094c 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -12,6 +12,7 @@ NonNegativeInt, PositiveInt, validator, + Field, ) from ray import cloudpickle @@ -23,7 +24,7 @@ DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT, ) -from ray.serve._private.utils import DEFAULT +from ray.serve._private.utils import DEFAULT, DeploymentOptionUpdateType from ray.serve.generated.serve_pb2 import ( DeploymentConfig as DeploymentConfigProto, DeploymentLanguage, @@ -141,21 +142,37 @@ class DeploymentConfig(BaseModel): The names of options manually configured by the user. 
""" - num_replicas: NonNegativeInt = 1 - max_concurrent_queries: Optional[int] = None - user_config: Any = None + num_replicas: NonNegativeInt = Field( + default=1, update_type=DeploymentOptionUpdateType.LightWeight + ) + max_concurrent_queries: Optional[int] = Field( + default=None, update_type=DeploymentOptionUpdateType.NeedsReconfigure + ) + user_config: Any = Field( + default=None, update_type=DeploymentOptionUpdateType.NeedsActorReconfigure + ) - graceful_shutdown_timeout_s: NonNegativeFloat = ( - DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S # noqa: E501 + graceful_shutdown_timeout_s: NonNegativeFloat = Field( + default=DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S, + update_type=DeploymentOptionUpdateType.NeedsReconfigure, ) - graceful_shutdown_wait_loop_s: NonNegativeFloat = ( - DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S # noqa: E501 + graceful_shutdown_wait_loop_s: NonNegativeFloat = Field( + default=DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S, + update_type=DeploymentOptionUpdateType.NeedsActorReconfigure, ) - health_check_period_s: PositiveFloat = DEFAULT_HEALTH_CHECK_PERIOD_S - health_check_timeout_s: PositiveFloat = DEFAULT_HEALTH_CHECK_TIMEOUT_S + health_check_period_s: PositiveFloat = Field( + default=DEFAULT_HEALTH_CHECK_PERIOD_S, + update_type=DeploymentOptionUpdateType.NeedsReconfigure, + ) + health_check_timeout_s: PositiveFloat = Field( + default=DEFAULT_HEALTH_CHECK_TIMEOUT_S, + update_type=DeploymentOptionUpdateType.NeedsReconfigure, + ) - autoscaling_config: Optional[AutoscalingConfig] = None + autoscaling_config: Optional[AutoscalingConfig] = Field( + default=None, update_type=DeploymentOptionUpdateType.LightWeight + ) # This flag is used to let replica know they are deplyed from # a different language. @@ -165,7 +182,10 @@ class DeploymentConfig(BaseModel): # the deploymnent use. 
deployment_language: Any = DeploymentLanguage.PYTHON - version: Optional[str] = None + version: Optional[str] = Field( + default=None, + update_type=DeploymentOptionUpdateType.HeavyWeight, + ) # Contains the names of deployment options manually set by the user user_configured_option_names: Set[str] = set() diff --git a/python/ray/serve/context.py b/python/ray/serve/context.py index 3497c55b138c..0823ceac2d88 100644 --- a/python/ray/serve/context.py +++ b/python/ray/serve/context.py @@ -31,6 +31,7 @@ class ReplicaContext: replica_tag: ReplicaTag _internal_controller_name: str servable_object: Callable + app_name: str @PublicAPI(stability="alpha") @@ -73,10 +74,11 @@ def _set_internal_replica_context( replica_tag: ReplicaTag, controller_name: str, servable_object: Callable, + app_name: str, ): global _INTERNAL_REPLICA_CONTEXT _INTERNAL_REPLICA_CONTEXT = ReplicaContext( - deployment, replica_tag, controller_name, servable_object + deployment, replica_tag, controller_name, servable_object, app_name ) @@ -98,7 +100,7 @@ def _connect() -> ServeControllerClient: """ # Initialize ray if needed. 
- ray._private.worker.global_worker.filter_logs_by_job = False + ray._private.worker.global_worker._filter_logs_by_job = False if not ray.is_initialized(): ray.init(namespace=SERVE_NAMESPACE) @@ -146,6 +148,8 @@ def _connect() -> ServeControllerClient: class RequestContext: route: str = "" request_id: str = "" + app_name: str = "" + multiplexed_model_id: str = "" _serve_request_context = contextvars.ContextVar( diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index be60f3e2190b..360a2ae57a97 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -23,6 +23,7 @@ RunningReplicaInfo, StatusOverview, ServeDeployMode, + MultiplexedReplicaInfo, ) from ray.serve.config import HTTPOptions from ray.serve._private.constants import ( @@ -32,15 +33,22 @@ SERVE_ROOT_URL_ENV_KEY, SERVE_NAMESPACE, RAY_INTERNAL_SERVE_CONTROLLER_PIN_ON_NODE, + RECOVERING_LONG_POLL_BROADCAST_TIMEOUT_S, SERVE_DEFAULT_APP_NAME, DEPLOYMENT_NAME_PREFIX_SEPARATOR, MULTI_APP_MIGRATION_MESSAGE, ) -from ray.serve._private.deploy_utils import deploy_args_to_deployment_info +from ray.serve._private.deploy_utils import ( + deploy_args_to_deployment_info, + get_app_code_version, +) from ray.serve._private.deployment_state import DeploymentStateManager, ReplicaState from ray.serve._private.endpoint_state import EndpointState from ray.serve._private.http_state import HTTPState -from ray.serve._private.logging_utils import configure_component_logger +from ray.serve._private.logging_utils import ( + configure_component_logger, + get_component_logger_file_path, +) from ray.serve._private.long_poll import LongPollHost from ray.serve.exceptions import RayServeException from ray.serve.schema import ( @@ -49,15 +57,14 @@ ApplicationDetails, ServeInstanceDetails, HTTPOptionsSchema, + ServeActorDetails, ) from ray.serve._private.storage.kv_store import RayInternalKVStore from ray.serve._private.utils import ( DEFAULT, override_runtime_envs_except_env_vars, - 
get_random_letters, ) from ray.serve._private.application_state import ApplicationStateManager -from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -120,6 +127,7 @@ async def __init__( self.deployment_stats = defaultdict(lambda: defaultdict(dict)) self.long_poll_host = LongPollHost() + self.done_recovering_event = asyncio.Event() if _disable_http_proxy: self.http_state = None @@ -158,6 +166,15 @@ async def __init__( # Keep track of single-app vs multi-app self.deploy_mode = ServeDeployMode.UNSET + # Controller actor details + self._actor_details = ServeActorDetails( + node_id=ray.get_runtime_context().get_node_id(), + node_ip=ray.util.get_node_ip_address(), + actor_id=ray.get_runtime_context().get_actor_id(), + actor_name=self.controller_name, + worker_id=ray._private.worker.global_worker.worker_id.hex(), + log_file_path=get_component_logger_file_path(), + ) run_background_task(self.run_control_loop()) @@ -197,6 +214,9 @@ async def listen_for_change(self, keys_to_snapshot_ids: Dict[str, int]): determine whether or not the host should immediately return the data or wait for the value to be changed. """ + if not self.done_recovering_event.is_set(): + await self.done_recovering_event.wait() + return await (self.long_poll_host.listen_for_change(keys_to_snapshot_ids)) async def listen_for_change_java(self, keys_to_snapshot_ids_bytes: bytes): @@ -206,6 +226,9 @@ async def listen_for_change_java(self, keys_to_snapshot_ids_bytes: bytes): keys_to_snapshot_ids_bytes (Dict[str, int]): the protobuf bytes of keys_to_snapshot_ids (Dict[str, int]). 
""" + if not self.done_recovering_event.is_set(): + await self.done_recovering_event.wait() + return await ( self.long_poll_host.listen_for_change_java(keys_to_snapshot_ids_bytes) ) @@ -250,16 +273,35 @@ async def run_control_loop(self) -> None: # NOTE(edoakes): we catch all exceptions here and simply log them, # because an unhandled exception would cause the main control loop to # halt, which should *never* happen. + recovering_timeout = RECOVERING_LONG_POLL_BROADCAST_TIMEOUT_S + start_time = time.time() while True: - if self.http_state: + if ( + not self.done_recovering_event.is_set() + and time.time() - start_time > recovering_timeout + ): + logger.warning( + f"Replicas still recovering after {recovering_timeout}s, " + "setting done recovering event to broadcast long poll updates." + ) + self.done_recovering_event.set() + + # Don't update http_state until after the done recovering event is set, + # otherwise we may start a new HTTP proxy but not broadcast it any + # info about available deployments & their replicas. 
+ if self.http_state and self.done_recovering_event.is_set(): try: self.http_state.update() except Exception: logger.exception("Exception updating HTTP state.") + try: - self.deployment_state_manager.update() + any_recovering = self.deployment_state_manager.update() + if not self.done_recovering_event.is_set() and not any_recovering: + self.done_recovering_event.set() except Exception: logger.exception("Exception updating deployment state.") + try: self.application_state_manager.update() except Exception: @@ -320,21 +362,16 @@ def _recover_config_from_checkpoint(self): deployment_time, deploy_mode, config_checkpoints_dict = pickle.loads( checkpoint ) - applications = [ - app_config_dict - for app_config_dict, _ in config_checkpoints_dict.values() - ] + applications = list(config_checkpoints_dict.values()) if deploy_mode == ServeDeployMode.SINGLE_APP: self.deploy_apps( ServeApplicationSchema.parse_obj(applications[0]), deployment_time, - False, ) else: self.deploy_apps( ServeDeploySchema.parse_obj({"applications": applications}), deployment_time, - False, ) def _all_running_replicas(self) -> Dict[str, List[RunningReplicaInfo]]: @@ -383,21 +420,27 @@ def deploy( deployer_job_id: Union[str, bytes], docs_path: Optional[str] = None, is_driver_deployment: Optional[bool] = False, + app_name: str = None, ) -> bool: """Deploys a deployment.""" - if route_prefix is not None: assert route_prefix.startswith("/") if docs_path is not None: assert docs_path.startswith("/") + # app_name is None for V1 API, reset it to empty string to avoid + # breaking metrics. 
+ if app_name is None: + app_name = "" + deployment_info = deploy_args_to_deployment_info( deployment_name=name, deployment_config_proto_bytes=deployment_config_proto_bytes, replica_config_proto_bytes=replica_config_proto_bytes, deployer_job_id=deployer_job_id, - previous_deployment=self.deployment_state_manager.get_deployment(name), + route_prefix=route_prefix, is_driver_deployment=is_driver_deployment, + app_name=app_name, ) # TODO(architkulkarni): When a deployment is redeployed, even if @@ -406,7 +449,7 @@ def deploy( updating = self.deployment_state_manager.deploy(name, deployment_info) if route_prefix is not None: - endpoint_info = EndpointInfo(route=route_prefix) + endpoint_info = EndpointInfo(route=route_prefix, app_name=app_name) self.endpoint_state.update_endpoint(name, endpoint_info) else: self.endpoint_state.delete_endpoint(name) @@ -442,7 +485,6 @@ def deploy_apps( self, config: Union[ServeApplicationSchema, ServeDeploySchema], deployment_time: float = 0, - _internal: bool = False, ) -> None: """Kicks off a task that deploys a set of Serve applications. @@ -463,11 +505,6 @@ def deploy_apps( deployment_time: set deployment_timestamp. If not provided, time.time() is used to indicate the deployment time. - - _internal: whether the config is provided by user or internally (i.e. it is - restored from a checkpoint). If it is provided by the user, we need to - prepend the app name to each deployment name. If not, it should already - be prepended. """ # TODO (zcin): We should still support single-app mode, i.e. # ServeApplicationSchema. 
Eventually, after migration is complete, we should @@ -512,38 +549,13 @@ def deploy_apps( if not deployment_time: deployment_time = time.time() - # Load checkpointed data from last time deploy_apps was called - config_checkpoint = self.kv_store.get(CONFIG_CHECKPOINT_KEY) - if config_checkpoint is None: - config_checkpoints_dict = {} - else: - _, _, config_checkpoints_dict = pickle.loads(config_checkpoint) - new_config_checkpoint = {} for app_config in applications: - app_config_dict = app_config.dict(exclude_unset=True) - - # Compare new config options with old ones, set versions of new deployments - if app_config.name in config_checkpoints_dict: - (prev_app_config, prev_versions) = config_checkpoints_dict[ - app_config.name - ] - - updated_versions = _generate_deployment_config_versions( - app_config_dict, - prev_app_config, - prev_versions, - ) - else: - updated_versions = _generate_deployment_config_versions(app_config_dict) - - deployment_override_options = app_config_dict.get("deployments", []) + code_version = get_app_code_version(app_config) - new_config_checkpoint[app_config.name] = ( - app_config_dict, - updated_versions, - ) + app_config_dict = app_config.dict(exclude_unset=True) + new_config_checkpoint[app_config.name] = app_config_dict logger.info( "Starting deploy_serve_application " @@ -554,10 +566,11 @@ def deploy_apps( ).remote( app_config.import_path, app_config.runtime_env, - deployment_override_options, - updated_versions, + app_config_dict.get("deployments", []), + code_version, app_config_dict.get("route_prefix", DEFAULT.VALUE), app_config.name, + app_config.args, ) self.application_state_manager.create_application_state( @@ -702,11 +715,15 @@ def get_serve_instance_details(self) -> Dict: # route_prefix is set instead in each application. # Eventually we want to remove route_prefix from DeploymentSchema. 
return ServeInstanceDetails( + controller_info=self._actor_details, proxy_location=http_config.location, http_options=HTTPOptionsSchema( host=http_config.host, port=http_config.port, ), + http_proxies=self.http_state.get_http_proxy_details() + if self.http_state + else None, deploy_mode=self.deploy_mode, applications=applications, ).dict(exclude_unset=True) @@ -746,7 +763,7 @@ def get_app_config(self, name: str = SERVE_DEFAULT_APP_NAME) -> Optional[Dict]: if checkpoint is not None: _, _, config_checkpoints_dict = pickle.loads(checkpoint) if name in config_checkpoints_dict: - config, _ = config_checkpoints_dict[name] + config = config_checkpoints_dict[name] return ServeApplicationSchema.parse_obj(config).dict(exclude_unset=True) def get_all_deployment_statuses(self) -> List[bytes]: @@ -780,97 +797,13 @@ def delete_apps(self, names: Iterable[str]): self.application_state_manager.delete_application(name) self.delete_deployments(deployments_to_delete) - -def _generate_deployment_config_versions( - new_config: Dict, - last_deployed_config: Dict = None, - last_deployed_versions: Dict = None, -) -> Dict[str, str]: - """ - This function determines whether each deployment's version should be changed based - on the newly deployed config. - - When ``import_path`` or ``runtime_env`` is changed, the versions for all deployments - should be changed, so old replicas are torn down. When the options for a deployment - in ``deployments`` change, its version should generally change. The only deployment - options that can be changed without tearing down replicas (i.e. 
changing the - version) are: - * num_replicas - * user_config - * autoscaling_config - - A deployment option is considered changed when: - * it was not specified in last_deployed_config and is specified in new_config - * it was specified in last_deployed_config and is not specified in new_config - * it is specified in both last_deployed_config and new_config but the specified - value has changed - - Args: - new_config: Newly deployed config dict that follows ServeApplicationSchema - last_deployed_config: Last deployed config dict that follows - ServeApplicationSchema, which is an empty dictionary if there is no previous - deployment - last_deployed_versions: Dictionary of {deployment_name: str -> version: str} - tracking the versions of deployments listed in the last deployed config - - Returns: - Dictionary of {deployment_name: str -> version: str} containing updated - versions for deployments listed in the new config - """ - # If import_path or runtime_env is changed, it is considered a code change - if last_deployed_config is None: - last_deployed_config = {} - if last_deployed_versions is None: - last_deployed_versions = {} - - if last_deployed_config.get("import_path") != new_config.get( - "import_path" - ) or last_deployed_config.get("runtime_env") != new_config.get("runtime_env"): - last_deployed_config, last_deployed_versions = {}, {} - - new_deployments = {d["name"]: d for d in new_config.get("deployments", [])} - old_deployments = { - d["name"]: d for d in last_deployed_config.get("deployments", []) - } - - lightweight_update_options = { - "num_replicas": TagKey.SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED, - "user_config": TagKey.SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED, - "autoscaling_config": TagKey.SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED, - } - - def exclude_lightweight_update_options(dict): - # Exclude config options from dict that qualify for a lightweight config - # update. 
Changes in any other config options are considered a code change, - # and require a version change to trigger an update that tears - # down existing replicas and replaces them with updated ones. - return { - option: dict[option] - for option in dict - if option not in lightweight_update_options - } - - updated_versions = {} - for name in new_deployments: - old_deployment = old_deployments.get(name, {}) - new_deployment = new_deployments[name] - new_deployment_filtered = exclude_lightweight_update_options(new_deployment) - old_deployment_filtered = exclude_lightweight_update_options(old_deployment) - - # If config options haven't changed, version stays the same - # otherwise, generate a new random version - if old_deployment_filtered == new_deployment_filtered: - updated_versions[name] = last_deployed_versions[name] - - # If the rest of the options haven't changed, but a lightweight option has - # changed, then Serve will execute a lightweight update - for option, tagkey in lightweight_update_options.items(): - if old_deployment.get(option) != new_deployment.get(option): - record_extra_usage_tag(tagkey, "True") - else: - updated_versions[name] = get_random_letters() - - return updated_versions + def record_multiplexed_replica_info(self, info: MultiplexedReplicaInfo): + """Record multiplexed model ids for a replica of deployment + Args: + info: MultiplexedReplicaInfo including deployment name, replica tag and + model ids. + """ + self.deployment_state_manager.record_multiplexed_replica_info(info) @ray.remote(num_cpus=0, max_calls=1) @@ -878,9 +811,10 @@ def deploy_serve_application( import_path: str, runtime_env: Dict, deployment_override_options: List[Dict], - deployment_versions: Dict, + code_version: str, route_prefix: str, name: str, + args: Dict, ): """Deploy Serve application from a user-provided config. 
@@ -899,11 +833,13 @@ def deploy_serve_application( try: from ray import serve from ray.serve.api import build + from ray.serve._private.api import call_app_builder_with_args_if_necessary # Import and build the application. - app = build(import_attr(import_path), name) + app = call_app_builder_with_args_if_necessary(import_attr(import_path), args) + app = build(app, name) - # Override options for each deployment. + # Override options for each deployment listed in the config. for options in deployment_override_options: deployment_name = options["name"] unique_deployment_name = ( @@ -932,13 +868,18 @@ def deploy_serve_application( ) ray_actor_options.update({"runtime_env": merged_env}) options["ray_actor_options"] = ray_actor_options - options["version"] = deployment_versions[deployment_name] options["name"] = unique_deployment_name # Update the deployment's options app.deployments[unique_deployment_name].set_options( **options, _internal=True ) + # Set code version for each deployment + for deployment_name in app.deployments: + app.deployments[deployment_name].set_options( + version=code_version, _internal=True + ) + # Run the application locally on the cluster. serve.run(app, name=name, route_prefix=route_prefix) except KeyboardInterrupt: diff --git a/python/ray/serve/deployment.py b/python/ray/serve/deployment.py index b3a900ddd55d..bd776e28afd3 100644 --- a/python/ray/serve/deployment.py +++ b/python/ray/serve/deployment.py @@ -12,6 +12,7 @@ from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag from ray.serve.context import get_global_client +from ray.dag.dag_node import DAGNodeBase from ray.dag.class_node import ClassNode from ray.dag.function_node import FunctionNode from ray.serve.config import ( @@ -31,8 +32,95 @@ logger = logging.getLogger(SERVE_LOGGER_NAME) +@PublicAPI(stability="beta") +class Application(DAGNodeBase): + """One or more deployments bound with arguments that can be deployed together. 
+ + Can be passed into another `Deployment.bind()` to compose multiple deployments in a + single application, passed to `serve.run`, or deployed via a Serve config file. + + For example, to define an Application and run it in Python: + + .. code-block:: python + + from ray import serve + from ray.serve import Application + + @serve.deployment + class MyDeployment: + pass + + app: Application = MyDeployment.bind(OtherDeployment.bind()) + serve.run(app) + + To run the same app using the command line interface (CLI): + + .. code-block:: bash + + serve run python_file:app + + To deploy the same app via a config file: + + .. code-block:: yaml + + applications: + my_app: + import_path: python_file:app + + """ + + def __init__( + self, *, _internal_dag_node: Optional[Union[ClassNode, FunctionNode]] = None + ): + if _internal_dag_node is None: + raise RuntimeError("This class should not be constructed directly.") + + self._internal_dag_node = _internal_dag_node + + def _get_internal_dag_node(self) -> Union[ClassNode, FunctionNode]: + if self._internal_dag_node is None: + raise RuntimeError("Application object should not be constructed directly.") + + return self._internal_dag_node + + @classmethod + def _from_internal_dag_node(cls, dag_node: Union[ClassNode, FunctionNode]): + return cls(_internal_dag_node=dag_node) + + # Proxy all method calls to the underlying DAG node. This allows this class to be + # passed in place of the ClassNode or FunctionNode in the DAG building code. + def __getattr__(self, name: str) -> Any: + return getattr(self._get_internal_dag_node(), name) + + @PublicAPI class Deployment: + """Class (or function) decorated with the `@serve.deployment` decorator. + + This is run on a number of replica actors. Requests to those replicas call + this class. + + One or more deployments can be composed together into an `Application` which is + then run via `serve.run` or a config file. + + Example: + + .. 
code-block:: python + + @serve.deployment + class MyDeployment: + def __init__(self, name: str): + self._name = name + + def __call__(self, request): + return "Hello world!" + + app = MyDeployment.bind() + # Run via `serve.run` or the `serve run` CLI command. + serve.run(app) + + """ + def __init__( self, func_or_class: Union[Callable, str], @@ -46,13 +134,6 @@ def __init__( is_driver_deployment: Optional[bool] = False, _internal=False, ) -> None: - """Construct a Deployment. CONSTRUCTOR SHOULDN'T BE USED DIRECTLY. - - Deployments should be created, retrieved, and updated using - `@serve.deployment`, `serve.get_deployment`, and `Deployment.options`, - respectively. - """ - if not _internal: raise RuntimeError( "The Deployment constructor should not be called " @@ -120,10 +201,6 @@ def name(self) -> str: @property def version(self) -> Optional[str]: - """Version of this deployment. - - If None, will be redeployed every time `.deploy()` is called. - """ return self._version @property @@ -160,17 +237,14 @@ def ray_actor_options(self) -> Optional[Dict]: @property def init_args(self) -> Tuple[Any]: - """Positional args passed to the underlying class's constructor.""" return self._init_args @property def init_kwargs(self) -> Tuple[Any]: - """Keyword args passed to the underlying class's constructor.""" return self._init_kwargs @property def url(self) -> Optional[str]: - """Full HTTP url for this deployment.""" if self._route_prefix is None or self._is_driver_deployment: # this deployment is not exposed over HTTP return None @@ -184,11 +258,11 @@ def __call__(self): ) @PublicAPI(stability="beta") - def bind(self, *args, **kwargs) -> Union[ClassNode, FunctionNode]: - """Bind the provided arguments and return a class or function node. + def bind(self, *args, **kwargs) -> Application: + """Bind the arguments to the deployment and return an Application. - The returned bound deployment can be deployed or bound to other - deployments to create a deployment graph. 
+ The returned Application can be deployed using `serve.run` (or via + config file) or bound to another deployment for composition. """ copied_self = copy(self) @@ -196,7 +270,7 @@ def bind(self, *args, **kwargs) -> Union[ClassNode, FunctionNode]: schema_shell = deployment_to_schema(copied_self) if inspect.isfunction(self._func_or_class): - return FunctionNode( + dag_node = FunctionNode( self._func_or_class, args, # Used to bind and resolve DAG only, can take user input kwargs, # Used to bind and resolve DAG only, can take user input @@ -207,7 +281,7 @@ def bind(self, *args, **kwargs) -> Union[ClassNode, FunctionNode]: }, ) else: - return ClassNode( + dag_node = ClassNode( self._func_or_class, args, kwargs, @@ -218,6 +292,8 @@ def bind(self, *args, **kwargs) -> Union[ClassNode, FunctionNode]: }, ) + return Application._from_internal_dag_node(dag_node) + @guarded_deprecation_warning(instructions=MIGRATION_MESSAGE) @Deprecated(message=MIGRATION_MESSAGE) def deploy(self, *init_args, _blocking=True, **init_kwargs): @@ -342,14 +418,7 @@ def options( Only those options passed in will be updated, all others will remain unchanged from the existing deployment. - Args: - Refer to @serve.deployment decorator docstring for all non-private - arguments. - - _internal: If True, this function won't log deprecation warnings - and won't update this deployment's config's - user_configured_option_names. It should only be True when used - internally by Serve. It should be False when called by users. + Refer to the `@serve.deployment` decorator docs for available arguments. """ # NOTE: The user_configured_option_names should be the first thing that's @@ -469,10 +538,13 @@ def set_options( is_driver_deployment: bool = DEFAULT.VALUE, _internal: bool = False, ) -> None: - """Overwrite this deployment's options. Mutates the deployment. + """Overwrite this deployment's options in-place. Only those options passed in will be updated, all others will remain unchanged. 
+ + Refer to the @serve.deployment decorator docstring for all non-private + arguments. """ validated = self.options( diff --git a/python/ray/serve/drivers.py b/python/ray/serve/drivers.py index dc2e8f0d71e7..25019a713103 100644 --- a/python/ray/serve/drivers.py +++ b/python/ray/serve/drivers.py @@ -3,6 +3,7 @@ from typing import Any, Callable, Optional, Union, Dict import ray from ray._private.utils import get_or_create_event_loop +from ray._private.tls_utils import add_port_to_grpc_server from ray.serve._private.utils import install_serve_encoders_to_fastapi, record_serve_tag from ray.util.annotations import PublicAPI @@ -50,7 +51,6 @@ def __init__( install_serve_encoders_to_fastapi() http_adapter = load_http_adapter(http_adapter) - self.app = FastAPI() if isinstance(dags, dict): @@ -146,13 +146,25 @@ def __init__(self, port: int = DEFAULT_GRPC_PORT): async def run(self): """Start gRPC Server""" - logger.info( "Starting gRPC server with on node:{} " "listening on port {}".format(ray.util.get_node_ip_address(), self.port) ) + address = "[::]:{}".format(self.port) + try: + # Depending on whether RAY_USE_TLS is on, `add_port_to_grpc_server` + # can create a secure or insecure channel + self.grpc_port = add_port_to_grpc_server(self.server, address) + except Exception: + # TODO(SongGuyang): Catch the exception here because there is + # port conflict issue which brought from static port. We should + # remove this after we find better port resolution. + logger.exception( + "Failed to add port to grpc server. 
GRPC service will be disabled" + ) + self.server = None + self.grpc_port = None - self.server.add_insecure_port("[::]:{}".format(self.port)) self.setup_complete.set() await self.server.start() await self.server.wait_for_termination() diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 711bca9073da..e5187a9d069e 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -76,37 +76,43 @@ class HandleOptions: @PublicAPI(stability="beta") class RayServeHandle: - """A handle to a service deployment. + """A handle used to make requests from one deployment to another. - Invoking this deployment with .remote is equivalent to pinging - an HTTP deployment. + This is used to compose multiple deployments into a single application. After + building the application, this handle is substituted at runtime for deployments + passed as arguments via `.bind()`. Example: - >>> import ray - >>> serve_client = ... # doctest: +SKIP - >>> handle = serve_client.get_handle("my_deployment") # doctest: +SKIP - >>> handle # doctest: +SKIP - RayServeSyncHandle(deployment_name="my_deployment") - >>> my_request_content = ... # doctest: +SKIP - >>> handle.remote(my_request_content) # doctest: +SKIP - ObjectRef(...) - >>> ray.get(handle.remote(...)) # doctest: +SKIP - # result - >>> let_it_crash_request = ... # doctest: +SKIP - >>> ray.get(handle.remote(let_it_crash_request)) # doctest: +SKIP - # raises RayTaskError Exception - >>> async_handle = serve_client.get_handle( # doctest: +SKIP - ... "my_deployment", sync=False) - >>> async_handle # doctest: +SKIP - RayServeHandle(deployment="my_deployment") - >>> await async_handle.remote(my_request_content) # doctest: +SKIP - ObjectRef(...) - >>> ray.get(await async_handle.remote(...)) # doctest: +SKIP - # result - >>> ray.get( # doctest: +SKIP - ... await async_handle.remote(let_it_crash_request) - ... ) - # raises RayTaskError Exception + + .. 
code-block:: python + + import ray + from ray import serve + from ray.serve.handle import RayServeHandle, RayServeSyncHandle + + @serve.deployment + class Downstream: + def __init__(self, message: str): + self._message = message + + def __call__(self, name: str) -> str: + return self._message + name + + @serve.deployment + class Ingress: + def __init__(self, handle: RayServeHandle): + self._handle = handle + + async def __call__(self, name: str) -> str: + obj_ref: ray.ObjectRef = await self._handle.remote(name) + return await obj_ref + + app = Ingress.bind(Downstream.bind("Hello ")) + handle: RayServeSyncHandle = serve.run(app) + + # Prints "Hello Mr. Magoo" + print(ray.get(handle.remote("Mr. Magoo"))) + """ def __init__( @@ -130,7 +136,7 @@ def __init__( "The number of handle.remote() calls that have been " "made on this handle." ), - tag_keys=("handle", "deployment", "route"), + tag_keys=("handle", "deployment", "route", "application"), ) self.request_counter.set_default_tags( {"handle": self.handle_tag, "deployment": self.deployment_name} @@ -173,28 +179,23 @@ def stop_metrics_pusher(self): self._pusher.join() @property - def is_polling(self) -> bool: + def _is_polling(self) -> bool: """Whether this handle is actively polling for replica updates.""" return self.router.long_poll_client.is_running @property - def is_same_loop(self) -> bool: + def _is_same_loop(self) -> bool: """Whether the caller's asyncio loop is the same loop for handle. This is only useful for async handles. """ return get_or_create_event_loop() == self.router._event_loop - def options( + def _options( self, *, method_name: Union[str, DEFAULT] = DEFAULT.VALUE, ): - """Set options for this handle. - - Args: - method_name: The method to invoke. 
- """ new_options_dict = self.handle_options.__dict__.copy() user_modified_options_dict = { key: value @@ -212,6 +213,24 @@ def options( _internal_pickled_http_request=self._pickled_http_request, ) + def options( + self, + *, + method_name: Union[str, DEFAULT] = DEFAULT.VALUE, + ) -> "RayServeHandle": + """Set options for this handle and return an updated copy of it. + + Example: + + .. code-block:: python + + # The following two lines are equivalent: + obj_ref = await handle.other_method.remote(*args) + obj_ref = await handle.options(method_name="other_method").remote(*args) + + """ + return self._options(method_name=method_name) + def _remote(self, deployment_name, handle_options, args, kwargs) -> Coroutine: _request_context = ray.serve.context._serve_request_context.get() request_metadata = RequestMetadata( @@ -220,26 +239,33 @@ def _remote(self, deployment_name, handle_options, args, kwargs) -> Coroutine: call_method=handle_options.method_name, http_arg_is_pickled=self._pickled_http_request, route=_request_context.route, + app_name=_request_context.app_name, + ) + self.request_counter.inc( + tags={ + "route": _request_context.route, + "application": _request_context.app_name, + } ) - self.request_counter.inc(tags={"route": _request_context.route}) coro = self.router.assign_request(request_metadata, *args, **kwargs) return coro @_wrap_into_async_task - async def remote(self, *args, **kwargs): - """Issue an asynchronous request to the deployment. + async def remote(self, *args, **kwargs) -> asyncio.Task: + """Issue an asynchronous request to the __call__ method of the deployment. + + Returns an `asyncio.Task` whose underlying result is a Ray ObjectRef that + points to the final result of the request. + + The final result can be retrieved by awaiting the ObjectRef. + + Example: + + .. 
code-block:: python + + obj_ref = await handle.remote(*args) + result = await obj_ref - Returns a Ray ObjectRef whose results can be waited for or retrieved - using ray.wait or ray.get (or ``await object_ref``), respectively. - - Returns: - ray.ObjectRef - Args: - request_data(dict, Any): If it's a dictionary, the data will be - available in ``request.json()`` or ``request.form()``. - Otherwise, it will be available in ``request.body()``. - ``**kwargs``: All keyword arguments will be available in - ``request.query_params``. """ return await self._remote( self.deployment_name, self.handle_options, args, kwargs @@ -271,8 +297,32 @@ def __del__(self): @PublicAPI(stability="beta") class RayServeSyncHandle(RayServeHandle): + """A handle used to make requests to the ingress deployment of an application. + + This is returned by `serve.run` and can be used to invoke the application from + Python rather than over HTTP. For example: + + .. code-block:: python + + import ray + from ray import serve + from ray.serve.handle import RayServeSyncHandle + + @serve.deployment + class Ingress: + def __call__(self, name: str) -> str: + return f"Hello {name}" + + app = Ingress.bind() + handle: RayServeSyncHandle = serve.run(app) + + # Prints "Hello Mr. Magoo" + print(ray.get(handle.remote("Mr. Magoo"))) + + """ + @property - def is_same_loop(self) -> bool: + def _is_same_loop(self) -> bool: # NOTE(simon): For sync handle, the caller doesn't have to be in the # same loop as the handle's loop, so we always return True here. return True @@ -285,22 +335,35 @@ def _make_router(self) -> Router: event_loop=_create_or_get_async_loop_in_thread(), ) - def remote(self, *args, **kwargs): - """Issue an asynchronous request to the deployment. + def options( + self, + *, + method_name: Union[str, DEFAULT] = DEFAULT.VALUE, + ) -> "RayServeSyncHandle": + """Set options for this handle and return an updated copy of it. + + Example: + + .. 
code-block:: python + + # The following two lines are equivalent: + obj_ref = handle.other_method.remote(*args) + obj_ref = handle.options(method_name="other_method").remote(*args) + + """ + return self._options(method_name=method_name) + + def remote(self, *args, **kwargs) -> ray.ObjectRef: + """Issue an asynchronous request to the __call__ method of the deployment. Returns a Ray ObjectRef whose results can be waited for or retrieved - using ray.wait or ray.get (or ``await object_ref``), respectively. - - Returns: - ray.ObjectRef - Args: - request_data(dict, Any): If it's a dictionary, the data will be - available in ``request.json()`` or ``request.form()``. - If it's a Starlette Request object, it will be passed in to the - handler directly, unmodified. Otherwise, the data will be - available in ``request.data``. - ``**kwargs``: All keyword arguments will be available in - ``request.args``. + using ray.wait or ray.get, respectively. + + .. code-block:: python + + obj_ref = handle.remote(*args) + result = ray.get(obj_ref) + """ coro = self._remote(self.deployment_name, self.handle_options, args, kwargs) future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( @@ -334,7 +397,7 @@ def __init__( # requirement of serve.start; Thus handle is fulfilled at runtime. 
self.handle: RayServeHandle = None - def options(self, *, method_name: str): + def options(self, *, method_name: str) -> "RayServeDeploymentHandle": return self.__class__( self.deployment_name, HandleOptions(method_name=method_name) ) diff --git a/python/ray/serve/metrics.py b/python/ray/serve/metrics.py index 356538be30bf..80b671a1c9a6 100644 --- a/python/ray/serve/metrics.py +++ b/python/ray/serve/metrics.py @@ -1,24 +1,38 @@ from ray.util import metrics -from typing import Tuple, Optional, Dict, List +from typing import Tuple, Optional, Dict, List, Union from ray.serve import context +import ray DEPLOYMENT_TAG = "deployment" REPLICA_TAG = "replica" +APPLICATION_TAG = "application" +ROUTE_TAG = "route" -def _add_serve_metric_tags(tag_keys: Optional[Tuple[str]] = None): +def _add_serve_metric_tags(tag_keys: Optional[Tuple[str]] = None) -> Tuple[str]: """Add serve context tags to the tag_keys""" + if tag_keys is None: + tag_keys = tuple() + + # If the context doesn't exist, no serve tag is added. 
if context.get_internal_replica_context() is None: return tag_keys + # Check no collision with customer tag if DEPLOYMENT_TAG in tag_keys: raise ValueError(f"'{DEPLOYMENT_TAG}' tag is reserved for Ray Serve metrics") if REPLICA_TAG in tag_keys: raise ValueError(f"'{REPLICA_TAG}' tag is reserved for Ray Serve metrics") + if APPLICATION_TAG in tag_keys: + raise ValueError(f"'{APPLICATION_TAG}' tag is reserved for Ray Serve metrics") + # Get serve tag inserted: + ray_serve_tags = (DEPLOYMENT_TAG, REPLICA_TAG) + if context.get_internal_replica_context().app_name: + ray_serve_tags += (APPLICATION_TAG,) if tag_keys: - tag_keys = (DEPLOYMENT_TAG, REPLICA_TAG) + tag_keys + tag_keys = ray_serve_tags + tag_keys else: - tag_keys = (DEPLOYMENT_TAG, REPLICA_TAG) + tag_keys = ray_serve_tags return tag_keys @@ -30,16 +44,32 @@ def _add_serve_metric_default_tags(default_tags: Dict[str, str]): raise ValueError(f"'{DEPLOYMENT_TAG}' tag is reserved for Ray Serve metrics") if REPLICA_TAG in default_tags: raise ValueError(f"'{REPLICA_TAG}' tag is reserved for Ray Serve metrics") + if APPLICATION_TAG in default_tags: + raise ValueError(f"'{APPLICATION_TAG}' tag is reserved for Ray Serve metrics") replica_context = context.get_internal_replica_context() default_tags[DEPLOYMENT_TAG] = replica_context.deployment default_tags[REPLICA_TAG] = replica_context.replica_tag + if replica_context.app_name: + default_tags[APPLICATION_TAG] = replica_context.app_name return default_tags +def _add_serve_context_tag_values(tag_keys: Tuple, tags: Dict[str, str]): + """Add serve context tag values to the metric tags""" + + _request_context = ray.serve.context._serve_request_context.get() + if ROUTE_TAG in tag_keys and ROUTE_TAG not in tags: + tags[ROUTE_TAG] = _request_context.route + + class Counter(metrics.Counter): def __init__( self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None ): + if tag_keys and not isinstance(tag_keys, tuple): + raise TypeError( + "tag_keys should be a 
tuple type, got: " f"{type(tag_keys)}" + ) tag_keys = _add_serve_metric_tags(tag_keys) super().__init__(name, description, tag_keys) self.set_default_tags({}) @@ -47,11 +77,22 @@ def __init__( def set_default_tags(self, default_tags: Dict[str, str]): super().set_default_tags(_add_serve_metric_default_tags(default_tags)) + def inc(self, value: Union[int, float] = 1.0, tags: Dict[str, str] = None): + """Increment the counter by the given value, add serve context + tag values to the tags + """ + _add_serve_context_tag_values(self._tag_keys, tags) + super().inc(value, tags) + class Gauge(metrics.Gauge): def __init__( self, name: str, description: str = "", tag_keys: Optional[Tuple[str]] = None ): + if tag_keys and not isinstance(tag_keys, tuple): + raise TypeError( + "tag_keys should be a tuple type, got: " f"{type(tag_keys)}" + ) tag_keys = _add_serve_metric_tags(tag_keys) super().__init__(name, description, tag_keys) self.set_default_tags({}) @@ -59,6 +100,13 @@ def __init__( def set_default_tags(self, default_tags: Dict[str, str]): super().set_default_tags(_add_serve_metric_default_tags(default_tags)) + def set(self, value: Union[int, float], tags: Dict[str, str] = None): + """Set the gauge to the given value, add serve context + tag values to the tags + """ + _add_serve_context_tag_values(self._tag_keys, tags) + super().set(value, tags) + class Histogram(metrics.Histogram): def __init__( @@ -68,9 +116,20 @@ def __init__( boundaries: List[float] = None, tag_keys: Optional[Tuple[str]] = None, ): + if tag_keys and not isinstance(tag_keys, tuple): + raise TypeError( + "tag_keys should be a tuple type, got: " f"{type(tag_keys)}" + ) tag_keys = _add_serve_metric_tags(tag_keys) super().__init__(name, description, boundaries, tag_keys) self.set_default_tags({}) def set_default_tags(self, default_tags: Dict[str, str]): super().set_default_tags(_add_serve_metric_default_tags(default_tags)) + + def observe(self, value: Union[int, float], tags: Dict[str, str] = None): + 
"""Observe the given value, add serve context + tag values to the tags + """ + _add_serve_context_tag_values(self._tag_keys, tags) + super().observe(value, tags) diff --git a/python/ray/serve/multiplex.py b/python/ray/serve/multiplex.py new file mode 100644 index 000000000000..a45887d2d473 --- /dev/null +++ b/python/ray/serve/multiplex.py @@ -0,0 +1,173 @@ +import asyncio +from collections import OrderedDict +import inspect +import logging +import time +from typing import Any, Callable + +from ray._private.async_compat import sync_to_async +from ray.serve._private.constants import ( + SERVE_LOGGER_NAME, + PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S, +) +from ray.serve.context import ( + get_global_client, + get_internal_replica_context, +) +from ray.serve._private.common import MultiplexedReplicaInfo +from ray._private.utils import run_background_task +from ray.serve import metrics + + +logger = logging.getLogger(SERVE_LOGGER_NAME) + + +class _ModelMultiplexWrapper: + """A wrapper class that wraps the model load function and + provides the LRU caching functionality. + + The model multiplexer is a wrapper class that wraps the model load function + and provides the LRU caching functionality, and the model load function should + be a coroutine function that takes the model ID as the first argument and + returns the user-constructed model object. + The model multiplexer will also ensure that the number of models on the current + replica does not exceed the specified limit. + The model will be unloaded in the LRU order, the model multiplexer will call the + model's __del__ attribute if it exists to clean up the model resources eagerly. + + """ + + def __init__( + self, + model_load_func: Callable[[str], Any], + self_arg: Any, + max_num_models_per_replica: int, + ): + """Initialize the model multiplexer. + Args: + model_load_func: the model load async function. + self_arg: self argument when model_load_func is class method. 
+ max_num_models_per_replica: the maximum number of models to be loaded on the + current replica. If it is -1, there is no limit for the number of models + per replica. + """ + self.models = OrderedDict() + self._func: Callable = model_load_func + self.self_arg: Any = self_arg + self.max_num_models_per_replica: int = max_num_models_per_replica + + self.model_load_latency_s = metrics.Gauge( + "serve_multiplexed_model_load_latency_s", + description="The time it takes to load a model.", + ) + self.model_unload_latency_s = metrics.Gauge( + "serve_multiplexed_model_unload_latency_s", + description="The time it takes to unload a model.", + ) + self.num_models = metrics.Gauge( + "serve_num_multiplexed_models", + description="The number of models loaded on the current replica.", + ) + + self.models_unload_counter = metrics.Counter( + "serve_multiplexed_models_unload_counter", + description="The counter for unloaded models on the current replica.", + ) + self.models_load_counter = metrics.Counter( + "serve_multiplexed_models_load_counter", + description="The counter for loaded models on the current replica.", + ) + + context = get_internal_replica_context() + if context is None: + raise RuntimeError( + "Fail to retrieve serve replica context, the model multiplexer ", + "can only be used within `Deployment`.", + ) + self._deployment_name: str = context.deployment + self._replica_tag: str = context.replica_tag + + # Whether to push the multiplexed replica info to the controller. + self._push_multiplexed_replica_info: bool = False + + # Push the model IDs to the controller periodically. + run_background_task(self._push_model_ids()) + + async def load_model(self, model_id: str) -> Any: + """Load the model if it is not loaded yet, and return the user-constructed model object. + + Args: + model_id: the model ID. + + Returns: + The user-constructed model object. 
+ """ + + if type(model_id) != str: + raise TypeError("The model ID must be a string.") + + if not model_id: + raise ValueError("The model ID cannot be empty.") + + self.num_models.set(len(self.models)) + + if model_id in self.models: + # Move the model to the end of the OrderedDict to ensure LRU caching. + model = self.models.pop(model_id) + self.models[model_id] = model + else: + # If the number of models per replica is specified, check if the number of + # models on the current replica has reached the limit. + if ( + self.max_num_models_per_replica > 0 + and len(self.models) >= self.max_num_models_per_replica + ): + # Unload the least recently used model. + self.models_unload_counter.inc() + unload_start_time = time.time() + await self.unload_model() + self.model_unload_latency_s.set(time.time() - unload_start_time) + # Load the model. + logger.info(f"Loading model '{model_id}'.") + self.models_load_counter.inc() + load_start_time = time.time() + if self.self_arg is None: + self.models[model_id] = await self._func(model_id) + else: + self.models[model_id] = await self._func(self.self_arg, model_id) + self._push_multiplexed_replica_info = True + self.model_load_latency_s.set(time.time() - load_start_time) + return self.models[model_id] + + async def unload_model(self) -> None: + """Unload the least recently used model.""" + model_id, model = self.models.popitem(last=False) + logger.info(f"Unloading model '{model_id}'.") + + # If the model has __del__ attribute, call it. + # This is to clean up the model resources eagerly. 
+ if hasattr(model, "__del__"): + if not inspect.iscoroutinefunction(model.__del__): + await asyncio.get_running_loop().run_in_executor(None, model.__del__) + else: + await sync_to_async(model.__del__)() + setattr(model, "__del__", lambda _: None) + + async def _push_model_ids(self): + """Push the multiplexed replica info to the controller.""" + + while True: + try: + if self._push_multiplexed_replica_info: + get_global_client().record_multiplexed_replica_info( + MultiplexedReplicaInfo( + self._deployment_name, self._replica_tag, self.models.keys() + ) + ) + self._push_multiplexed_replica_info = False + except Exception as e: + logger.warning( + "Failed to push the multiplexed replica info " + f"to the controller. Error: {e}" + ) + await asyncio.sleep(PUSH_MULTIPLEXED_MODEL_IDS_INTERVAL_S) diff --git a/python/ray/serve/schema.py b/python/ray/serve/schema.py index 76ead035e00e..8b655a0be18c 100644 --- a/python/ray/serve/schema.py +++ b/python/ray/serve/schema.py @@ -11,10 +11,12 @@ DeploymentInfo, ReplicaState, ServeDeployMode, + HTTPProxyStatus, ) from ray.serve.config import DeploymentMode from ray.serve._private.utils import DEFAULT, dict_keys_snake_to_camel_case from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME def _route_prefix_format(cls, v): @@ -304,9 +306,7 @@ class ServeApplicationSchema(BaseModel, extra=Extra.forbid): """ name: str = Field( - # TODO(cindy): eventually we should set the default app name to a non-empty - # string and forbid empty app names. - default="", + default=SERVE_DEFAULT_APP_NAME, description=( "Application name, the name should be unique within the serve instance" ), @@ -320,6 +320,7 @@ class ServeApplicationSchema(BaseModel, extra=Extra.forbid): ), ) import_path: str = Field( + ..., description=( "An import path to a bound deployment node. Should be of the " 'form "module.submodule_1...submodule_n.' 
@@ -357,7 +358,11 @@ class ServeApplicationSchema(BaseModel, extra=Extra.forbid): ) deployments: List[DeploymentSchema] = Field( default=[], - description=("Deployment options that override options specified in the code."), + description="Deployment options that override options specified in the code.", + ) + args: Dict = Field( + default={}, + description="Arguments that will be passed to the application builder.", ) @validator("runtime_env") @@ -532,8 +537,7 @@ class ServeDeploySchema(BaseModel, extra=Extra.forbid): default=HTTPOptionsSchema(), description="Options to start the HTTP Proxy with." ) applications: List[ServeApplicationSchema] = Field( - default=[], - description=("The set of Serve applications to run on the Ray cluster."), + ..., description=("The set of Serve applications to run on the Ray cluster.") ) @validator("applications") @@ -604,7 +608,26 @@ def get_empty_schema_dict() -> Dict: @PublicAPI(stability="alpha") -class ReplicaDetails(BaseModel, extra=Extra.forbid, frozen=True): +class ServeActorDetails(BaseModel, frozen=True): + node_id: Optional[str] = Field( + description="ID of the node that the actor is running on." + ) + node_ip: Optional[str] = Field( + description="IP address of the node that the actor is running on." + ) + actor_id: Optional[str] = Field(description="Actor ID.") + actor_name: Optional[str] = Field(description="Actor name.") + worker_id: Optional[str] = Field(description="Worker ID.") + log_file_path: Optional[str] = Field( + description=( + "The relative path to the Serve actor's log file from the ray logs " + "directory." 
+ ) + ) + + +@PublicAPI(stability="alpha") +class ReplicaDetails(ServeActorDetails, frozen=True): """Detailed info about a single deployment replica.""" replica_id: str = Field( @@ -616,14 +639,6 @@ class ReplicaDetails(BaseModel, extra=Extra.forbid, frozen=True): ) state: ReplicaState = Field(description="Current state of the replica.") pid: Optional[int] = Field(description="PID of the replica actor process.") - actor_name: str = Field(description="Name of the replica actor.") - actor_id: Optional[str] = Field(description="ID of the replica actor.") - node_id: Optional[str] = Field( - description="ID of the node that the replica actor is running on." - ) - node_ip: Optional[str] = Field( - description="IP address of the node that the replica actor is running on." - ) start_time_s: float = Field( description=( "The time at which the replica actor was started. If the controller dies, " @@ -755,6 +770,11 @@ def get_status_dict(self) -> Dict: ) +@PublicAPI(stability="alpha") +class HTTPProxyDetails(ServeActorDetails, frozen=True): + status: HTTPProxyStatus = Field(description="Current status of the HTTP Proxy.") + + @PublicAPI(stability="alpha") class ServeInstanceDetails(BaseModel, extra=Extra.forbid): """ @@ -764,6 +784,9 @@ class ServeInstanceDetails(BaseModel, extra=Extra.forbid): This is the response JSON schema for v2 REST API `GET /api/serve/applications`. """ + controller_info: ServeActorDetails = Field( + description="Details about the Serve controller actor." + ) proxy_location: Optional[DeploymentMode] = Field( description=( "The location of HTTP servers.\n" @@ -773,6 +796,11 @@ class ServeInstanceDetails(BaseModel, extra=Extra.forbid): ), ) http_options: Optional[HTTPOptionsSchema] = Field(description="HTTP Proxy options.") + http_proxies: Optional[Dict[str, HTTPProxyDetails]] = Field( + description=( + "Mapping from node_id to details about the HTTP Proxy running on that node." 
+ ) + ) deploy_mode: ServeDeployMode = Field( description=( "Whether a single-app config of format ServeApplicationSchema or multi-app " @@ -790,7 +818,7 @@ def get_empty_schema_dict() -> Dict: Represents no Serve instance running on the cluster. """ - return {"deploy_mode": "UNSET", "applications": {}} + return {"deploy_mode": "UNSET", "controller_info": {}, "applications": {}} @PublicAPI(stability="beta") diff --git a/python/ray/serve/scripts.py b/python/ray/serve/scripts.py index c0f381fca175..87d2f207705a 100644 --- a/python/ray/serve/scripts.py +++ b/python/ray/serve/scripts.py @@ -3,7 +3,7 @@ import pathlib import sys import time -from typing import Optional, Union, Tuple +from typing import Dict, Optional, Tuple import click import yaml @@ -26,8 +26,7 @@ SERVE_DEFAULT_APP_NAME, ) from ray.serve._private.common import ServeDeployMode -from ray.serve.deployment import deployment_to_schema -from ray.serve.deployment_graph import ClassNode, FunctionNode +from ray.serve.deployment import Application, deployment_to_schema from ray.serve._private import api as _private_api from ray.serve.schema import ( ServeApplicationSchema, @@ -100,18 +99,35 @@ def process_dict_for_yaml_dump(data): for k, v in data.items(): if isinstance(v, dict): data[k] = process_dict_for_yaml_dump(v) + if isinstance(v, list): + data[k] = [process_dict_for_yaml_dump(item) for item in v] elif isinstance(v, str): data[k] = remove_ansi_escape_sequences(v) return data -@click.group(help="CLI for managing Serve instances on a Ray cluster.") +def convert_args_to_dict(args: Tuple[str]) -> Dict[str, str]: + args_dict = dict() + for arg in args: + split = arg.split("=") + if len(split) != 2: + raise click.ClickException( + f"Invalid application argument '{arg}', " + "must be of the form '='." 
+ ) + + args_dict[split[0]] = split[1] + + return args_dict + + +@click.group(help="CLI for managing Serve applications on a Ray cluster.") def cli(): pass -@cli.command(help="Start a detached Serve instance on the Ray cluster.") +@cli.command(help="Start Serve on the Ray cluster.") @click.option( "--address", "-a", @@ -125,21 +141,21 @@ def cli(): default=DEFAULT_HTTP_HOST, required=False, type=str, - help="Host for HTTP servers to listen on. " f"Defaults to {DEFAULT_HTTP_HOST}.", + help="Host for HTTP proxies to listen on. " f"Defaults to {DEFAULT_HTTP_HOST}.", ) @click.option( "--http-port", default=DEFAULT_HTTP_PORT, required=False, type=int, - help="Port for HTTP servers to listen on. " f"Defaults to {DEFAULT_HTTP_PORT}.", + help="Port for HTTP proxies to listen on. " f"Defaults to {DEFAULT_HTTP_PORT}.", ) @click.option( "--http-location", default=DeploymentMode.HeadOnly, required=False, type=click.Choice(list(DeploymentMode)), - help="Location of the HTTP servers. Defaults to HeadOnly.", + help="Location of the HTTP proxies. Defaults to HeadOnly.", ) def start(address, http_host, http_port, http_location): ray.init( @@ -204,28 +220,30 @@ def deploy(config_file_name: str, address: str): # Error deploying application raise - cli_logger.newline() cli_logger.success( - "\nSent deploy request successfully!\n " - "* Use `serve status` to check deployments' statuses.\n " - "* Use `serve config` to see the current config(s).\n" + "\nSent deploy request successfully.\n " + "* Use `serve status` to check applications' statuses.\n " + "* Use `serve config` to see the current application config(s).\n" ) - cli_logger.newline() @cli.command( short_help="Run Serve application(s).", help=( - "Runs the Serve application from the specified import path (e.g. 
my_script:" - "my_bound_deployment) or application(s) from a YAML config.\n\n" - "If using a YAML config, existing deployments with no code changes in an " - "application will not be redeployed.\n\n" - "Any import path must lead to a FunctionNode or ClassNode object. " - "By default, this will block and periodically log status. If you " - "Ctrl-C the command, it will tear down the app." + "Runs an application from the specified import path (e.g., my_script:" + "app) or application(s) from a YAML config.\n\n" + "If passing an import path, it must point to a Serve Application or " + "a function that returns one. If a function is used, arguments can be " + "passed to it in 'key=val' format after the import path, for example:\n\n" + "serve run my_script:app model_path='/path/to/model.pkl' num_replicas=5\n\n" + "If passing a YAML config, existing applications with no code changes will not " + "be updated.\n\n" + "By default, this will block and stream logs to the console. If you " + "Ctrl-C the command, it will shut down Serve on the cluster." ), ) @click.argument("config_or_import_path") +@click.argument("arguments", nargs=-1, required=False) @click.option( "--runtime-env", type=str, @@ -282,7 +300,7 @@ def deploy(config_file_name: str, address: str): "-p", required=False, type=int, - help=f"Port for HTTP servers to listen on. Defaults to {DEFAULT_HTTP_PORT}.", + help=f"Port for HTTP proxies to listen on. 
Defaults to {DEFAULT_HTTP_PORT}.", ) @click.option( "--blocking/--non-blocking", @@ -303,6 +321,7 @@ def deploy(config_file_name: str, address: str): ) def run( config_or_import_path: str, + arguments: Tuple[str], runtime_env: str, runtime_env_json: str, working_dir: str, @@ -314,7 +333,7 @@ def run( gradio: bool, ): sys.path.insert(0, app_dir) - + args_dict = convert_args_to_dict(arguments) final_runtime_env = parse_runtime_env_args( runtime_env=runtime_env, runtime_env_json=runtime_env_json, @@ -322,9 +341,14 @@ def run( ) if pathlib.Path(config_or_import_path).is_file(): + if len(args_dict) > 0: + cli_logger.warning( + "Application arguments are ignored when running a config file." + ) + is_config = True config_path = config_or_import_path - cli_logger.print(f'Deploying from config file: "{config_path}".') + cli_logger.print(f"Running config file: '{config_path}'.") with open(config_path, "r") as config_file: config_dict = yaml.safe_load(config_file) @@ -377,8 +401,10 @@ def run( if port is None: port = DEFAULT_HTTP_PORT import_path = config_or_import_path - cli_logger.print(f'Deploying from import path: "{import_path}".') - node = import_attr(import_path) + cli_logger.print(f"Running import path: '{import_path}'.") + app = _private_api.call_app_builder_with_args_if_necessary( + import_attr(import_path), args_dict + ) # Setting the runtime_env here will set defaults for the deployments. 
ray.init(address=address, namespace=SERVE_NAMESPACE, runtime_env=final_runtime_env) @@ -394,7 +420,7 @@ def run( if gradio: handle = serve.get_deployment("DAGDriver").get_handle() else: - handle = serve.run(node, host=host, port=port) + handle = serve.run(app, host=host, port=port) cli_logger.success("Deployed Serve app successfully.") if gradio: @@ -479,7 +505,7 @@ def config(address: str, name: Optional[str]): @cli.command( - short_help="Get the current status of all live Serve applications and deployments.", + short_help="Get the current status of all Serve applications on the cluster.", help=( "Prints status information about all applications on the cluster.\n\n" "An application may be:\n\n" @@ -558,7 +584,7 @@ def status(address: str, name: Optional[str]): @cli.command( - help="Deletes the Serve app.", + help="Shuts down Serve on the cluster, deleting all applications.", ) @click.option( "--address", @@ -572,25 +598,25 @@ def status(address: str, name: Optional[str]): def shutdown(address: str, yes: bool): if not yes: click.confirm( - f"\nThis will shutdown the Serve application at address " - f'"{address}" and delete all deployments there. Do you ' + f"This will shut down Serve on the cluster at address " + f'"{address}" and delete all applications there. Do you ' "want to continue?", abort=True, ) ServeSubmissionClient(address).delete_application() - cli_logger.newline() - cli_logger.success("\nSent delete request successfully!\n") - cli_logger.newline() + cli_logger.success( + "Sent shutdown request; applications will be deleted asynchronously." + ) @cli.command( - short_help="Writes a Serve Deployment Graph's config file.", + short_help="Generate a config file for the specified application(s).", help=( - "Imports the ClassNode(s) or FunctionNode(s) at IMPORT_PATH(S) and generates a " + "Imports the Application at IMPORT_PATH(S) and generates a " "structured config for it. 
If the flag --multi-app is set, accepts multiple " - "ClassNode/FunctionNodes and generates a multi-application config. Config " + "Applications and generates a multi-application config. Config " "outputted from this command can be used by `serve deploy` or the REST API. " ), ) @@ -634,14 +660,13 @@ def build( sys.path.insert(0, app_dir) def build_app_config(import_path: str, name: str = None): - node: Union[ClassNode, FunctionNode] = import_attr(import_path) - if not isinstance(node, (ClassNode, FunctionNode)): + app: Application = import_attr(import_path) + if not isinstance(app, Application): raise TypeError( - f"Expected '{import_path}' to be ClassNode or " - f"FunctionNode, but got {type(node)}." + f"Expected '{import_path}' to be an Application but got {type(app)}." ) - app = build_app(node) + app = build_app(app) schema = ServeApplicationSchema( import_path=import_path, runtime_env={}, diff --git a/python/ray/serve/tests/test_advanced.py b/python/ray/serve/tests/test_advanced.py index 495ba9a7cb2f..73c3d6c6f8b2 100644 --- a/python/ray/serve/tests/test_advanced.py +++ b/python/ray/serve/tests/test_advanced.py @@ -6,6 +6,7 @@ import ray from ray import serve from ray._private.test_utils import SignalActor +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME def test_serve_forceful_shutdown(serve_instance): @@ -16,7 +17,7 @@ def sleeper(): handle = serve.run(sleeper.bind()) ref = handle.remote() - sleeper.delete() + serve.delete(SERVE_DEFAULT_APP_NAME) with pytest.raises(ray.exceptions.RayActorError): ray.get(ref) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index e070e2c901a0..597a6998723a 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -1,18 +1,26 @@ import asyncio import os -from ray.serve.deployment_graph import RayServeDAGHandle +from typing import Optional +from fastapi import FastAPI import requests +from pydantic import BaseModel, ValidationError import 
pytest import starlette.responses -from fastapi import FastAPI import ray from ray import serve from ray._private.test_utils import SignalActor, wait_for_condition -from ray.serve.application import Application +from ray.serve.built_application import BuiltApplication +from ray.serve.deployment import Application +from ray.serve.deployment_graph import RayServeDAGHandle from ray.serve.drivers import DAGDriver from ray.serve.exceptions import RayServeException +from ray.serve._private.api import call_app_builder_with_args_if_necessary +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) @serve.deployment() @@ -386,13 +394,13 @@ def test_run_get_ingress_app(serve_instance): def g(): return "got g" - app = Application([g]) + app = BuiltApplication([g]) ingress_handle = serve.run(app) assert ray.get(ingress_handle.remote()) == "got g" serve_instance.delete_deployments(["g"]) - no_ingress_app = Application([g.options(route_prefix=None)]) + no_ingress_app = BuiltApplication([g.options(route_prefix=None)]) ingress_handle = serve.run(no_ingress_app) assert ingress_handle is None @@ -419,27 +427,6 @@ def __call__(self, *args): assert ray.get(ingress_handle.remote()) == "got f" -def test_run_delete_old_deployments(serve_instance): - """Check that serve.run() can remove all old deployments""" - - @serve.deployment(name="f", route_prefix="/test1") - def f(): - return "got f" - - @serve.deployment(name="g", route_prefix="/test2") - def g(): - return "got g" - - ingress_handle = serve.run(f.bind()) - assert ray.get(ingress_handle.remote()) == "got f" - - ingress_handle = serve.run(g.bind()) - assert ray.get(ingress_handle.remote()) == "got g" - - assert "g" in serve.list_deployments() - assert "f" not in serve.list_deployments() - - class TestSetOptions: def test_set_options_basic(self): @serve.deployment( @@ -570,7 +557,10 @@ def g(): serve.run(g.bind()) deployment_info = ray.get(controller._all_running_replicas.remote()) 
- assert "g" in deployment_info + assert ( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}g" + in deployment_info + ) @serve.deployment def f(): @@ -708,6 +698,138 @@ def f(): f.options(autoscaling_config={"min_replicas": "1"}) +class TestAppBuilder: + @serve.deployment + class A: + pass + + @serve.deployment + def f(): + pass + + class TypedArgs(BaseModel): + message: str + num_replicas: Optional[int] + + def test_prebuilt_app(self): + a = self.A.bind() + assert call_app_builder_with_args_if_necessary(a, {}) == a + + f = self.f.bind() + assert call_app_builder_with_args_if_necessary(f, {}) == f + + with pytest.raises( + ValueError, + match="Arguments can only be passed to an application builder function", + ): + call_app_builder_with_args_if_necessary(f, {"key": "val"}) + + def test_invalid_builder(self): + class ThisShouldBeAFunction: + pass + + with pytest.raises( + TypeError, + match=( + "Expected a built Serve application " + "or an application builder function" + ), + ): + call_app_builder_with_args_if_necessary(ThisShouldBeAFunction, {}) + + def test_invalid_signature(self): + def builder_with_two_args(args1, args2): + return self.f.bind() + + with pytest.raises( + TypeError, + match="Application builder functions should take exactly one parameter", + ): + call_app_builder_with_args_if_necessary(builder_with_two_args, {}) + + def test_builder_returns_bad_type(self): + def return_none(args): + self.f.bind() + + with pytest.raises( + TypeError, + match="Application builder functions must return a", + ): + call_app_builder_with_args_if_necessary(return_none, {}) + + def return_unbound_deployment(args): + return self.f + + with pytest.raises( + TypeError, + match="Application builder functions must return a", + ): + call_app_builder_with_args_if_necessary(return_unbound_deployment, {}) + + def test_basic_no_args(self): + def build_function(args): + return self.A.bind() + + assert isinstance( + 
call_app_builder_with_args_if_necessary(build_function, {}), Application + ) + + def build_class(args): + return self.f.bind() + + assert isinstance( + call_app_builder_with_args_if_necessary(build_class, {}), Application + ) + + def test_args_dict(self): + args_dict = {"message": "hiya", "num_replicas": "3"} + + def build(args): + assert len(args) == 2 + assert args["message"] == "hiya" + assert args["num_replicas"] == "3" + return self.A.options(num_replicas=int(args["num_replicas"])).bind( + args["message"] + ) + + app = call_app_builder_with_args_if_necessary(build, args_dict) + assert isinstance(app, Application) + + def test_args_typed(self): + args_dict = {"message": "hiya", "num_replicas": "3"} + + def build(args: self.TypedArgs): + assert isinstance(args, self.TypedArgs) + assert args.message == "hiya" + assert args.num_replicas == 3 + return self.A.options(num_replicas=args.num_replicas).bind(args.message) + + app = call_app_builder_with_args_if_necessary(build, args_dict) + assert isinstance(app, Application) + + # Sanity check that pydantic validation works. + + # 1) Check that validation permits a missing optional field. + def check_missing_optional(args: self.TypedArgs): + assert args.message == "hiya" + assert args.num_replicas is None + return self.A.bind() + + app = call_app_builder_with_args_if_necessary( + check_missing_optional, {"message": "hiya"} + ) + assert isinstance(app, Application) + + # 2) Check that validation rejects a missing required field. + def check_missing_required(args: self.TypedArgs): + assert False, "Shouldn't get here because validation failed." 
+ + with pytest.raises(ValidationError, match="field required"): + call_app_builder_with_args_if_necessary( + check_missing_required, {"num_replicas": "10"} + ) + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_application_state.py b/python/ray/serve/tests/test_application_state.py index fb6108642f9e..28aeea2499b2 100644 --- a/python/ray/serve/tests/test_application_state.py +++ b/python/ray/serve/tests/test_application_state.py @@ -1,100 +1,109 @@ import sys import pytest -from typing import List +from typing import List, Tuple, Dict import time import ray from ray._private.test_utils import SignalActor from ray.serve._private.application_state import ApplicationStateManager -from ray.serve._private.common import ApplicationStatus +from ray.serve._private.common import ApplicationStatus, DeploymentInfo from ray.serve._private.common import DeploymentStatus, DeploymentStatusInfo +from ray.serve.config import DeploymentConfig, ReplicaConfig from ray.serve.exceptions import RayServeException class MockDeploymentStateManager: def __init__(self): - self.deployment_statuses = [ - DeploymentStatusInfo("d1", DeploymentStatus.UPDATING), - DeploymentStatusInfo("d2", DeploymentStatus.UPDATING), - ] + self.deployment_statuses: Dict[str, DeploymentStatusInfo] = dict() - def add_deployment_status(self, status: DeploymentStatusInfo): - assert type(status) == DeploymentStatusInfo - self.deployment_statuses.append(status) + def deploy(self, deployment_name: str, deployment_info: DeploymentInfo): + self.deployment_statuses[deployment_name] = DeploymentStatusInfo( + name=deployment_name, + status=DeploymentStatus.UPDATING, + message="", + ) + + @property + def deployments(self) -> List[str]: + return list(self.deployment_statuses.keys()) - def set_deployment_statuses_unhealthy(self, index: int = 0): - self.deployment_statuses[index].status = DeploymentStatus.UNHEALTHY + def set_deployment_statuses_unhealthy(self, name: str): + 
self.deployment_statuses[name].status = DeploymentStatus.UNHEALTHY - def set_deployment_statuses_healthy(self, index: int = 0): - self.deployment_statuses[index].status = DeploymentStatus.HEALTHY + def set_deployment_statuses_healthy(self, name: str): + self.deployment_statuses[name].status = DeploymentStatus.HEALTHY def get_deployment_statuses(self, deployment_names: List[str]): - return [ - status - for status in self.deployment_statuses - if status.name in deployment_names - ] + return list(self.deployment_statuses.values()) + + def get_deployment(self, deployment_name: str) -> DeploymentInfo: + if deployment_name in self.deployment_statuses: + # Return dummy deployment info object + return DeploymentInfo( + deployment_config=DeploymentConfig(num_replicas=1, user_config={}), + replica_config=ReplicaConfig.create(lambda x: x), + start_time_ms=0, + deployer_job_id="", + ) - def get_all_deployments(self): - return [d.name for d in self.deployment_statuses] + def delete_deployment(self, deployment_name: str): + del self.deployment_statuses[deployment_name] - def add_deployment(self, status: DeploymentStatusInfo): - self.deployment_statuses.append(status) - def get_deployment(self, deployment_name: str) -> DeploymentStatusInfo: - for deployment in self.deployment_statuses: - if deployment.name == deployment_name: - return deployment +@pytest.fixture +def mocked_application_state_manager() -> Tuple[ + ApplicationStateManager, MockDeploymentStateManager +]: + deployment_state_manager = MockDeploymentStateManager() + application_state_manager = ApplicationStateManager(deployment_state_manager) + yield application_state_manager, deployment_state_manager - def delete_deployment(self, deployment_name: str): - statuses = [] - for deployment in self.deployment_statuses: - if deployment.name != deployment_name: - statuses.append(deployment) - self.deployment_statuses = statuses - -def test_deploy_app(): +def test_deploy_app(mocked_application_state_manager): """Test DEPLOYING 
status""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) - app_state_manager.deploy_application("test_app", {}) + app_state_manager, _ = mocked_application_state_manager + app_state_manager.deploy_application("test_app", [{"name": "d1"}]) app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING assert app_status.deployment_timestamp > 0 -def test_delete_app(): +def test_delete_app(mocked_application_state_manager): """Test DELETING status""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) - app_state_manager.deploy_application("test_app", {}) + app_state_manager, _ = mocked_application_state_manager + app_state_manager.deploy_application("test_app", [{"name": "d1"}]) app_state_manager.delete_application("test_app") app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DELETING -def test_create_app(): +def test_create_app(mocked_application_state_manager): """Test object ref based deploy and set DEPLOYING""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, _ = mocked_application_state_manager app_state_manager.create_application_state("test_app", ray.ObjectRef.nil()) app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING -def test_update_app_running(): +def test_update_app_running(mocked_application_state_manager): """Test DEPLOYING -> RUNNING""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, deployment_state_manager = mocked_application_state_manager app_state_manager.deploy_application( "test_app", [{"name": "d1"}, {"name": "d2"}], ) + # Simulate controller + deployment_state_manager.deploy("d1", None) + deployment_state_manager.deploy("d2", None) + app_status = app_state_manager.get_app_status("test_app") assert app_status.status == 
ApplicationStatus.DEPLOYING - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(0) + deployment_state_manager.set_deployment_statuses_healthy("d1") app_state_manager.update() app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(1) + deployment_state_manager.set_deployment_statuses_healthy("d2") app_state_manager.update() app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.RUNNING @@ -105,13 +114,16 @@ def test_update_app_running(): assert app_status.status == ApplicationStatus.RUNNING -def test_update_app_deploy_failed(): +def test_update_app_deploy_failed(mocked_application_state_manager): """Test DEPLOYING -> DEPLOY_FAILED""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, deployment_state_manager = mocked_application_state_manager app_state_manager.deploy_application("test_app", [{"name": "d1"}]) + # Simulate controller + deployment_state_manager.deploy("d1", None) + app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING - app_state_manager.deployment_state_manager.set_deployment_statuses_unhealthy(0) + deployment_state_manager.set_deployment_statuses_unhealthy("d1") app_state_manager.update() app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOY_FAILED @@ -122,7 +134,7 @@ def test_update_app_deploy_failed(): @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") @pytest.mark.parametrize("fail_deploy", [False, True]) -def test_config_deploy_app(fail_deploy): +def test_config_deploy_app(mocked_application_state_manager, fail_deploy): """Test config based deploy DEPLOYING -> RUNNING DEPLOYING -> DEPLOY_FAILED @@ -136,7 +148,7 @@ def task(): raise Exception("fail!") 
object_ref = task.remote() - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, deployment_state_manager = mocked_application_state_manager app_state_manager.create_application_state("test_app", object_ref) app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING @@ -152,18 +164,25 @@ def task(): app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOY_FAILED else: - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(0) - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(1) + # Simulate task calling deploy_application on controller + app_state_manager.deploy_application("test_app", [{"name": "d1"}]) + deployment_state_manager.deploy("d1", None) + + deployment_state_manager.set_deployment_statuses_healthy("d1") app_state_manager.update() app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.RUNNING -def test_redeploy_same_app(): +def test_redeploy_same_app(mocked_application_state_manager): """Test deploying the same app with different deploy_params.""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, deployment_state_manager = mocked_application_state_manager app_state_manager.deploy_application("test_app", [{"name": "d1"}, {"name": "d2"}]) + # Simulate controller + deployment_state_manager.deploy("d1", None) + deployment_state_manager.deploy("d2", None) + app_status = app_state_manager.get_app_status("test_app") assert app_status.status == ApplicationStatus.DEPLOYING @@ -173,25 +192,24 @@ def test_redeploy_same_app(): ) assert unused_deployments == ["d1"] - app_state_manager.deployment_state_manager.add_deployment_status( - DeploymentStatusInfo("d3", DeploymentStatus.UPDATING) - ) - assert app_state_manager._application_states["test_app"].deployments_to_delete == { + 
deployment_state_manager.deploy("d3", None) + assert app_state_manager._application_states["test_app"]._deployments_to_delete == { "d1" } # After updating, the deployment should be deleted successfully, and # deployments_to_delete should be empty - app_state_manager.deployment_state_manager.delete_deployment("d1") + deployment_state_manager.delete_deployment("d1") app_state_manager.update() assert ( - app_state_manager._application_states["test_app"].deployments_to_delete == set() + app_state_manager._application_states["test_app"]._deployments_to_delete + == set() ) -def test_deploy_with_route_prefix_conflict(): +def test_deploy_with_route_prefix_conflict(mocked_application_state_manager): """Test that an application fails to deploy with a route prefix conflict.""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, _ = mocked_application_state_manager app_state_manager.deploy_application( "test_app", [{"name": "d1", "route_prefix": "/url1"}] @@ -202,21 +220,24 @@ def test_deploy_with_route_prefix_conflict(): ) -def test_deploy_with_renamed_app(): +def test_deploy_with_renamed_app(mocked_application_state_manager): """ Test that an application deploys successfully when there is a route prefix conflict with an old app running on the cluster. 
""" - app_state_manager = ApplicationStateManager(MockDeploymentStateManager()) + app_state_manager, deployment_state_manager = mocked_application_state_manager # deploy app1 app_state_manager.deploy_application( "app1", [{"name": "d1", "route_prefix": "/url1"}] ) + # Simulate controller + deployment_state_manager.deploy("d1", None) + app_status = app_state_manager.get_app_status("app1") assert app_status.status == ApplicationStatus.DEPLOYING - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(0) + deployment_state_manager.set_deployment_statuses_healthy("d1") app_state_manager.update() app_status = app_state_manager.get_app_status("app1") assert app_status.status == ApplicationStatus.RUNNING @@ -230,17 +251,20 @@ def test_deploy_with_renamed_app(): app_state_manager.deploy_application( "app2", [{"name": "d2", "route_prefix": "/url1"}] ) + # Simulate controller + deployment_state_manager.deploy("d2", None) + app_status = app_state_manager.get_app_status("app2") assert app_status.status == ApplicationStatus.DEPLOYING # app2 deploys before app1 finishes deleting - app_state_manager.deployment_state_manager.set_deployment_statuses_healthy(1) + deployment_state_manager.set_deployment_statuses_healthy("d2") app_state_manager.update() app_status = app_state_manager.get_app_status("app2") assert app_status.status == ApplicationStatus.RUNNING # app1 finally finishes deleting - app_state_manager.deployment_state_manager.delete_deployment("d1") + deployment_state_manager.delete_deployment("d1") app_state_manager.update() app_status = app_state_manager.get_app_status("app1") assert app_status.status == ApplicationStatus.NOT_STARTED diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index 0bdcb4b7b75c..efe330e8769d 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -17,10 +17,14 @@ from ray.serve._private.common import 
DeploymentInfo from ray.serve._private.common import ReplicaState from ray.serve.config import AutoscalingConfig -from ray.serve._private.constants import CONTROL_LOOP_PERIOD_S +from ray.serve._private.constants import ( + CONTROL_LOOP_PERIOD_S, + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) from ray.serve.controller import ServeController from ray.serve.deployment import Deployment -import ray.experimental.state.api as state_api +import ray.util.state as state_api from ray.dashboard.modules.serve.sdk import ServeSubmissionClient import ray @@ -108,28 +112,38 @@ def test_smoothing_factor(self): assert 5 <= desired_num_replicas <= 8 # 10 + 0.5 * (2.5 - 10) = 6.25 -def get_running_replicas(controller: ServeController, deployment: Deployment) -> List: +def get_running_replicas( + controller: ServeController, deployment: Deployment, app_name +) -> List: """Get the replicas currently running for given deployment""" + if app_name: + deployment_name = app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment.name + else: + deployment_name = deployment.name replicas = ray.get( - controller._dump_replica_states_for_testing.remote(deployment.name) + controller._dump_replica_states_for_testing.remote(deployment_name) ) running_replicas = replicas.get([ReplicaState.RUNNING]) return running_replicas def get_running_replica_tags( - controller: ServeController, deployment: Deployment + controller: ServeController, + deployment: Deployment, + app_name: str = SERVE_DEFAULT_APP_NAME, ) -> List: """Get the replica tags of running replicas for given deployment""" - running_replicas = get_running_replicas(controller, deployment) + running_replicas = get_running_replicas(controller, deployment, app_name) return [replica.replica_tag for replica in running_replicas] def get_num_running_replicas( - controller: ServeController, deployment: Deployment + controller: ServeController, + deployment: Deployment, + app_name: str = SERVE_DEFAULT_APP_NAME, ) -> int: """Get the 
amount of replicas currently running for given deployment""" - running_replicas = get_running_replicas(controller, deployment) + running_replicas = get_running_replicas(controller, deployment, app_name) return len(running_replicas) @@ -167,7 +181,11 @@ def test_assert_no_replicas_deprovisioned(): assert_no_replicas_deprovisioned(replica_tags_2, replica_tags_1) -def get_deployment_start_time(controller: ServeController, deployment: Deployment): +def get_deployment_start_time( + controller: ServeController, + deployment: Deployment, + app_name: str = SERVE_DEFAULT_APP_NAME, +): """Return start time for given deployment""" deployment_route_list = DeploymentRouteList.FromString( ray.get(controller.list_deployments.remote()) @@ -179,7 +197,11 @@ def get_deployment_start_time(controller: ServeController, deployment: Deploymen ) for deployment_route in deployment_route_list.deployment_routes } - deployment_info, _route_prefix = deployments[deployment.name] + if app_name: + deployment_name = app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment.name + else: + deployment_name = deployment.name + deployment_info, _route_prefix = deployments[deployment_name] return deployment_info.start_time_ms @@ -673,7 +695,6 @@ def __call__(self): controller = serve_instance._controller start_time = get_deployment_start_time(controller, A) - A.get_handle() [handle.remote() for _ in range(50)] wait_for_condition( @@ -830,19 +851,19 @@ def __call__(self): print("Deployed A.") controller = serve_instance._controller - start_time = get_deployment_start_time(controller, A) + start_time = get_deployment_start_time(controller, A, app_name=None) - assert get_num_running_replicas(controller, A) == 0 + assert get_num_running_replicas(controller, A, app_name=None) == 0 handle = A.get_handle() [handle.remote() for _ in range(1)] print("Issued one request.") time.sleep(2) - assert get_num_running_replicas(controller, A) == 1 + assert get_num_running_replicas(controller, A, app_name=None) == 1 
print("Scale up to 1 replica.") - first_deployment_replicas = get_running_replica_tags(controller, A) + first_deployment_replicas = get_running_replica_tags(controller, A, app_name=None) A.options( autoscaling_config={ @@ -859,14 +880,16 @@ def __call__(self): ).deploy() print("Redeployed A with min_replicas set to 2.") - wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 2) + wait_for_condition( + lambda: get_num_running_replicas(controller, A, app_name=None) >= 2 + ) time.sleep(5) # Confirm that autoscaler doesn't scale above 2 even after waiting - assert get_num_running_replicas(controller, A) == 2 + assert get_num_running_replicas(controller, A, app_name=None) == 2 print("Autoscaled to 2 without issuing any new requests.") - second_deployment_replicas = get_running_replica_tags(controller, A) + second_deployment_replicas = get_running_replica_tags(controller, A, app_name=None) # Confirm that none of the original replicas were de-provisioned assert_no_replicas_deprovisioned( @@ -878,12 +901,14 @@ def __call__(self): print("Completed request.") # As the queue is drained, we should scale back down. 
- wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 2) - assert get_num_running_replicas(controller, A) > 1 + wait_for_condition( + lambda: get_num_running_replicas(controller, A, app_name=None) <= 2 + ) + assert get_num_running_replicas(controller, A, app_name=None) > 1 print("Stayed at 2 replicas.") # Make sure start time did not change for the deployment - assert get_deployment_start_time(controller, A) == start_time + assert get_deployment_start_time(controller, A, app_name=None) == start_time @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") @@ -903,7 +928,15 @@ def f(): # f should start with initial_replicas (2) deployments actors = state_api.list_actors( - filters=[("class_name", "=", "ServeReplica:f"), ("state", "=", "ALIVE")] + filters=[ + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f", + ), + ("state", "=", "ALIVE"), + ] ) print(actors) assert len(actors) == 2 @@ -911,7 +944,15 @@ def f(): # f should scale down to min_replicas (1) deployments def check_one_replica(): actors = state_api.list_actors( - filters=[("class_name", "=", "ServeReplica:f"), ("state", "=", "ALIVE")] + filters=[ + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f", + ), + ("state", "=", "ALIVE"), + ] ) return len(actors) == 1 @@ -946,7 +987,12 @@ def scaler(): def check_two_replicas(): actors = state_api.list_actors( filters=[ - ("class_name", "=", "ServeReplica:scaler"), + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}scaler", + ), ("state", "=", "ALIVE"), ] ) @@ -969,12 +1015,25 @@ def check_two_replicas(): def check_num_replicas(live: int, dead: int): live_actors = state_api.list_actors( filters=[ - ("class_name", "=", "ServeReplica:scaler"), + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}scaler", + 
), ("state", "=", "ALIVE"), ] ) dead_actors = state_api.list_actors( - filters=[("class_name", "=", "ServeReplica:scaler"), ("state", "=", "DEAD")] + filters=[ + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}scaler", + ), + ("state", "=", "DEAD"), + ] ) return len(live_actors) == live and len(dead_actors) == dead @@ -1063,7 +1122,15 @@ def send_request(): def check_num_replicas(num: int): actors = state_api.list_actors( - filters=[("class_name", "=", "ServeReplica:g"), ("state", "=", "ALIVE")] + filters=[ + ( + "class_name", + "=", + f"ServeReplica:{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}g", + ), + ("state", "=", "ALIVE"), + ] ) return len(actors) == num diff --git a/python/ray/serve/tests/test_application.py b/python/ray/serve/tests/test_built_application.py similarity index 89% rename from python/ray/serve/tests/test_application.py rename to python/ray/serve/tests/test_built_application.py index 4919b32c0bd6..9269f77d57d2 100644 --- a/python/ray/serve/tests/test_application.py +++ b/python/ray/serve/tests/test_built_application.py @@ -3,11 +3,11 @@ import ray from ray import serve -from ray.serve.application import Application +from ray.serve.built_application import BuiltApplication from ray._private.test_utils import wait_for_condition -class TestApplicationConstruction: +class TestBuiltApplicationConstruction: @serve.deployment def f(*args): return "got f" @@ -18,7 +18,7 @@ def __call__(self, *args): return "got C" def test_valid_deployments(self): - app = Application([self.f, self.C]) + app = BuiltApplication([self.f, self.C]) assert len(app.deployments) == 2 app_deployment_names = {d.name for d in app.deployments.values()} @@ -27,14 +27,14 @@ def test_valid_deployments(self): def test_repeated_deployment_names(self): with pytest.raises(ValueError): - Application([self.f, self.C.options(name="f")]) + BuiltApplication([self.f, self.C.options(name="f")]) with 
pytest.raises(ValueError): - Application([self.C, self.f.options(name="C")]) + BuiltApplication([self.C, self.f.options(name="C")]) def test_non_deployments(self): with pytest.raises(TypeError): - Application([self.f, 5, "hello"]) + BuiltApplication([self.f, 5, "hello"]) class TestServeRun: @@ -68,7 +68,7 @@ def deploy_and_check_responses( for i in range(len(deployments)): serve.run( - Application([deployments[i]]), + BuiltApplication([deployments[i]]), name=f"app{i}", _blocking=blocking, ) @@ -102,7 +102,7 @@ def test_basic_run(self, serve_instance): self.deploy_and_check_responses(deployments, responses) def test_non_blocking_run(self, serve_instance): - """Checks Application's deploy() behavior when blocking=False.""" + """Checks BuiltApplication's deploy() behavior when blocking=False.""" deployments = [self.f, self.g, self.C, self.D] responses = ["f reached", "g reached", "C reached", "D reached"] @@ -144,14 +144,14 @@ async def request_echo(self, echo: str): MutualHandles.options(name=deployment_name, init_args=(handle_name,)) ) - serve.run(Application(deployments), _blocking=True) + serve.run(BuiltApplication(deployments), _blocking=True) for deployment in deployments: assert (ray.get(deployment.get_handle().remote("hello"))) == "hello" def test_decorated_deployments(self, serve_instance): """ - Checks Application's deploy behavior when deployments have options set + Checks BuiltApplication's deploy behavior when deployments have options set in their @serve.deployment decorator. 
""" @@ -170,18 +170,18 @@ async def __call__(self): self.deploy_and_check_responses(deployments, responses) def test_empty_list(self, serve_instance): - """Checks Application's deploy behavior when deployment group is empty.""" + """Checks BuiltApplication's deploy behavior when deployment group is empty.""" self.deploy_and_check_responses([], []) def test_invalid_input(self, serve_instance): """ - Checks Application's deploy behavior when deployment group contains + Checks BuiltApplication's deploy behavior when deployment group contains non-Deployment objects. """ with pytest.raises(TypeError): - Application([self.f, self.C, "not a Deployment object"]).deploy( + BuiltApplication([self.f, self.C, "not a Deployment object"]).deploy( blocking=True ) @@ -242,11 +242,11 @@ def test_different_pymodules(self, serve_instance): def test_import_path_deployment_decorated(self, serve_instance): func = serve.deployment(name="decorated_func", route_prefix="/decorated_func")( - "ray.serve.tests.test_application.decorated_func" + "ray.serve.tests.test_built_application.decorated_func" ) clss = serve.deployment(name="decorated_clss", route_prefix="/decorated_clss")( - "ray.serve.tests.test_application.DecoratedClass" + "ray.serve.tests.test_built_application.DecoratedClass" ) deployments = [func, clss] @@ -273,7 +273,7 @@ def __call__(self, req=None): def test_immutable_deployment_list(serve_instance): - app = Application([DecoratedClass, decorated_func]) + app = BuiltApplication([DecoratedClass, decorated_func]) assert len(app.deployments.values()) == 2 for name in app.deployments.keys(): diff --git a/python/ray/serve/tests/test_cli.py b/python/ray/serve/tests/test_cli.py index 217e1e56512a..976ed497e9cf 100644 --- a/python/ray/serve/tests/test_cli.py +++ b/python/ray/serve/tests/test_cli.py @@ -7,20 +7,26 @@ from tempfile import NamedTemporaryFile from typing import List +import click +from pydantic import BaseModel import pytest import requests import yaml import ray from ray 
import serve -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray._private.test_utils import wait_for_condition from ray.serve.schema import ServeApplicationSchema from ray.serve._private.constants import SERVE_NAMESPACE, MULTI_APP_MIGRATION_MESSAGE from ray.serve.deployment_graph import RayServeDAGHandle from ray.tests.conftest import tmp_working_dir # noqa: F401, E501 from ray.dashboard.modules.serve.sdk import ServeSubmissionClient -from ray.serve.scripts import remove_ansi_escape_sequences +from ray.serve.scripts import convert_args_to_dict, remove_ansi_escape_sequences +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) CONNECTION_ERROR_MSG = "connection error" @@ -47,6 +53,20 @@ def assert_deployments_live(names: List[str]): assert all_deployments_live, f'"{nonliving_deployment}" deployment is not live.' +def test_convert_args_to_dict(): + assert convert_args_to_dict(tuple()) == {} + + with pytest.raises( + click.ClickException, match="Invalid application argument 'bad_arg'" + ): + convert_args_to_dict(("bad_arg",)) + + assert convert_args_to_dict(("key1=val1", "key2=val2")) == { + "key1": "val1", + "key2": "val2", + } + + def test_start_shutdown(ray_start_stop): subprocess.check_output(["serve", "start"]) subprocess.check_output(["serve", "shutdown", "-y"]) @@ -66,7 +86,7 @@ def test_deploy(ray_start_stop): os.path.dirname(__file__), "test_config_files", "arithmetic.yaml" ) - success_message_fragment = b"Sent deploy request successfully!" + success_message_fragment = b"Sent deploy request successfully." 
# Ensure the CLI is idempotent num_iterations = 2 @@ -91,11 +111,11 @@ def test_deploy(ray_start_stop): print("Deployments are reachable over HTTP.") deployment_names = [ - "DAGDriver", - "create_order", - "Router", - "Multiplier", - "Adder", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}DAGDriver", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}create_order", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Router", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Multiplier", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Adder", ] assert_deployments_live(deployment_names) print("All deployments are live.\n") @@ -119,7 +139,12 @@ def test_deploy(ray_start_stop): ) print("Deployments are reachable over HTTP.") - deployment_names = ["DAGDriver", "Router", "Add", "Subtract"] + deployment_names = [ + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}DAGDriver", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Router", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Add", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Subtract", + ] assert_deployments_live(deployment_names) print("All deployments are live.\n") @@ -136,7 +161,7 @@ def test_deploy_with_http_options(ray_start_stop): f2 = os.path.join( os.path.dirname(__file__), "test_config_files", "basic_graph.yaml" ) - success_message_fragment = b"Sent deploy request successfully!" + success_message_fragment = b"Sent deploy request successfully." with open(f1, "r") as config_file: config = yaml.safe_load(config_file) @@ -183,7 +208,7 @@ def test_deploy_multi_app(ray_start_stop): os.path.dirname(__file__), "test_config_files", "pizza_world.yaml" ) - success_message_fragment = b"Sent deploy request successfully!" + success_message_fragment = b"Sent deploy request successfully." 
# Ensure the CLI is idempotent num_iterations = 2 @@ -355,6 +380,38 @@ def test_deploy_single_with_name(ray_start_stop): assert MULTI_APP_MIGRATION_MESSAGE in e.value.output.decode("utf-8") +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +def test_deploy_multi_app_builder_with_args(ray_start_stop): + """Deploys a config file containing multiple applications that take arguments.""" + # Create absolute file names to YAML config file. + apps_with_args = os.path.join( + os.path.dirname(__file__), "test_config_files", "apps_with_args.yaml" + ) + + subprocess.check_output(["serve", "deploy", apps_with_args]) + + wait_for_condition( + lambda: requests.post("http://localhost:8000/untyped_default").text + == "DEFAULT", + timeout=10, + ) + + wait_for_condition( + lambda: requests.post("http://localhost:8000/untyped_hello").text == "hello", + timeout=10, + ) + + wait_for_condition( + lambda: requests.post("http://localhost:8000/typed_default").text == "DEFAULT", + timeout=10, + ) + + wait_for_condition( + lambda: requests.post("http://localhost:8000/typed_hello").text == "hello", + timeout=10, + ) + + @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_config(ray_start_stop): """Deploys config and checks that `serve config` returns correct response.""" @@ -368,7 +425,7 @@ def test_config(ray_start_stop): config_file_name = os.path.join( os.path.dirname(__file__), "test_config_files", "basic_graph.yaml" ) - success_message_fragment = b"Sent deploy request successfully!" + success_message_fragment = b"Sent deploy request successfully." 
with open(config_file_name, "r") as config_file: config = yaml.safe_load(config_file) @@ -431,11 +488,11 @@ def num_live_deployments(): serve_status = yaml.safe_load(status_response) expected_deployments = { - "DAGDriver", - "Multiplier", - "Adder", - "Router", - "create_order", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}DAGDriver", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Multiplier", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Adder", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}Router", + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}create_order", } for status in serve_status["deployment_statuses"]: expected_deployments.remove(status["name"]) @@ -460,22 +517,20 @@ def test_status_error_msg_format(ray_start_stop): subprocess.check_output(["serve", "deploy", config_file_name]) - status_response = subprocess.check_output( - ["serve", "status", "-a", "http://localhost:52365/"] - ) - serve_status = yaml.safe_load(status_response) - print("serve_status", serve_status) - def check_for_failed_deployment(): + serve_status = yaml.safe_load( + subprocess.check_output( + ["serve", "status", "-a", "http://localhost:52365/"] + ) + ) app_status = ServeSubmissionClient("http://localhost:52365").get_status() return ( - len(serve_status["deployment_statuses"]) == 0 - and serve_status["app_status"]["status"] == "DEPLOY_FAILED" + serve_status["app_status"]["status"] == "DEPLOY_FAILED" and remove_ansi_escape_sequences(app_status["app_status"]["message"]) in serve_status["app_status"]["message"] ) - wait_for_condition(check_for_failed_deployment, timeout=2) + wait_for_condition(check_for_failed_deployment) @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") @@ -522,6 +577,56 @@ def check_for_failed_deployment(): wait_for_condition(check_for_failed_deployment) +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +def 
test_status_constructor_error(ray_start_stop): + """Deploys Serve deployment that errors out in constructor, checks that the + traceback is surfaced. + """ + + config_file_name = os.path.join( + os.path.dirname(__file__), "test_config_files", "deployment_fail.yaml" + ) + + subprocess.check_output(["serve", "deploy", config_file_name]) + + def check_for_failed_deployment(): + status_response = subprocess.check_output( + ["serve", "status", "-a", "http://localhost:52365/"] + ) + serve_status = yaml.safe_load(status_response) + return ( + serve_status["app_status"]["status"] == "DEPLOY_FAILED" + and "ZeroDivisionError" in serve_status["deployment_statuses"][0]["message"] + ) + + wait_for_condition(check_for_failed_deployment) + + +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +def test_status_package_unavailable_in_controller(ray_start_stop): + """Test that exceptions raised from packages that are installed on deployment actors + but not on controller is serialized and surfaced properly. 
+ """ + + config_file_name = os.path.join( + os.path.dirname(__file__), "test_config_files", "sqlalchemy.yaml" + ) + + subprocess.check_output(["serve", "deploy", config_file_name]) + + def check_for_failed_deployment(): + status_response = subprocess.check_output( + ["serve", "status", "-a", "http://localhost:52365/"] + ) + serve_status = yaml.safe_load(status_response) + return ( + serve_status["app_status"]["status"] == "DEPLOY_FAILED" + and "some_wrong_url" in serve_status["deployment_statuses"][0]["message"] + ) + + wait_for_condition(check_for_failed_deployment, timeout=15) + + @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_status_multi_app(ray_start_stop): """Deploys a multi-app config file and checks their status.""" @@ -753,6 +858,72 @@ def test_run_deployment_node(ray_start_stop): assert ping_endpoint("Macaw") == CONNECTION_ERROR_MSG +@serve.deployment +class Echo: + def __init__(self, message: str): + print("Echo message:", message) + self._message = message + + def __call__(self, *args): + return self._message + + +def build_echo_app(args): + return Echo.bind(args.get("message", "DEFAULT")) + + +class TypedArgs(BaseModel): + message: str = "DEFAULT" + + +def build_echo_app_typed(args: TypedArgs): + return Echo.bind(args.message) + + +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +@pytest.mark.parametrize( + "import_path", + [ + "ray.serve.tests.test_cli.build_echo_app", + "ray.serve.tests.test_cli.build_echo_app_typed", + ], +) +def test_run_builder_with_args(ray_start_stop, import_path: str): + """Test `serve run` with args passed into a builder function. + + Tests both the untyped and typed args cases. + """ + # First deploy without any arguments, should get default response. 
+ p = subprocess.Popen( + [ + "serve", + "run", + "--address=auto", + import_path, + ] + ) + wait_for_condition(lambda: ping_endpoint("") == "DEFAULT", timeout=10) + p.send_signal(signal.SIGINT) + p.wait() + assert ping_endpoint("") == CONNECTION_ERROR_MSG + + # Now deploy passing a message as an argument, should get passed message. + p = subprocess.Popen( + [ + "serve", + "run", + "--address=auto", + import_path, + "message=hello world", + ] + ) + wait_for_condition(lambda: ping_endpoint("") == "hello world", timeout=10) + + p.send_signal(signal.SIGINT) + p.wait() + assert ping_endpoint("") == CONNECTION_ERROR_MSG + + @serve.deployment class MetalDetector: def __call__(self, *args): @@ -913,7 +1084,6 @@ async def __call__(self): @pytest.mark.parametrize("node", ["TestBuildFNode", "TestBuildDagNode"]) def test_build(ray_start_stop, node): with NamedTemporaryFile(mode="w+", suffix=".yaml") as tmp: - print(f'Building node "{node}".') # Build an app subprocess.check_output( @@ -945,7 +1115,6 @@ def test_build(ray_start_stop, node): @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_build_multi_app(ray_start_stop): with NamedTemporaryFile(mode="w+", suffix=".yaml") as tmp: - print('Building nodes "TestApp1Node" and "TestApp2Node".') # Build an app subprocess.check_output( @@ -1030,7 +1199,7 @@ def test_idempotence_after_controller_death(ray_start_stop, use_command: bool): config_file_name = os.path.join( os.path.dirname(__file__), "test_config_files", "basic_graph.yaml" ) - success_message_fragment = b"Sent deploy request successfully!" + success_message_fragment = b"Sent deploy request successfully." 
deploy_response = subprocess.check_output(["serve", "deploy", config_file_name]) assert success_message_fragment in deploy_response diff --git a/python/ray/serve/tests/test_cluster.py b/python/ray/serve/tests/test_cluster.py index 97d8927894d1..209e6da71c76 100644 --- a/python/ray/serve/tests/test_cluster.py +++ b/python/ray/serve/tests/test_cluster.py @@ -152,7 +152,7 @@ def get_replicas(replica_state): replica = get_replicas(ReplicaState.STARTING)[0] # currently there are no resources to allocate the replica - assert replica.check_started() == ReplicaStartupStatus.PENDING_ALLOCATION + assert replica.check_started()[0] == ReplicaStartupStatus.PENDING_ALLOCATION # add the necessary resources to allocate the replica cluster.add_node(num_cpus=4) @@ -160,7 +160,7 @@ def get_replicas(replica_state): wait_for_condition(lambda: (ray.available_resources().get("CPU", 0) >= 2)) def is_replica_pending_initialization(): - status = replica.check_started() + status, _ = replica.check_started() print(status) return status == ReplicaStartupStatus.PENDING_INITIALIZATION @@ -169,7 +169,7 @@ def is_replica_pending_initialization(): # send signal to complete replica intialization signal.send.remote() wait_for_condition( - lambda: replica.check_started() == ReplicaStartupStatus.SUCCEEDED + lambda: replica.check_started()[0] == ReplicaStartupStatus.SUCCEEDED ) diff --git a/python/ray/serve/tests/test_config_files/apps_with_args.yaml b/python/ray/serve/tests/test_config_files/apps_with_args.yaml new file mode 100644 index 000000000000..f2b95e6e071a --- /dev/null +++ b/python/ray/serve/tests/test_config_files/apps_with_args.yaml @@ -0,0 +1,17 @@ +applications: + - name: untyped_default + route_prefix: /untyped_default + import_path: ray.serve.tests.test_config_files.arg_builders.build_echo_app + - name: untyped_hello + route_prefix: /untyped_hello + import_path: ray.serve.tests.test_config_files.arg_builders.build_echo_app + args: + message: hello + - name: typed_default + route_prefix: 
/typed_default + import_path: ray.serve.tests.test_config_files.arg_builders.build_echo_app_typed + - name: typed_hello + route_prefix: /typed_hello + import_path: ray.serve.tests.test_config_files.arg_builders.build_echo_app_typed + args: + message: hello diff --git a/python/ray/serve/tests/test_config_files/arg_builders.py b/python/ray/serve/tests/test_config_files/arg_builders.py new file mode 100644 index 000000000000..78bb5cf2b424 --- /dev/null +++ b/python/ray/serve/tests/test_config_files/arg_builders.py @@ -0,0 +1,25 @@ +from pydantic import BaseModel + +from ray import serve + + +class TypedArgs(BaseModel): + message: str = "DEFAULT" + + +@serve.deployment(ray_actor_options={"num_cpus": 0}) +class Echo: + def __init__(self, message: str): + print("Echo message:", message) + self._message = message + + def __call__(self, *args): + return self._message + + +def build_echo_app(args): + return Echo.bind(args.get("message", "DEFAULT")) + + +def build_echo_app_typed(args: TypedArgs): + return Echo.bind(args.message) diff --git a/python/ray/serve/tests/test_config_files/deployment_fail.yaml b/python/ray/serve/tests/test_config_files/deployment_fail.yaml index 25db75ea389d..a7d87c39243c 100644 --- a/python/ray/serve/tests/test_config_files/deployment_fail.yaml +++ b/python/ray/serve/tests/test_config_files/deployment_fail.yaml @@ -1 +1 @@ -import_path: fail.node +import_path: ray.serve.tests.test_config_files.fail.node diff --git a/python/ray/serve/tests/test_config_files/fail.py b/python/ray/serve/tests/test_config_files/fail.py index 72dca4d5e478..4b69ed6aed89 100644 --- a/python/ray/serve/tests/test_config_files/fail.py +++ b/python/ray/serve/tests/test_config_files/fail.py @@ -1 +1,10 @@ -1 / 0 +from ray import serve + + +@serve.deployment +class A: + def __init__(self): + 1 / 0 + + +node = A.bind() diff --git a/python/ray/serve/tests/test_config_files/pid.py b/python/ray/serve/tests/test_config_files/pid.py index 9b30c9de510d..3b8cf91ee5ef 100644 --- 
a/python/ray/serve/tests/test_config_files/pid.py +++ b/python/ray/serve/tests/test_config_files/pid.py @@ -1,28 +1,71 @@ from ray import serve -from ray.serve.deployment_graph import RayServeDAGHandle + +# from ray.serve.deployment_graph import RayServeDAGHandle import os +import time +import asyncio @serve.deployment class f: - def __init__(self, name: str = "default_name"): - self.name = name + def __init__(self, async_wait: bool = False): + self._async = async_wait + self.name = "default_name" + # for __call__() + self.ready = True + self.counter = 0 + # for check_health() + self.health_check_ready = True + self.health_check_counter = 0 + + async def get_counter(self, health_check=False) -> int: + if health_check: + return self.health_check_counter + else: + return self.counter + + def send(self, clear=False, health_check=False): + if health_check: + self.health_check_ready = not clear + else: + self.ready = not clear + + def wait(self, health_check=False): + if health_check: + while not self.health_check_ready: + time.sleep(0.1) + else: + while not self.ready: + time.sleep(0.1) + + async def async_wait(self, health_check=False): + if health_check: + while not self.health_check_ready: + await asyncio.sleep(0.1) + else: + while not self.ready: + await asyncio.sleep(0.1) def reconfigure(self, config: dict): self.name = config.get("name", "default_name") async def __call__(self): - return os.getpid() - + self.counter += 1 + if self._async: + await self.async_wait() + else: + self.wait() -@serve.deployment -class BasicDriver: - def __init__(self, dag: RayServeDAGHandle): - self.dag = dag + return os.getpid(), self.name - async def __call__(self): - return await self.dag.remote() + async def check_health(self): + self.health_check_counter += 1 + if self._async: + await self.async_wait(health_check=True) + else: + self.wait(health_check=True) node = f.bind() -bnode = BasicDriver.bind(node) +dup_node = f.bind() +async_node = f.bind(async_wait=True) diff --git 
a/python/ray/serve/tests/test_config_files/sqlalchemy.py b/python/ray/serve/tests/test_config_files/sqlalchemy.py new file mode 100644 index 000000000000..6ec900f40f4d --- /dev/null +++ b/python/ray/serve/tests/test_config_files/sqlalchemy.py @@ -0,0 +1,15 @@ +from ray import serve + + +@serve.deployment +class TestDeployment: + def __init__(self): + from sqlalchemy import create_engine + import pymysql + + pymysql.install_as_MySQLdb() + + create_engine("mysql://some_wrong_url:3306").connect() + + +app = TestDeployment.bind() diff --git a/python/ray/serve/tests/test_config_files/sqlalchemy.yaml b/python/ray/serve/tests/test_config_files/sqlalchemy.yaml new file mode 100644 index 000000000000..ebc54442a148 --- /dev/null +++ b/python/ray/serve/tests/test_config_files/sqlalchemy.yaml @@ -0,0 +1,13 @@ +import_path: ray.serve.tests.test_config_files.sqlalchemy.app + +host: 127.0.0.1 +port: 8000 + +deployments: + - name: TestDeployment + num_replicas: 1 + ray_actor_options: + runtime_env: + pip: + - PyMySQL + - sqlalchemy==1.3.19 \ No newline at end of file diff --git a/python/ray/serve/tests/test_constructor_failure.py b/python/ray/serve/tests/test_constructor_failure.py index 56a592c1bc68..ffed1c5b868f 100644 --- a/python/ray/serve/tests/test_constructor_failure.py +++ b/python/ray/serve/tests/test_constructor_failure.py @@ -6,6 +6,14 @@ import ray from ray import serve +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) + + +def get_deployment_name(name: str): + return f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{name}" def test_deploy_with_consistent_constructor_failure(serve_instance): @@ -23,8 +31,9 @@ async def serve(self, request): # Assert no replicas are running in deployment deployment after failed # deploy call + deployment_name = get_deployment_name("ConstructorFailureDeploymentOneReplica") deployment_dict = ray.get(serve_instance._controller._all_running_replicas.remote()) - assert 
deployment_dict["ConstructorFailureDeploymentOneReplica"] == [] + assert deployment_dict[deployment_name] == [] # # Test failed to deploy with total of 2 replicas @serve.deployment(num_replicas=2) @@ -40,8 +49,9 @@ async def serve(self, request): # Assert no replicas are running in deployment deployment after failed # deploy call + deployment_name = get_deployment_name("ConstructorFailureDeploymentTwoReplicas") deployment_dict = ray.get(serve_instance._controller._all_running_replicas.remote()) - assert deployment_dict["ConstructorFailureDeploymentTwoReplicas"] == [] + assert deployment_dict[deployment_name] == [] def test_deploy_with_partial_constructor_failure(serve_instance): @@ -75,7 +85,8 @@ async def serve(self, request): # Assert 2 replicas are running in deployment deployment after partially # successful deploy call deployment_dict = ray.get(serve_instance._controller._all_running_replicas.remote()) - assert len(deployment_dict["PartialConstructorFailureDeployment"]) == 2 + deployment_name = get_deployment_name("PartialConstructorFailureDeployment") + assert len(deployment_dict[deployment_name]) == 2 def test_deploy_with_transient_constructor_failure(serve_instance): @@ -101,7 +112,8 @@ async def serve(self, request): # Assert 2 replicas are running in deployment deployment after partially # successful deploy call with transient error deployment_dict = ray.get(serve_instance._controller._all_running_replicas.remote()) - assert len(deployment_dict["TransientConstructorFailureDeployment"]) == 2 + deployment_name = get_deployment_name("TransientConstructorFailureDeployment") + assert len(deployment_dict[deployment_name]) == 2 if __name__ == "__main__": diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py index 1e283cd3fea0..f5a3250e8c6b 100644 --- a/python/ray/serve/tests/test_controller.py +++ b/python/ray/serve/tests/test_controller.py @@ -1,13 +1,19 @@ import pytest import time -import copy import ray from ray 
import serve from ray.serve._private.common import DeploymentInfo from ray.serve.generated.serve_pb2 import DeploymentRoute -from ray.serve.controller import _generate_deployment_config_versions +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) + + +def get_deployment_name(name: str): + return f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{name}" def test_redeploy_start_time(serve_instance): @@ -20,8 +26,9 @@ def test(_): return "1" serve.run(test.bind()) + deployment_name = get_deployment_name("test") deployment_route = DeploymentRoute.FromString( - ray.get(controller.get_deployment_info.remote("test")) + ray.get(controller.get_deployment_info.remote(deployment_name)) ) deployment_info_1 = DeploymentInfo.from_proto(deployment_route.deployment_info) start_time_ms_1 = deployment_info_1.start_time_ms @@ -34,7 +41,7 @@ def test(_): serve.run(test.bind()) deployment_route = DeploymentRoute.FromString( - ray.get(controller.get_deployment_info.remote("test")) + ray.get(controller.get_deployment_info.remote(deployment_name)) ) deployment_info_2 = DeploymentInfo.from_proto(deployment_route.deployment_info) start_time_ms_2 = deployment_info_2.start_time_ms @@ -42,99 +49,6 @@ def test(_): assert start_time_ms_1 == start_time_ms_2 -@pytest.mark.parametrize("last_config_had_option", [True, False]) -@pytest.mark.parametrize( - "option_to_update,config_update", - [ - ("num_replicas", True), - ("autoscaling_config", True), - ("user_config", True), - ("ray_actor_options", False), - ], -) -def test_config_versions_deployments_update( - last_config_had_option: bool, option_to_update: str, config_update: bool -): - """ - Check that controller._generate_deployment_config_versions() has correct behavior - when the config options in the ``deployments`` field is updated. 
- """ - - options = { - "num_replicas": {"old": 1, "new": 2}, - "autoscaling_config": { - "old": None, - "new": {"max_replicas": 2}, - }, - "user_config": { - "old": None, - "new": {"name": "bob"}, - }, - "ray_actor_options": { - "old": {"num_cpus": 0.1}, - "new": {"num_cpus": 0.2}, - }, - } - - old_config = { - "import_path": "ray.serve.tests.test_config_files.pid.node", - "deployments": [{"name": "f"}], - } - - if last_config_had_option: - old_config["deployments"][0][option_to_update] = options[option_to_update][ - "old" - ] - - new_config = copy.deepcopy(old_config) - new_config["deployments"][0][option_to_update] = options[option_to_update]["new"] - - versions = {"f": "v1"} - new_versions = _generate_deployment_config_versions( - new_config, old_config, versions - ) - assert ( - new_versions.get("f") is not None - and (new_versions.get("f") == versions.get("f")) == config_update - ) - - -@pytest.mark.parametrize("field_to_update", ["import_path", "runtime_env", "both"]) -def test_config_versions_non_deployments_update(field_to_update: str): - """ - Check that controller._generate_deployment_config_versions() has correct behavior - when the the ``import_path`` and ``runtime_env`` fields are updated. 
- """ - - old_config = { - "import_path": "ray.serve.tests.test_config_files.pid.node", - "deployments": [ - { - "name": "f", - "num_replicas": 1, - "ray_actor_options": {"num_cpus": 0.1}, - } - ], - } - - new_config = copy.deepcopy(old_config) - if field_to_update == "import_path": - new_config["import_path"] = "ray.serve.tests.test_config_files.pid.bnode" - elif field_to_update == "runtime_env": - new_config["runtime_env"] = {"env_vars": {"test_var": "test_val"}} - elif field_to_update == "both": - new_config["import_path"] = "ray.serve.tests.test_config_files.pid.bnode" - new_config["runtime_env"] = {"env_vars": {"test_var": "test_val"}} - - versions = {"f": "v1"} - new_versions = _generate_deployment_config_versions( - new_config, old_config, versions - ) - assert new_versions.get("f") is not None and ( - new_versions.get("f") != versions.get("f") - ) - - if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_controller_recovery.py b/python/ray/serve/tests/test_controller_recovery.py index 13a4ce7a0d65..34765cf1ccd6 100644 --- a/python/ray/serve/tests/test_controller_recovery.py +++ b/python/ray/serve/tests/test_controller_recovery.py @@ -7,7 +7,7 @@ import ray from ray._private.test_utils import SignalActor -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray import serve from ray.serve._private.constants import ( @@ -23,6 +23,7 @@ def test_recover_start_from_replica_actor_names(serve_instance): """Test controller is able to recover starting -> running replicas from actor names. """ + # Test failed to deploy with total of 2 replicas, # but first constructor call fails. 
@serve.deployment(name="recover_start_from_replica_actor_names", num_replicas=2) @@ -33,7 +34,7 @@ def __init__(self): def __call__(self, *args): return "hii" - serve.run(TransientConstructorFailureDeployment.bind()) + serve.run(TransientConstructorFailureDeployment.bind(), name="app") for _ in range(10): response = request_with_retries( "/recover_start_from_replica_actor_names/", timeout=30 @@ -42,10 +43,10 @@ def __call__(self, *args): # Assert 2 replicas are running in deployment deployment after partially # successful deploy() call with transient error deployment_dict = ray.get(serve_instance._controller._all_running_replicas.remote()) - assert len(deployment_dict["recover_start_from_replica_actor_names"]) == 2 + assert len(deployment_dict["app_recover_start_from_replica_actor_names"]) == 2 replica_version_hash = None - for replica in deployment_dict["recover_start_from_replica_actor_names"]: + for replica in deployment_dict["app_recover_start_from_replica_actor_names"]: ref = replica.actor_handle.get_metadata.remote() _, version = ray.get(ref) if replica_version_hash is None: @@ -116,7 +117,7 @@ def test_recover_rolling_update_from_replica_actor_names(serve_instance): @ray.remote(num_cpus=0) def call(block=False): - handle = serve.get_deployment(name).get_handle() + handle = serve.get_deployment(f"app_{name}").get_handle() ret = ray.get(handle.handler.remote(block)) return ret.split("|")[0], ret.split("|")[1] @@ -167,7 +168,7 @@ def make_nonblocking_calls(expected, expect_blocking=False, num_returns=1): return responses, blocking - serve.run(V1.bind()) + serve.run(V1.bind(), name="app") responses1, _ = make_nonblocking_calls({"1": 2}, num_returns=2) pids1 = responses1["1"] @@ -182,9 +183,9 @@ def make_nonblocking_calls(expected, expect_blocking=False, num_returns=1): # Redeploy new version. Since there is one replica blocking, only one new # replica should be started up. 
V2 = V1.options(func_or_class=V2, version="2") - serve.run(V2.bind(), _blocking=False) + serve.run(V2.bind(), _blocking=False, name="app") with pytest.raises(TimeoutError): - client._wait_for_deployment_healthy(V2.name, timeout_s=0.1) + client._wait_for_deployment_healthy(f"app_{V2.name}", timeout_s=0.1) responses3, blocking3 = make_nonblocking_calls({"1": 1}, expect_blocking=True) ray.kill(serve.context._global_client._controller, no_restart=False) @@ -197,7 +198,7 @@ def make_nonblocking_calls(expected, expect_blocking=False, num_returns=1): # Now the goal and requests to the new version should complete. # We should have two running replicas of the new version. - client._wait_for_deployment_healthy(V2.name) + client._wait_for_deployment_healthy(f"app_{V2.name}") make_nonblocking_calls({"2": 2}, num_returns=2) @@ -222,7 +223,7 @@ async def __init__(self): def __call__(self, request): return f"1|{os.getpid()}" - serve.run(V1.bind(), _blocking=False) + serve.run(V1.bind(), _blocking=False, name="app") ray.get(pending_init_indicator.remote()) def get_actor_info(name: str): @@ -234,7 +235,7 @@ def get_actor_info(name: str): print(actor) return actor["name"], actor["pid"] - actor_tag, _ = get_actor_info(V1.name) + actor_tag, _ = get_actor_info(f"app_{V1.name}") _, controller1_pid = get_actor_info(SERVE_CONTROLLER_NAME) ray.kill(serve.context._global_client._controller, no_restart=False) # wait for controller is alive again @@ -243,9 +244,9 @@ def get_actor_info(name: str): # Let the actor proceed initialization ray.get(signal.send.remote()) - client._wait_for_deployment_healthy(V1.name) + client._wait_for_deployment_healthy(f"app_{V1.name}") # Make sure the actor before controller dead is staying alive. 
- assert actor_tag == get_actor_info(V1.name)[0] + assert actor_tag == get_actor_info(f"app_{V1.name}")[0] if __name__ == "__main__": diff --git a/python/ray/serve/tests/test_deploy.py b/python/ray/serve/tests/test_deploy.py index d01b597982fd..24b3dc001c41 100644 --- a/python/ray/serve/tests/test_deploy.py +++ b/python/ray/serve/tests/test_deploy.py @@ -12,6 +12,7 @@ from ray import serve from ray.serve.exceptions import RayServeException from ray.serve._private.utils import get_random_letters +from ray.serve.context import get_global_client @pytest.mark.parametrize("use_handle", [True, False]) @@ -210,7 +211,7 @@ def test_redeploy_single_replica(serve_instance, use_handle): @ray.remote def call(block=False): if use_handle: - handle = serve.get_deployment(name).get_handle() + handle = serve.get_deployment(f"app_{name}").get_handle() ret = ray.get(handle.handler.remote(block)) else: ret = requests.get( @@ -241,7 +242,7 @@ async def handler(self, *args): async def __call__(self, request): return await self.handler() - serve.run(V1.bind()) + serve.run(V1.bind(), name="app") ref1 = call.remote(block=False) val1, pid1 = ray.get(ref1) assert val1 == "1" @@ -253,9 +254,9 @@ async def __call__(self, request): # Redeploy new version. This should not go through until the old version # replica completely stops. V2 = V1.options(func_or_class=V2, version="2") - serve.run(V2.bind(), _blocking=False) + serve.run(V2.bind(), _blocking=False, name="app") with pytest.raises(TimeoutError): - client._wait_for_deployment_healthy(V2.name, timeout_s=0.1) + client._wait_for_deployment_healthy(f"app_{V2.name}", timeout_s=0.1) # It may take some time for the handle change to propagate and requests # to get sent to the new version. Repeatedly send requests until they @@ -283,7 +284,7 @@ async def __call__(self, request): assert pid2 == pid1 # Now the goal and request to the new version should complete. 
- client._wait_for_deployment_healthy(V2.name) + client._wait_for_deployment_healthy(f"app_{V2.name}") new_version_val, new_version_pid = ray.get(new_version_ref) assert new_version_val == "2" assert new_version_pid != pid2 @@ -301,7 +302,7 @@ def test_redeploy_multiple_replicas(serve_instance, use_handle): @ray.remote(num_cpus=0) def call(block=False): if use_handle: - handle = serve.get_deployment(name).get_handle() + handle = serve.get_deployment(f"app_{name}").get_handle() ret = ray.get(handle.handler.remote(block)) else: ret = requests.get( @@ -355,7 +356,7 @@ def make_nonblocking_calls(expected, expect_blocking=False): return responses, blocking - serve.run(V1.bind()) + serve.run(V1.bind(), name="app") responses1, _ = make_nonblocking_calls({"1": 2}) pids1 = responses1["1"] @@ -368,9 +369,9 @@ def make_nonblocking_calls(expected, expect_blocking=False): # Redeploy new version. Since there is one replica blocking, only one new # replica should be started up. V2 = V1.options(func_or_class=V2, version="2") - serve.run(V2.bind(), _blocking=False) + serve.run(V2.bind(), _blocking=False, name="app") with pytest.raises(TimeoutError): - client._wait_for_deployment_healthy(V2.name, timeout_s=0.1) + client._wait_for_deployment_healthy(f"app_{V2.name}", timeout_s=0.1) responses3, blocking3 = make_nonblocking_calls({"1": 1}, expect_blocking=True) # Signal the original call to exit. @@ -381,7 +382,7 @@ def make_nonblocking_calls(expected, expect_blocking=False): # Now the goal and requests to the new version should complete. # We should have two running replicas of the new version. 
- client._wait_for_deployment_healthy(V2.name) + client._wait_for_deployment_healthy(f"app_{V2.name}") make_nonblocking_calls({"2": 2}) @@ -511,7 +512,7 @@ def v1(*args): @ray.remote(num_cpus=0) def call(): if use_handle: - handle = v1.get_handle() + handle = get_global_client().get_handle(f"app_{name}", sync=True) ret = ray.get(handle.remote()) else: ret = requests.get(f"http://localhost:8000/{name}").text @@ -536,7 +537,7 @@ def make_calls(expected): return responses - serve.run(v1.bind()) + serve.run(v1.bind(), name="app") responses1 = make_calls({"1": 4}) pids1 = responses1["1"] @@ -544,7 +545,7 @@ def make_calls(expected): def v2(*args): return f"2|{os.getpid()}" - serve.run(v2.bind()) + serve.run(v2.bind(), name="app") responses2 = make_calls({"2": 2}) assert all(pid not in pids1 for pid in responses2["2"]) @@ -562,7 +563,7 @@ def v1(*args): @ray.remote(num_cpus=0) def call(): if use_handle: - handle = v1.get_handle() + handle = get_global_client().get_handle(f"app_{name}", sync=True) ret = ray.get(handle.remote()) else: ret = requests.get(f"http://localhost:8000/{name}").text @@ -587,7 +588,7 @@ def make_calls(expected): return responses - serve.run(v1.bind()) + serve.run(v1.bind(), name="app") responses1 = make_calls({"1": 2}) pids1 = responses1["1"] @@ -595,7 +596,7 @@ def make_calls(expected): def v2(*args): return f"2|{os.getpid()}" - serve.run(v2.bind()) + serve.run(v2.bind(), name="app") responses2 = make_calls({"2": 4}) assert all(pid not in pids1 for pid in responses2["2"]) @@ -606,8 +607,7 @@ class A: def b(self, *args): return "hello" - serve.run(A.bind()) - handle = A.get_handle() + handle = serve.run(A.bind(), name="app") # Legacy code path assert ray.get(handle.options(method_name="b").remote()) == "hello" diff --git a/python/ray/serve/tests/test_deploy_2.py b/python/ray/serve/tests/test_deploy_2.py index 3d0a8156be3a..61449bb618a8 100644 --- a/python/ray/serve/tests/test_deploy_2.py +++ b/python/ray/serve/tests/test_deploy_2.py @@ -16,6 +16,7 
@@ class TestGetDeployment: + # Test V1 API get_deployment() def get_deployment(self, name, use_list_api): if use_list_api: return serve.list_deployments()[name] @@ -33,8 +34,8 @@ def d(*args): with pytest.raises(KeyError): self.get_deployment(name, use_list_api) - handle = serve.run(d.bind()) - val1, pid1 = ray.get(handle.remote()) + d.deploy() + val1, pid1 = ray.get(d.get_handle().remote()) assert val1 == "1" del d @@ -52,7 +53,7 @@ def test_get_after_delete(self, serve_instance, use_list_api): def d(*args): return "1", os.getpid() - serve.run(d.bind()) + d.deploy() del d d2 = self.get_deployment(name, use_list_api) @@ -70,15 +71,15 @@ def test_deploy_new_version(self, serve_instance, use_list_api): def d(*args): return "1", os.getpid() - handle = serve.run(d.bind()) - val1, pid1 = ray.get(handle.remote()) + d.deploy() + val1, pid1 = ray.get(d.get_handle().remote()) assert val1 == "1" del d d2 = self.get_deployment(name, use_list_api) - handle = serve.run(d2.options(version="2").bind()) - val2, pid2 = ray.get(handle.remote()) + d2.options(version="2").deploy() + val2, pid2 = ray.get(d2.get_handle().remote()) assert val2 == "1" assert pid2 != pid1 @@ -90,15 +91,15 @@ def test_deploy_empty_version(self, serve_instance, use_list_api): def d(*args): return "1", os.getpid() - handle = serve.run(d.bind()) - val1, pid1 = ray.get(handle.remote()) + d.deploy() + val1, pid1 = ray.get(d.get_handle().remote()) assert val1 == "1" del d d2 = self.get_deployment(name, use_list_api) - handle = serve.run(d2.bind()) - val2, pid2 = ray.get(handle.remote()) + d2.deploy() + val2, pid2 = ray.get(d2.get_handle().remote()) assert val2 == "1" assert pid2 != pid1 @@ -144,12 +145,12 @@ def check_num_replicas(num): handle = self.get_deployment(name, use_list_api).get_handle() assert len(set(ray.get([handle.remote() for _ in range(50)]))) == num - serve.run(d.bind()) + d.deploy() check_num_replicas(1) del d d2 = self.get_deployment(name, use_list_api) - 
serve.run(d2.options(num_replicas=2).bind()) + d2.options(num_replicas=2).deploy() check_num_replicas(2) diff --git a/python/ray/serve/tests/test_deployment_graph.py b/python/ray/serve/tests/test_deployment_graph.py index 7451f7372517..f6633ca1353a 100644 --- a/python/ray/serve/tests/test_deployment_graph.py +++ b/python/ray/serve/tests/test_deployment_graph.py @@ -5,27 +5,29 @@ import numpy as np import requests +import starlette.requests import ray from ray import serve -from ray.serve.application import Application from ray.serve.api import build as build_app -from ray.serve.deployment_graph import RayServeDAGHandle -from ray.serve._private.deployment_graph_build import build as pipeline_build -from ray.serve.deployment_graph import ClassNode, InputNode +from ray.serve.built_application import BuiltApplication +from ray.serve.deployment import Application +from ray.serve.deployment_graph import InputNode, RayServeDAGHandle from ray.serve.drivers import DAGDriver -import starlette.requests +from ray.serve._private.deployment_graph_build import build as pipeline_build RayHandleLike = TypeVar("RayHandleLike") NESTED_HANDLE_KEY = "nested_handle" -def maybe_build(node: ClassNode, use_build: bool) -> Union[Application, ClassNode]: +def maybe_build( + app: Application, use_build: bool +) -> Union[Application, BuiltApplication]: if use_build: - return build_app(node) + return build_app(app) else: - return node + return app @serve.deployment @@ -166,8 +168,14 @@ def func_3(input): output_2 = func_2.bind(dag_input) output_3 = func_3.bind(output_2) ray_dag = combine.bind(output_1, output_2, kwargs_output=output_3) - with pytest.raises(ValueError, match="Please provide a driver class"): - _ = serve.run(ray_dag) + with pytest.raises( + ValueError, + match=( + "The ingress deployment to your application cannot be a " + "function if there are multiple deployment" + ), + ): + serve.run(ray_dag) serve_dag = DAGDriver.bind(ray_dag, http_adapter=json_resolver) @@ -476,12 +484,13 
@@ def get(self): tracker = CallTracker.bind() with InputNode() as inp: - dag = DAGDriver.bind(tracker.predict.bind(inp)) + dag = DAGDriver.bind( + {"/get": tracker.get.bind(), "/predict": tracker.predict.bind(inp)} + ) handle = serve.run(dag) - ray.get(handle.predict.remote(1)) + ray.get(handle.predict_with_route.remote("/predict", 1)) - call_tracker = CallTracker.get_handle() - assert ray.get(call_tracker.get.remote()) == ["predict"] + assert ray.get(handle.predict_with_route.remote("/get", 1)) == ["predict"] def test_sharing_call_for_broadcast(serve_instance): diff --git a/python/ray/serve/tests/test_deployment_graph_autoscaling.py b/python/ray/serve/tests/test_deployment_graph_autoscaling.py index 662f25104bdc..6b0256f0b0c4 100644 --- a/python/ray/serve/tests/test_deployment_graph_autoscaling.py +++ b/python/ray/serve/tests/test_deployment_graph_autoscaling.py @@ -9,12 +9,19 @@ from ray.dag.input_node import InputNode from ray.serve._private.common import ReplicaState from ray._private.test_utils import SignalActor, wait_for_condition +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) # Magic number to use for speed up scale from 0 replica serve_constants.HANDLE_METRIC_PUSH_INTERVAL_S = 1 def get_num_running_replicas(controller, deployment_name): + deployment_name = ( + SERVE_DEFAULT_APP_NAME + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment_name + ) replicas = ray.get( controller._dump_replica_states_for_testing.remote(deployment_name) ) diff --git a/python/ray/serve/tests/test_deployment_state.py b/python/ray/serve/tests/test_deployment_state.py index e4976a4c9abf..8932b1da2c80 100644 --- a/python/ray/serve/tests/test_deployment_state.py +++ b/python/ray/serve/tests/test_deployment_state.py @@ -27,6 +27,12 @@ VersionedReplica, rank_replicas_for_stopping, ) +from ray.serve._private.constants import ( + DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S, + DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S, + 
DEFAULT_HEALTH_CHECK_PERIOD_S, + DEFAULT_HEALTH_CHECK_TIMEOUT_S, +) from ray.serve._private.storage.kv_store import RayInternalKVStore from ray.serve._private.utils import get_random_letters from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy @@ -93,6 +99,18 @@ def actor_handle(self) -> MockActorHandle: def max_concurrent_queries(self) -> int: return 100 + @property + def pid(self) -> Optional[int]: + return None + + @property + def actor_id(self) -> Optional[str]: + return None + + @property + def worker_id(self) -> Optional[str]: + return None + @property def node_id(self) -> Optional[str]: if isinstance(self._scheduling_strategy, NodeAffinitySchedulingStrategy): @@ -101,6 +119,14 @@ def node_id(self) -> Optional[str]: return "node-id" return None + @property + def node_ip(self) -> Optional[str]: + return None + + @property + def log_file_path(self) -> Optional[str]: + return None + def set_ready(self): self.ready = ReplicaStartupStatus.SUCCEEDED @@ -122,11 +148,11 @@ def start(self, deployment_info: DeploymentInfo, version: DeploymentVersion): self.version = version self.deployment_info = deployment_info - def update_user_config(self, user_config: Any): + def reconfigure(self, version: DeploymentVersion): self.started = True - self.version = DeploymentVersion( - self.version.code_version, user_config=user_config - ) + updating = self.version.requires_actor_reconfigure(version) + self.version = version + return updating def recover(self): self.recovering = True @@ -140,7 +166,7 @@ def check_ready(self) -> ReplicaStartupStatus: self.recovering = False self.started = True self.version = self.starting_version - return ready, self.version + return ready, None def resource_requirements(self) -> Tuple[str, str]: assert self.started @@ -194,11 +220,17 @@ def deployment_info( else: code_version = get_random_letters() - version = DeploymentVersion(code_version, info.deployment_config.user_config) + version = DeploymentVersion( + code_version, 
info.deployment_config, info.replica_config.ray_actor_options + ) return info, version +def deployment_version(code_version) -> DeploymentVersion: + return DeploymentVersion(code_version, DeploymentConfig(), {}) + + class MockTimer: def __init__(self, start_time=None): if start_time is None: @@ -247,7 +279,7 @@ def mock_save_checkpoint_fn(*args, **kwargs): def replica(version: Optional[DeploymentVersion] = None) -> VersionedReplica: if version is None: - version = DeploymentVersion(get_random_letters(), None) + version = DeploymentVersion(get_random_letters(), DeploymentConfig(), {}) class MockVersionedReplica(VersionedReplica): def __init__(self, version: DeploymentVersion): @@ -257,6 +289,9 @@ def __init__(self, version: DeploymentVersion): def version(self): return self._version + def update_state(self, state): + pass + return MockVersionedReplica(version) @@ -264,9 +299,9 @@ class TestReplicaStateContainer: def test_count(self): c = ReplicaStateContainer() r1, r2, r3 = ( - replica(DeploymentVersion("1")), - replica(DeploymentVersion("2")), - replica(DeploymentVersion("2")), + replica(deployment_version("1")), + replica(deployment_version("2")), + replica(deployment_version("2")), ) c.add(ReplicaState.STARTING, r1) c.add(ReplicaState.STARTING, r2) @@ -281,42 +316,44 @@ def test_count(self): assert c.count(states=[ReplicaState.STOPPING]) == 1 # Test filtering by version. 
- assert c.count(version=DeploymentVersion("1")) == 1 - assert c.count(version=DeploymentVersion("2")) == 2 - assert c.count(version=DeploymentVersion("3")) == 0 - assert c.count(exclude_version=DeploymentVersion("1")) == 2 - assert c.count(exclude_version=DeploymentVersion("2")) == 1 - assert c.count(exclude_version=DeploymentVersion("3")) == 3 + assert c.count(version=deployment_version("1")) == 1 + assert c.count(version=deployment_version("2")) == 2 + assert c.count(version=deployment_version("3")) == 0 + assert c.count(exclude_version=deployment_version("1")) == 2 + assert c.count(exclude_version=deployment_version("2")) == 1 + assert c.count(exclude_version=deployment_version("3")) == 3 # Test filtering by state and version. assert ( - c.count(version=DeploymentVersion("1"), states=[ReplicaState.STARTING]) == 1 + c.count(version=deployment_version("1"), states=[ReplicaState.STARTING]) + == 1 ) assert ( - c.count(version=DeploymentVersion("3"), states=[ReplicaState.STARTING]) == 0 + c.count(version=deployment_version("3"), states=[ReplicaState.STARTING]) + == 0 ) assert ( c.count( - version=DeploymentVersion("2"), + version=deployment_version("2"), states=[ReplicaState.STARTING, ReplicaState.STOPPING], ) == 2 ) assert ( c.count( - exclude_version=DeploymentVersion("1"), states=[ReplicaState.STARTING] + exclude_version=deployment_version("1"), states=[ReplicaState.STARTING] ) == 1 ) assert ( c.count( - exclude_version=DeploymentVersion("3"), states=[ReplicaState.STARTING] + exclude_version=deployment_version("3"), states=[ReplicaState.STARTING] ) == 2 ) assert ( c.count( - exclude_version=DeploymentVersion("2"), + exclude_version=deployment_version("2"), states=[ReplicaState.STARTING, ReplicaState.STOPPING], ) == 1 @@ -347,18 +384,18 @@ def test_pop_basic(self): def test_pop_exclude_version(self): c = ReplicaStateContainer() r1, r2, r3 = ( - replica(DeploymentVersion("1")), - replica(DeploymentVersion("1")), - replica(DeploymentVersion("2")), + 
replica(deployment_version("1")), + replica(deployment_version("1")), + replica(deployment_version("2")), ) c.add(ReplicaState.STARTING, r1) c.add(ReplicaState.STARTING, r2) c.add(ReplicaState.STARTING, r3) - assert c.pop(exclude_version=DeploymentVersion("1")) == [r3] - assert not c.pop(exclude_version=DeploymentVersion("1")) - assert c.pop(exclude_version=DeploymentVersion("2")) == [r1, r2] - assert not c.pop(exclude_version=DeploymentVersion("2")) + assert c.pop(exclude_version=deployment_version("1")) == [r3] + assert not c.pop(exclude_version=deployment_version("1")) + assert c.pop(exclude_version=deployment_version("2")) == [r1, r2] + assert not c.pop(exclude_version=deployment_version("2")) assert not c.pop() def test_pop_max_replicas(self): @@ -409,10 +446,10 @@ def test_pop_states(self): def test_pop_integration(self): c = ReplicaStateContainer() r1, r2, r3, r4 = ( - replica(DeploymentVersion("1")), - replica(DeploymentVersion("2")), - replica(DeploymentVersion("2")), - replica(DeploymentVersion("3")), + replica(deployment_version("1")), + replica(deployment_version("2")), + replica(deployment_version("2")), + replica(deployment_version("3")), ) c.add(ReplicaState.STOPPING, r1) @@ -420,35 +457,35 @@ def test_pop_integration(self): c.add(ReplicaState.RUNNING, r3) c.add(ReplicaState.RUNNING, r4) assert not c.pop( - exclude_version=DeploymentVersion("1"), states=[ReplicaState.STOPPING] + exclude_version=deployment_version("1"), states=[ReplicaState.STOPPING] ) assert c.pop( - exclude_version=DeploymentVersion("1"), + exclude_version=deployment_version("1"), states=[ReplicaState.RUNNING], max_replicas=1, ) == [r3] assert c.pop( - exclude_version=DeploymentVersion("1"), + exclude_version=deployment_version("1"), states=[ReplicaState.RUNNING], max_replicas=1, ) == [r4] c.add(ReplicaState.RUNNING, r3) c.add(ReplicaState.RUNNING, r4) assert c.pop( - exclude_version=DeploymentVersion("1"), states=[ReplicaState.RUNNING] + exclude_version=deployment_version("1"), 
states=[ReplicaState.RUNNING] ) == [r3, r4] assert c.pop( - exclude_version=DeploymentVersion("1"), states=[ReplicaState.STARTING] + exclude_version=deployment_version("1"), states=[ReplicaState.STARTING] ) == [r2] c.add(ReplicaState.STARTING, r2) c.add(ReplicaState.RUNNING, r3) c.add(ReplicaState.RUNNING, r4) assert c.pop( - exclude_version=DeploymentVersion("1"), + exclude_version=deployment_version("1"), states=[ReplicaState.RUNNING, ReplicaState.STARTING], ) == [r3, r4, r2] assert c.pop( - exclude_version=DeploymentVersion("nonsense"), + exclude_version=deployment_version("nonsense"), states=[ReplicaState.STOPPING], ) == [r1] @@ -508,7 +545,7 @@ def test_create_delete_single_replica(mock_get_all_node_ids, mock_deployment_sta # Once it's done stopping, replica should be removed. replica = deployment_state._replicas.get()[0] replica._actor.set_done_stopping() - deleted = deployment_state.update() + deleted, _ = deployment_state.update() assert deleted check_counts(deployment_state, total=0) @@ -557,7 +594,7 @@ def test_force_kill(mock_get_all_node_ids, mock_deployment_state): # Once the replica is done stopping, it should be removed. 
replica = deployment_state._replicas.get()[0] replica._actor.set_done_stopping() - deleted = deployment_state.update() + deleted, _ = deployment_state.update() assert deleted check_counts(deployment_state, total=0) @@ -689,7 +726,7 @@ def test_redeploy_no_version(mock_get_all_node_ids, mock_deployment_state): check_counts(deployment_state, total=1, by_state=[(ReplicaState.STARTING, 1)]) assert deployment_state.curr_status_info.status == DeploymentStatus.UPDATING - deleted = deployment_state.update() + deleted, _ = deployment_state.update() assert not deleted check_counts(deployment_state, total=1, by_state=[(ReplicaState.RUNNING, 1)]) assert deployment_state.curr_status_info.status == DeploymentStatus.HEALTHY @@ -793,7 +830,7 @@ def test_redeploy_new_version(mock_get_all_node_ids, mock_deployment_state): by_state=[(ReplicaState.STARTING, 1)], ) - deleted = deployment_state.update() + deleted, _ = deployment_state.update() assert not deleted check_counts( deployment_state, @@ -805,8 +842,21 @@ def test_redeploy_new_version(mock_get_all_node_ids, mock_deployment_state): @pytest.mark.parametrize("mock_deployment_state", [True, False], indirect=True) +@pytest.mark.parametrize( + "option,value", + [ + ("user_config", {"hello": "world"}), + ("max_concurrent_queries", 10), + ("graceful_shutdown_timeout_s", DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_S + 1), + ("graceful_shutdown_wait_loop_s", DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S + 1), + ("health_check_period_s", DEFAULT_HEALTH_CHECK_PERIOD_S + 1), + ("health_check_timeout_s", DEFAULT_HEALTH_CHECK_TIMEOUT_S + 1), + ], +) @patch.object(DriverDeploymentState, "_get_all_node_ids") -def test_deploy_new_config_same_version(mock_get_all_node_ids, mock_deployment_state): +def test_deploy_new_config_same_code_version( + mock_get_all_node_ids, mock_deployment_state, option, value +): # Deploying a new config with the same version should not deploy a new # replica. 
deployment_state, timer = mock_deployment_state @@ -829,8 +879,8 @@ def test_deploy_new_config_same_version(mock_get_all_node_ids, mock_deployment_s ) assert deployment_state.curr_status_info.status == DeploymentStatus.HEALTHY - # Update to a new config without changing the version. - b_info_2, b_version_2 = deployment_info(version="1", user_config={"hello": "world"}) + # Update to a new config without changing the code version. + b_info_2, b_version_2 = deployment_info(version="1", **{option: value}) updated = deployment_state.deploy(b_info_2) assert updated assert deployment_state.curr_status_info.status == DeploymentStatus.UPDATING @@ -841,17 +891,17 @@ def test_deploy_new_config_same_version(mock_get_all_node_ids, mock_deployment_s by_state=[(ReplicaState.RUNNING, 1)], ) - deployment_state.update() - check_counts(deployment_state, total=1) - check_counts( - deployment_state, - version=b_version_2, - total=1, - by_state=[(ReplicaState.UPDATING, 1)], - ) - - # Mark the replica as ready. - deployment_state._replicas.get()[0]._actor.set_ready() + if option in ["user_config", "graceful_shutdown_wait_loop_s"]: + deployment_state.update() + check_counts(deployment_state, total=1) + check_counts( + deployment_state, + version=b_version_2, + total=1, + by_state=[(ReplicaState.UPDATING, 1)], + ) + # Mark the replica as ready. + deployment_state._replicas.get()[0]._actor.set_ready() deployment_state.update() check_counts(deployment_state, total=1) @@ -2148,7 +2198,8 @@ def test_resume_deployment_state_from_replica_tags( deployment_state_manager._deployment_states[tag] = deployment_state # Single replica should be created. - deployment_state_manager.update() + any_recovering = deployment_state_manager.update() + assert not any_recovering check_counts( deployment_state, total=1, @@ -2158,7 +2209,8 @@ def test_resume_deployment_state_from_replica_tags( deployment_state._replicas.get()[0]._actor.set_ready() # Now the replica should be marked running. 
- deployment_state_manager.update() + any_recovering = deployment_state_manager.update() + assert not any_recovering check_counts( deployment_state, total=1, @@ -2170,8 +2222,8 @@ def test_resume_deployment_state_from_replica_tags( # Step 2: Delete _replicas from deployment_state deployment_state._replicas = ReplicaStateContainer() - # Step 3: Create new deployment_state by resuming from passed in replicas + # Step 3: Create new deployment_state by resuming from passed in replicas deployment_state_manager._recover_from_checkpoint( [ReplicaName.prefix + mocked_replica.replica_tag] ) @@ -2183,11 +2235,12 @@ def test_resume_deployment_state_from_replica_tags( check_counts( deployment_state, total=1, version=None, by_state=[(ReplicaState.RECOVERING, 1)] ) - deployment_state._replicas.get()[0]._actor.set_ready() - deployment_state._replicas.get()[0]._actor.set_starting_version(b_version_1) # Now the replica should be marked running. - deployment_state_manager.update() + deployment_state._replicas.get()[0]._actor.set_ready() + deployment_state._replicas.get()[0]._actor.set_starting_version(b_version_1) + any_recovering = deployment_state_manager.update() + assert not any_recovering check_counts( deployment_state, total=1, @@ -2197,6 +2250,9 @@ def test_resume_deployment_state_from_replica_tags( # Ensure same replica name is used assert deployment_state._replicas.get()[0].replica_tag == mocked_replica.replica_tag + any_recovering = deployment_state_manager.update() + assert not any_recovering + def test_stopping_replicas_ranking(): @dataclass @@ -2229,7 +2285,7 @@ class FakeActor: available_resources = {} # Make a DeploymentReplica just to accesss its resource_requirement function - replica = DeploymentReplica(None, None, None, None, None) + replica = DeploymentReplica(None, None, "random_tag", None, None) replica._actor = FakeActor() # resource_requirements() should not error diff --git a/python/ray/serve/tests/test_deployment_version.py 
b/python/ray/serve/tests/test_deployment_version.py index 63a8fe823de4..cb63917ffd13 100644 --- a/python/ray/serve/tests/test_deployment_version.py +++ b/python/ray/serve/tests/test_deployment_version.py @@ -2,24 +2,17 @@ import ray from ray.serve._private.deployment_state import DeploymentVersion +from ray.serve.config import DeploymentConfig def test_validation(): # Code version must be a string. with pytest.raises(TypeError): - DeploymentVersion(123, None) - - # Can't pass unhashable type as user config. - with pytest.raises(TypeError): - DeploymentVersion(123, set()) - - # Can't pass nested unhashable type as user config. - with pytest.raises(TypeError): - DeploymentVersion(123, {"set": set()}) + DeploymentVersion(123, DeploymentConfig(), {}) def test_other_type_equality(): - v = DeploymentVersion("1", None) + v = DeploymentVersion("1", DeploymentConfig(), {}) assert v is not None assert v != "1" @@ -27,9 +20,9 @@ def test_other_type_equality(): def test_code_version(): - v1 = DeploymentVersion("1", None) - v2 = DeploymentVersion("1", None) - v3 = DeploymentVersion("2", None) + v1 = DeploymentVersion("1", DeploymentConfig(), {}) + v2 = DeploymentVersion("1", DeploymentConfig(), {}) + v3 = DeploymentVersion("2", DeploymentConfig(), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -37,10 +30,10 @@ def test_code_version(): assert hash(v1) != hash(v3) -def test_user_config_basic(): - v1 = DeploymentVersion("1", "1") - v2 = DeploymentVersion("1", "1") - v3 = DeploymentVersion("1", "2") +def test_deployment_config_basic(): + v1 = DeploymentVersion("1", DeploymentConfig(user_config="1"), {}) + v2 = DeploymentVersion("1", DeploymentConfig(user_config="1"), {}) + v3 = DeploymentVersion("1", DeploymentConfig(user_config="2"), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -49,9 +42,9 @@ def test_user_config_basic(): def test_user_config_hashable(): - v1 = DeploymentVersion("1", ("1", "2")) - v2 = DeploymentVersion("1", ("1", "2")) - v3 = DeploymentVersion("1", 
("1", "3")) + v1 = DeploymentVersion("1", DeploymentConfig(user_config=("1", "2")), {}) + v2 = DeploymentVersion("1", DeploymentConfig(user_config=("1", "2")), {}) + v3 = DeploymentVersion("1", DeploymentConfig(user_config=("1", "3")), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -60,9 +53,9 @@ def test_user_config_hashable(): def test_user_config_list(): - v1 = DeploymentVersion("1", ["1", "2"]) - v2 = DeploymentVersion("1", ["1", "2"]) - v3 = DeploymentVersion("1", ["1", "3"]) + v1 = DeploymentVersion("1", DeploymentConfig(user_config=["1", "2"]), {}) + v2 = DeploymentVersion("1", DeploymentConfig(user_config=["1", "2"]), {}) + v3 = DeploymentVersion("1", DeploymentConfig(user_config=["1", "3"]), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -71,9 +64,9 @@ def test_user_config_list(): def test_user_config_dict_keys(): - v1 = DeploymentVersion("1", {"1": "1"}) - v2 = DeploymentVersion("1", {"1": "1"}) - v3 = DeploymentVersion("1", {"2": "1"}) + v1 = DeploymentVersion("1", DeploymentConfig(user_config={"1": "1"}), {}) + v2 = DeploymentVersion("1", DeploymentConfig(user_config={"1": "1"}), {}) + v3 = DeploymentVersion("1", DeploymentConfig(user_config={"2": "1"}), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -82,9 +75,9 @@ def test_user_config_dict_keys(): def test_user_config_dict_vals(): - v1 = DeploymentVersion("1", {"1": "1"}) - v2 = DeploymentVersion("1", {"1": "1"}) - v3 = DeploymentVersion("1", {"1": "2"}) + v1 = DeploymentVersion("1", DeploymentConfig(user_config={"1": "1"}), {}) + v2 = DeploymentVersion("1", DeploymentConfig(user_config={"1": "1"}), {}) + v3 = DeploymentVersion("1", DeploymentConfig(user_config={"1": "2"}), {}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -93,9 +86,15 @@ def test_user_config_dict_vals(): def test_user_config_nested(): - v1 = DeploymentVersion("1", [{"1": "2"}, {"1": "2"}]) - v2 = DeploymentVersion("1", [{"1": "2"}, {"1": "2"}]) - v3 = DeploymentVersion("1", [{"1": "2"}, {"1": "3"}]) + v1 = 
DeploymentVersion( + "1", DeploymentConfig(user_config=[{"1": "2"}, {"1": "2"}]), {} + ) + v2 = DeploymentVersion( + "1", DeploymentConfig(user_config=[{"1": "2"}, {"1": "2"}]), {} + ) + v3 = DeploymentVersion( + "1", DeploymentConfig(user_config=[{"1": "2"}, {"1": "3"}]), {} + ) assert v1 == v2 assert hash(v1) == hash(v2) @@ -104,9 +103,101 @@ def test_user_config_nested(): def test_user_config_nested_in_hashable(): - v1 = DeploymentVersion("1", ([{"1": "2"}, {"1": "2"}],)) - v2 = DeploymentVersion("1", ([{"1": "2"}, {"1": "2"}],)) - v3 = DeploymentVersion("1", ([{"1": "2"}, {"1": "3"}],)) + v1 = DeploymentVersion( + "1", DeploymentConfig(user_config=([{"1": "2"}, {"1": "2"}])), {} + ) + v2 = DeploymentVersion( + "1", DeploymentConfig(user_config=([{"1": "2"}, {"1": "2"}])), {} + ) + v3 = DeploymentVersion( + "1", DeploymentConfig(user_config=([{"1": "2"}, {"1": "3"}])), {} + ) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_num_replicas(): + v1 = DeploymentVersion("1", DeploymentConfig(num_replicas=1), {}) + v2 = DeploymentVersion("1", DeploymentConfig(num_replicas=2), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + + +def test_autoscaling_config(): + v1 = DeploymentVersion( + "1", DeploymentConfig(autoscaling_config={"max_replicas": 2}), {} + ) + v2 = DeploymentVersion( + "1", DeploymentConfig(autoscaling_config={"max_replicas": 5}), {} + ) + + assert v1 == v2 + assert hash(v1) == hash(v2) + + +def test_max_concurrent_queries(): + v1 = DeploymentVersion("1", DeploymentConfig(max_concurrent_queries=5), {}) + v2 = DeploymentVersion("1", DeploymentConfig(max_concurrent_queries=5), {}) + v3 = DeploymentVersion("1", DeploymentConfig(max_concurrent_queries=10), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_health_check_period_s(): + v1 = DeploymentVersion("1", DeploymentConfig(health_check_period_s=5), {}) + v2 = 
DeploymentVersion("1", DeploymentConfig(health_check_period_s=5), {}) + v3 = DeploymentVersion("1", DeploymentConfig(health_check_period_s=10), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_health_check_timeout_s(): + v1 = DeploymentVersion("1", DeploymentConfig(health_check_timeout_s=5), {}) + v2 = DeploymentVersion("1", DeploymentConfig(health_check_timeout_s=5), {}) + v3 = DeploymentVersion("1", DeploymentConfig(health_check_timeout_s=10), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_graceful_shutdown_timeout_s(): + v1 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_timeout_s=5), {}) + v2 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_timeout_s=5), {}) + v3 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_timeout_s=10), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_graceful_shutdown_wait_loop_s(): + v1 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_wait_loop_s=5), {}) + v2 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_wait_loop_s=5), {}) + v3 = DeploymentVersion("1", DeploymentConfig(graceful_shutdown_wait_loop_s=10), {}) + + assert v1 == v2 + assert hash(v1) == hash(v2) + assert v1 != v3 + assert hash(v1) != hash(v3) + + +def test_ray_actor_options(): + v1 = DeploymentVersion("1", DeploymentConfig(), {"num_cpus": 0.1}) + v2 = DeploymentVersion("1", DeploymentConfig(), {"num_cpus": 0.1}) + v3 = DeploymentVersion("1", DeploymentConfig(), {"num_gpus": 0.1}) assert v1 == v2 assert hash(v1) == hash(v2) @@ -117,7 +208,11 @@ def test_user_config_nested_in_hashable(): def test_hash_consistent_across_processes(serve_instance): @ray.remote def get_version(): - return DeploymentVersion("1", ([{"1": "2"}, {"1": "2"}],)) + return DeploymentVersion( + "1", + DeploymentConfig(user_config=([{"1": "2"}, {"1": 
"2"}],)), + {}, + ) assert len(set(ray.get([get_version.remote() for _ in range(100)]))) == 1 diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index 6f35c5dd4351..98b6282553d2 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -7,6 +7,10 @@ import ray from ray import serve from ray._private.test_utils import wait_for_condition +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) def request_with_retries(endpoint, timeout=30): @@ -107,10 +111,13 @@ def check_new(): def _get_worker_handles(deployment): + deployment_name = ( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{deployment}" + ) controller = serve.context._global_client._controller deployment_dict = ray.get(controller._all_running_replicas.remote()) - return [replica.actor_handle for replica in deployment_dict[deployment]] + return [replica.actor_handle for replica in deployment_dict[deployment_name]] # Test that a worker dying unexpectedly causes it to restart and continue diff --git a/python/ray/serve/tests/test_gcs_failure.py b/python/ray/serve/tests/test_gcs_failure.py index 9efe0bb6cce2..c553a9684e2c 100644 --- a/python/ray/serve/tests/test_gcs_failure.py +++ b/python/ray/serve/tests/test_gcs_failure.py @@ -10,6 +10,11 @@ from ray._private.test_utils import wait_for_condition from ray.serve._private.storage.kv_store import KVStoreError, RayInternalKVStore from ray.tests.conftest import external_redis # noqa: F401 +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) +from ray.serve.context import get_global_client @pytest.fixture(scope="function") @@ -58,7 +63,12 @@ def d(*args): def call(): if use_handle: - ret = ray.get(d.get_handle().remote()) + deployment_name = ( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}d" + ) + ret = ray.get( + 
get_global_client().get_handle(deployment_name, sync=True).remote() + ) else: ret = requests.get("http://localhost:8000/d").text return ret diff --git a/python/ray/serve/tests/test_get_deployment.py b/python/ray/serve/tests/test_get_deployment.py index bc5f03db8f1f..0406b6695251 100644 --- a/python/ray/serve/tests/test_get_deployment.py +++ b/python/ray/serve/tests/test_get_deployment.py @@ -16,8 +16,8 @@ def d(*args): with pytest.raises(KeyError): serve.get_deployment(name=name) - handle = serve.run(d.bind()) - val1, pid1 = ray.get(handle.remote()) + d.deploy() + val1, pid1 = ray.get(d.get_handle().remote()) assert val1 == "1" del d diff --git a/python/ray/serve/tests/test_grpc.py b/python/ray/serve/tests/test_grpc.py index b5ff870cf57e..2a78cf78c319 100644 --- a/python/ray/serve/tests/test_grpc.py +++ b/python/ray/serve/tests/test_grpc.py @@ -1,19 +1,25 @@ +# coding: utf-8 import pytest - +import sys +import os from ray.serve.drivers import DefaultgRPCDriver, gRPCIngress import ray from ray import serve -from ray.serve.generated import serve_pb2, serve_pb2_grpc -import grpc from ray.cluster_utils import Cluster from ray.serve._private.constants import SERVE_NAMESPACE -from ray._private.test_utils import wait_for_condition +from ray._private.test_utils import wait_for_condition, run_string_as_driver from ray.serve.exceptions import RayServeException -from unittest.mock import patch - +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) -pytestmark = pytest.mark.asyncio +from unittest.mock import patch +from ray._private.test_utils import ( + setup_tls, + teardown_tls, +) @pytest.fixture @@ -33,8 +39,41 @@ def ray_cluster(): cluster.shutdown() -@patch("ray.serve._private.api.FLAG_DISABLE_HTTP_PROXY", True) -async def test_deploy_basic(serve_start_shutdown): +@pytest.fixture +def use_tls(request): + if request.param: + key_filepath, cert_filepath, temp_dir = setup_tls() + yield request.param + if request.param: 
+ teardown_tls(key_filepath, cert_filepath, temp_dir) + + +def tls_enabled(): + return os.environ.get("RAY_USE_TLS", "0").lower() in ("1", "true") + + +@pytest.mark.skipif( + sys.platform == "darwin", + reason=("Cryptography (TLS dependency) doesn't install in Mac build pipeline"), +) +@pytest.mark.parametrize("use_tls", [True], indirect=True) +def test_deploy_basic(use_tls): + if use_tls: + run_string_as_driver( + """ +# coding: utf-8 +import os +from ray.serve.drivers import DefaultgRPCDriver, gRPCIngress +import ray +from ray import serve +from ray.serve.generated import serve_pb2, serve_pb2_grpc +import grpc +from ray.serve.exceptions import RayServeException +from ray._private.tls_utils import load_certs_from_env +import logging +import asyncio +try: + ray.init() @serve.deployment class D1: def __call__(self, input): @@ -43,16 +82,67 @@ def __call__(self, input): serve.run(DefaultgRPCDriver.bind(D1.bind())) async def send_request(): - async with grpc.aio.insecure_channel("localhost:9000") as channel: + server_cert_chain, private_key, ca_cert = load_certs_from_env() + credentials = grpc.ssl_channel_credentials( + certificate_chain=server_cert_chain, + private_key=private_key, + root_certificates=ca_cert, + ) + + async with grpc.aio.secure_channel("localhost:9000", credentials) as channel: stub = serve_pb2_grpc.PredictAPIsServiceStub(channel) response = await stub.Predict( serve_pb2.PredictRequest(input={"a": bytes("123", "utf-8")}) ) return response - resp = await send_request() + resp = asyncio.run(send_request()) + assert resp.prediction == b"123" +finally: + serve.shutdown() + ray.shutdown() + """, + env=os.environ.copy(), + ) + else: + run_string_as_driver( + """ +# coding: utf-8 +import os +from ray.serve.drivers import DefaultgRPCDriver, gRPCIngress +import ray +from ray import serve +from ray.serve.generated import serve_pb2, serve_pb2_grpc +import grpc +from ray.serve.exceptions import RayServeException +from ray._private.tls_utils import 
load_certs_from_env +import logging +import asyncio +try: + ray.init() + @serve.deployment + class D1: + def __call__(self, input): + return input["a"] + serve.run(DefaultgRPCDriver.bind(D1.bind())) + + async def send_request(): + async with grpc.aio.insecure_channel("localhost:9000") as channel: + stub = serve_pb2_grpc.PredictAPIsServiceStub(channel) + response = await stub.Predict( + serve_pb2.PredictRequest(input={"a": bytes("123", "utf-8")}) + ) + return response + + resp = asyncio.run(send_request()) assert resp.prediction == b"123" +finally: + serve.shutdown() + ray.shutdown() + """, + env=os.environ.copy(), + ) @patch("ray.serve._private.api.FLAG_DISABLE_HTTP_PROXY", True) @@ -84,7 +174,10 @@ def __call__(self, input): replicas = ray.get( serve.context._global_client._controller._all_running_replicas.remote() ) - assert len(replicas["DefaultgRPCDriver"]) == 1 + deployment_name = ( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}DefaultgRPCDriver" + ) + assert len(replicas[deployment_name]) == 1 worker_node = cluster.add_node(num_cpus=2) @@ -92,7 +185,7 @@ def __call__(self, input): lambda: len( ray.get( serve.context._global_client._controller._all_running_replicas.remote() - )["DefaultgRPCDriver"] + )[deployment_name] ) == 2 ) @@ -104,7 +197,7 @@ def __call__(self, input): lambda: len( ray.get( serve.context._global_client._controller._all_running_replicas.remote() - )["DefaultgRPCDriver"] + )[deployment_name] ) == 1 ) diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 8c62814d9f02..344b418ab11f 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -7,6 +7,11 @@ import ray from ray import serve from ray.serve.exceptions import RayServeException +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) +from ray.serve.context import get_global_client @pytest.mark.asyncio @@ -80,7 +85,10 @@ def f(): handle 
= serve.run(f.bind()) def thread_get_handle(deploy): - handle = deploy.get_handle(sync=True) + deployment_name = ( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{deploy._name}" + ) + handle = get_global_client().get_handle(deployment_name, sync=True) return handle with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: diff --git a/python/ray/serve/tests/test_healthcheck.py b/python/ray/serve/tests/test_healthcheck.py index 03c6dbf5fe5c..d0a1fca074dc 100644 --- a/python/ray/serve/tests/test_healthcheck.py +++ b/python/ray/serve/tests/test_healthcheck.py @@ -6,6 +6,10 @@ from ray import serve from ray.serve._private.common import DeploymentStatus from ray.serve._private.constants import REPLICA_HEALTH_CHECK_UNHEALTHY_THRESHOLD +from ray.serve._private.constants import ( + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) class Counter: @@ -219,7 +223,8 @@ def __call__(self, *args): app_status = serve_instance.get_serve_status() assert ( - app_status.deployment_statuses[0].name == "AlwaysUnhealthy" + app_status.deployment_statuses[0].name + == f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}AlwaysUnhealthy" and app_status.deployment_statuses[0].status == DeploymentStatus.UNHEALTHY ) @@ -255,7 +260,8 @@ def __call__(self, *args): def check_status(expected_status: DeploymentStatus): app_status = serve_instance.get_serve_status() return ( - app_status.deployment_statuses[0].name == "WillBeUnhealthy" + app_status.deployment_statuses[0].name == f"{SERVE_DEFAULT_APP_NAME}" + f"{DEPLOYMENT_NAME_PREFIX_SEPARATOR}WillBeUnhealthy" and app_status.deployment_statuses[0].status == expected_status ) diff --git a/python/ray/serve/tests/test_http_prefix_matching.py b/python/ray/serve/tests/test_http_prefix_matching.py index d1c6c82c43e7..a0655d0851e0 100644 --- a/python/ray/serve/tests/test_http_prefix_matching.py +++ b/python/ray/serve/tests/test_http_prefix_matching.py @@ -14,87 +14,89 @@ def mock_get_handle(name, *args, 
**kwargs): def test_no_match(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/hello")}) - route, handle = router.match_route("/nonexistent") - assert route is None and handle is None + router.update_routes({"endpoint": EndpointInfo(route="/hello", app_name="")}) + route, handle, app_name = router.match_route("/nonexistent") + assert route is None and handle is None and app_name is None def test_default_route(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/endpoint")}) + router.update_routes({"endpoint": EndpointInfo(route="/endpoint", app_name="")}) - route, handle = router.match_route("/nonexistent") - assert route is None and handle is None + route, handle, app_name = router.match_route("/nonexistent") + assert route is None and handle is None and app_name is None - route, handle = router.match_route("/endpoint") - assert route == "/endpoint" and handle == "endpoint" + route, handle, app_name = router.match_route("/endpoint") + assert route == "/endpoint" and handle == "endpoint" and app_name == "" def test_trailing_slash(mock_longest_prefix_router): router = mock_longest_prefix_router router.update_routes( { - "endpoint": EndpointInfo(route="/test"), + "endpoint": EndpointInfo(route="/test", app_name=""), } ) - route, handle = router.match_route("/test/") + route, handle, _ = router.match_route("/test/") assert route == "/test" and handle == "endpoint" router.update_routes( { - "endpoint": EndpointInfo(route="/test/"), + "endpoint": EndpointInfo(route="/test/", app_name=""), } ) - route, handle = router.match_route("/test") - assert route is None and handle is None + route, handle, app_name = router.match_route("/test") + assert route is None and handle is None and app_name is None def test_prefix_match(mock_longest_prefix_router): router = mock_longest_prefix_router router.update_routes( { - "endpoint1": 
EndpointInfo(route="/test/test2"), - "endpoint2": EndpointInfo(route="/test"), - "endpoint3": EndpointInfo(route="/"), + "endpoint1": EndpointInfo(route="/test/test2", app_name=""), + "endpoint2": EndpointInfo(route="/test", app_name=""), + "endpoint3": EndpointInfo(route="/", app_name=""), } ) - route, handle = router.match_route("/test/test2/subpath") + route, handle, _ = router.match_route("/test/test2/subpath") assert route == "/test/test2" and handle == "endpoint1" - route, handle = router.match_route("/test/test2/") + route, handle, _ = router.match_route("/test/test2/") assert route == "/test/test2" and handle == "endpoint1" - route, handle = router.match_route("/test/test2") + route, handle, _ = router.match_route("/test/test2") assert route == "/test/test2" and handle == "endpoint1" - route, handle = router.match_route("/test/subpath") + route, handle, _ = router.match_route("/test/subpath") assert route == "/test" and handle == "endpoint2" - route, handle = router.match_route("/test/") + route, handle, _ = router.match_route("/test/") assert route == "/test" and handle == "endpoint2" - route, handle = router.match_route("/test") + route, handle, _ = router.match_route("/test") assert route == "/test" and handle == "endpoint2" - route, handle = router.match_route("/test2") + route, handle, _ = router.match_route("/test2") assert route == "/" and handle == "endpoint3" - route, handle = router.match_route("/") + route, handle, _ = router.match_route("/") assert route == "/" and handle == "endpoint3" def test_update_routes(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/endpoint")}) + router.update_routes({"endpoint": EndpointInfo(route="/endpoint", app_name="app1")}) - route, handle = router.match_route("/endpoint") - assert route == "/endpoint" and handle == "endpoint" + route, handle, app_name = router.match_route("/endpoint") + assert route == "/endpoint" and handle == "endpoint" 
and app_name == "app1" - router.update_routes({"endpoint2": EndpointInfo(route="/endpoint2")}) + router.update_routes( + {"endpoint2": EndpointInfo(route="/endpoint2", app_name="app2")} + ) - route, handle = router.match_route("/endpoint") - assert route is None and handle is None + route, handle, app_name = router.match_route("/endpoint") + assert route is None and handle is None and app_name is None - route, handle = router.match_route("/endpoint2") - assert route == "/endpoint2" and handle == "endpoint2" + route, handle, app_name = router.match_route("/endpoint2") + assert route == "/endpoint2" and handle == "endpoint2" and app_name == "app2" if __name__ == "__main__": diff --git a/python/ray/serve/tests/test_http_state.py b/python/ray/serve/tests/test_http_state.py index 957a79f95d1b..ae3870d292d5 100644 --- a/python/ray/serve/tests/test_http_state.py +++ b/python/ray/serve/tests/test_http_state.py @@ -1,9 +1,13 @@ +import json from unittest.mock import patch import pytest +import ray +from ray._private.test_utils import SignalActor, wait_for_condition from ray.serve.config import DeploymentMode, HTTPOptions -from ray.serve._private.http_state import HTTPState +from ray.serve._private.common import HTTPProxyStatus +from ray.serve._private.http_state import HTTPState, HTTPProxyState def test_node_selection(): @@ -64,6 +68,70 @@ def _make_http_state(http_options): assert set(another_seed) != set(selected_nodes) +def test_http_proxy_healthy(): + ray.init() + signal = SignalActor.remote() + + @ray.remote(num_cpus=0) + class MockHTTPProxyActor: + async def ready(self): + await signal.wait.remote() + return json.dumps(["mock_worker_id", "mock_log_file_path"]) + + async def check_health(self): + pass + + proxy = MockHTTPProxyActor.options(lifetime="detached").remote() + state = HTTPProxyState(proxy, "alice", "mock_node_id", "mock_node_ip") + assert state.status == HTTPProxyStatus.STARTING + + state.update() + assert state.status == HTTPProxyStatus.STARTING + + 
signal.send.remote() + + def check_proxy(status): + state.update() + return state.status == status + + wait_for_condition(check_proxy, status=HTTPProxyStatus.HEALTHY, timeout=2) + ray.shutdown() + + +def test_http_proxy_unhealthy(): + ray.init() + signal = SignalActor.remote() + + @ray.remote(num_cpus=0) + class MockHTTPProxyActor: + async def ready(self): + return json.dumps(["mock_worker_id", "mock_log_file_path"]) + + async def check_health(self): + await signal.wait.remote() + + with patch("ray.serve._private.http_state.PROXY_HEALTH_CHECK_PERIOD_S", 1): + proxy = MockHTTPProxyActor.options(lifetime="detached").remote() + state = HTTPProxyState(proxy, "alice", "mock_node_id", "mock_node_ip") + assert state.status == HTTPProxyStatus.STARTING + + def check_proxy(status): + state.update() + return state.status == status + + # Proxy actor is ready, so status should transition STARTING -> HEALTHY + wait_for_condition(check_proxy, status=HTTPProxyStatus.HEALTHY, timeout=2) + + # Health check is blocked, so status should transition HEALTHY -> UNHEALTHY + wait_for_condition(check_proxy, status=HTTPProxyStatus.UNHEALTHY, timeout=2) + + # Unblock health check, so status should transition UNHEALTHY -> HEALTHY + signal.send.remote() + wait_for_condition(check_proxy, status=HTTPProxyStatus.HEALTHY, timeout=2) + + ray.shutdown() + + if __name__ == "__main__": import sys diff --git a/python/ray/serve/tests/test_logging.py b/python/ray/serve/tests/test_logging.py index 561dceaa28b2..44e4f79ba9d7 100644 --- a/python/ray/serve/tests/test_logging.py +++ b/python/ray/serve/tests/test_logging.py @@ -7,11 +7,21 @@ import requests import starlette import pytest +import json import ray from ray import serve -from ray._private.test_utils import wait_for_condition import re +from ray.serve._private.logging_utils import ServeJSONFormatter +from ray.serve._private.common import ServeComponentType +from ray._private.test_utils import wait_for_condition + + +@pytest.fixture +def 
serve_and_ray_shutdown(): + serve.shutdown() + ray.shutdown() + yield def set_logging_config(monkeypatch, max_bytes, backup_count): @@ -156,9 +166,20 @@ def __call__(self, *args): assert replica_tag not in f.getvalue() -def test_context_information_in_logging(serve_instance): +@pytest.mark.parametrize("json_log_format", [False, True]) +def test_context_information_in_logging(serve_and_ray_shutdown, json_log_format): """Make sure all context information exist in the log message""" + if json_log_format: + serve_json_log_format = "1" + else: + serve_json_log_format = "0" + ray.init( + runtime_env={ + "env_vars": {"RAY_SERVE_ENABLE_JSON_LOGGING": serve_json_log_format} + } + ) + logger = logging.getLogger("ray.serve") @serve.deployment @@ -168,6 +189,9 @@ def fn(*args): return { "request_id": request_context.request_id, "route": request_context.route, + "app_name": request_context.app_name, + "log_file": logger.handlers[1].baseFilename, + "replica": serve.get_replica_context().replica_tag, } @serve.deployment @@ -178,6 +202,9 @@ def __call__(self, req: starlette.requests.Request): return { "request_id": request_context.request_id, "route": request_context.route, + "app_name": request_context.app_name, + "log_file": logger.handlers[1].baseFilename, + "replica": serve.get_replica_context().replica_tag, } serve.run(fn.bind(), name="app1", route_prefix="/fn") @@ -190,14 +217,14 @@ def __call__(self, req: starlette.requests.Request): # Check the component log expected_log_infos = [ - f"{resp['request_id']} {resp['route']} replica.py", - f"{resp2['request_id']} {resp2['route']} replica.py", + f"{resp['request_id']} {resp['route']} {resp['app_name']} replica.py", + f"{resp2['request_id']} {resp2['route']} {resp2['app_name']} replica.py", ] # Check User log user_log_regexes = [ - f".*{resp['request_id']} {resp['route']}.* user func.*", - f".*{resp2['request_id']} {resp2['route']}.* user log " + f".*{resp['request_id']} {resp['route']} {resp['app_name']}.* user func.*", + 
f".*{resp2['request_id']} {resp2['route']} {resp2['app_name']}.* user log " "message from class method.*", ] @@ -213,8 +240,93 @@ def check_log(): for regex in user_log_regexes: assert re.findall(regex, logs_content) != [] + # Check stream log check_log() + # Check user log file + if json_log_format: + user_method_log_regexes = [ + f'.*"deployment": "app1_fn", ' + f'"replica": "{resp["replica"]}", ' + f'"request_id": "{resp["request_id"]}", ' + f'"route": "{resp["route"]}", ' + f'"application": "{resp["app_name"]}", "message":.* user func.*', + ] + user_class_method_log_regexes = [ + f'.*"deployment": "app2_Model", ' + f'"replica": "{resp2["replica"]}", ' + f'"request_id": "{resp2["request_id"]}", ' + f'"route": "{resp2["route"]}", ' + f'"application": "{resp2["app_name"]}", "message":.* user log ' + "message from class method.*", + ] + else: + user_method_log_regexes = [ + f".*{resp['request_id']} {resp['route']} {resp['app_name']}.* " + f"user func.*", + ] + user_class_method_log_regexes = [ + f".*{resp2['request_id']} {resp2['route']} {resp2['app_name']}.* " + f"user log message from class method.*", + ] + + def check_log_file(log_file: str, expected_regex: list): + with open(log_file, "r") as f: + s = f.read() + for regex in expected_regex: + assert re.findall(regex, s) != [] + + check_log_file(resp["log_file"], user_method_log_regexes) + check_log_file(resp2["log_file"], user_class_method_log_regexes) + + +@pytest.mark.parametrize("is_deployment_type_component", [False, True]) +def test_json_log_formatter(is_deployment_type_component): + """Test the json log formatter""" + + if is_deployment_type_component: + component_type = ServeComponentType.DEPLOYMENT + formatter = ServeJSONFormatter("component", "component_id", component_type) + else: + formatter = ServeJSONFormatter("component", "component_id") + init_kwargs = { + "name": "test_log", + "level": logging.DEBUG, + "pathname": "my_path", + "lineno": 1, + "msg": "my_message", + "args": (), + "exc_info": None, 
+ } + record = logging.LogRecord(**init_kwargs) + + def format_and_verify_json_output(record, expected_record: dict): + formatted_record = formatter.format(record) + formatted_record_dict = json.loads(formatted_record) + for key in expected_record: + assert key in formatted_record_dict + assert formatted_record_dict[key] == expected_record[key] + + expected_json = {} + if is_deployment_type_component: + expected_json["deployment"] = "component" + expected_json["replica"] = "component_id" + + # Set request id + record.request_id = "request_id" + expected_json["request_id"] = "request_id" + format_and_verify_json_output(record, expected_json) + + # Set route + record.route = "route" + expected_json["route"] = "route" + format_and_verify_json_output(record, expected_json) + + # set application + record.application = "application" + expected_json["application"] = "application" + format_and_verify_json_output(record, expected_json) + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_long_poll.py b/python/ray/serve/tests/test_long_poll.py index 3cbe56f3ff0c..c5286808c666 100644 --- a/python/ray/serve/tests/test_long_poll.py +++ b/python/ray/serve/tests/test_long_poll.py @@ -182,8 +182,8 @@ def test_listen_for_change_java(serve_instance): assert poll_result_1.updated_objects["key_1"].object_snapshot.decode() == "999" request_2 = {"keys_to_snapshot_ids": {"ROUTE_TABLE": -1}} endpoints: Dict[EndpointTag, EndpointInfo] = dict() - endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll") - endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1") + endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll", app_name="") + endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1", app_name="") ray.get(host.notify_changed.remote(LongPollNamespace.ROUTE_TABLE, endpoints)) object_ref_2 = host.listen_for_change_java.remote( 
LongPollRequest(**request_2).SerializeToString() diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py index 63867945956d..ff199aa4ed32 100644 --- a/python/ray/serve/tests/test_metrics.py +++ b/python/ray/serve/tests/test_metrics.py @@ -8,10 +8,12 @@ from ray import serve from ray._private.test_utils import wait_for_condition from ray.serve._private.utils import block_until_http_ready -import ray.experimental.state.api as state_api +import ray.util.state as state_api from fastapi import FastAPI from ray.serve.metrics import Counter, Histogram, Gauge from ray.serve._private.constants import DEFAULT_LATENCY_BUCKET_MS +from ray.serve.drivers import DAGDriver +from ray.serve.http_adapters import json_request @pytest.fixture @@ -87,7 +89,6 @@ def verify_metrics(do_assert=False): def test_http_metrics(serve_start_shutdown): - # NOTE: These metrics should be documented at # https://docs.ray.io/en/latest/serve/monitoring.html#metrics # Any updates here should be reflected there too. 
@@ -134,7 +135,7 @@ async def __call__(self): # Trigger RayActorError os._exit(0) - serve.run(A.bind()) + serve.run(A.bind(), name="app") requests.get("http://127.0.0.1:8000/A/") requests.get("http://127.0.0.1:8000/A/") try: @@ -162,8 +163,8 @@ def verify_error_count(do_assert=False): elif "serve_num_deployment_http_error_requests" in metrics: # deployment A should have error count 2 if do_assert: - assert 'deployment="A"' in metrics and "2.0" in metrics - if 'deployment="A"' not in metrics or "2.0" not in metrics: + assert 'deployment="app_A"' in metrics and "2.0" in metrics + if 'deployment="app_A"' not in metrics or "2.0" not in metrics: return False return True @@ -181,7 +182,7 @@ def test_http_metrics_fields(serve_start_shutdown): def f(*args): return 1 / 0 - serve.run(f.bind()) + serve.run(f.bind(), name="app") # Should generate 404 responses broken_url = "http://127.0.0.1:8000/fake_route" @@ -193,6 +194,8 @@ def f(*args): assert len(num_requests) == 1 assert num_requests[0]["route"] == "/fake_route" assert num_requests[0]["method"] == "GET" + assert num_requests[0]["application"] == "" + assert num_requests[0]["status_code"] == "404" print("serve_num_http_requests working as expected.") num_errors = get_metric_dictionaries("serve_num_http_error_requests") @@ -212,25 +215,199 @@ def f(*args): "serve_num_deployment_http_error_requests" ) assert len(num_deployment_errors) == 1 - assert num_deployment_errors[0]["deployment"] == "f" + assert num_deployment_errors[0]["deployment"] == "app_f" assert num_deployment_errors[0]["error_code"] == "500" assert num_deployment_errors[0]["method"] == "GET" + assert num_deployment_errors[0]["application"] == "app" print("serve_num_deployment_http_error_requests working as expected.") + latency_metrics = get_metric_dictionaries("serve_http_request_latency_ms_sum") + assert len(latency_metrics) == 1 + assert latency_metrics[0]["route"] == "/real_route" + assert latency_metrics[0]["application"] == "app" + assert 
latency_metrics[0]["status_code"] == "500" + print("serve_http_request_latency_ms working as expected.") + + +def test_http_redirect_metrics(serve_start_shutdown): + """Tests the http redirect metrics' behavior.""" + + def verify_metrics_with_route(metrics, expected_metrics): + assert len(metrics) == len(expected_metrics) + for metric_dict in metrics: + match_metric = None + for expected_metric in expected_metrics: + if expected_metric["route"] == metric_dict["route"]: + match_metric = expected_metric + break + assert match_metric is not None + for key in match_metric: + assert match_metric[key] == metric_dict[key] + + @serve.deployment + class Model: + def __call__(self, *args): + return "123" + + serve.run( + DAGDriver.bind(Model.bind(), http_adapter=json_request), route_prefix="/bar" + ) + resp = requests.get("http://localhost:8000/bar", json=["123"]) + assert resp.status_code == 200 + assert resp.text == '"123"' + + wait_for_condition( + lambda: len(get_metric_dictionaries("serve_num_http_requests")) == 2, + timeout=20, + ) + num_http_requests = get_metric_dictionaries("serve_num_http_requests") + expected_output = [ + { + "route": "/bar/", + "application": "default", + "method": "GET", + "status_code": "200", + }, + { + "route": "/bar", + "application": "default", + "method": "GET", + "status_code": "307", + }, + ] + verify_metrics_with_route(num_http_requests, expected_output) + + wait_for_condition( + lambda: len(get_metric_dictionaries("serve_http_request_latency_ms_sum")) == 2, + timeout=20, + ) + http_latency = get_metric_dictionaries("serve_num_http_requests") + expected_output = [ + {"route": "/bar/", "application": "default", "status_code": "200"}, + {"route": "/bar", "application": "default", "status_code": "307"}, + ] + verify_metrics_with_route(http_latency, expected_output) + + +def test_replica_metrics_fields(serve_start_shutdown): + """Test replica metrics fields""" + + @serve.deployment + def f(): + return "hello" + + @serve.deployment + def 
g(): + return "world" + + serve.run(f.bind(), name="app1", route_prefix="/f") + serve.run(g.bind(), name="app2", route_prefix="/g") + url_f = "http://127.0.0.1:8000/f" + url_g = "http://127.0.0.1:8000/g" + + assert "hello" == requests.get(url_f).text + assert "world" == requests.get(url_g).text + + def verify_metrics(metric, expected_output): + for key in expected_output: + assert metric[key] == expected_output[key] + + wait_for_condition( + lambda: len(get_metric_dictionaries("serve_deployment_request_counter")) == 2, + timeout=20, + ) + + num_requests = get_metric_dictionaries("serve_deployment_request_counter") + assert len(num_requests) == 2 + expected_output = {"route": "/f", "deployment": "app1_f", "application": "app1"} + verify_metrics(num_requests[0], expected_output) + + start_metrics = get_metric_dictionaries("serve_deployment_replica_starts") + assert len(start_metrics) == 2 + expected_output = {"deployment": "app1_f", "application": "app1"} + verify_metrics(start_metrics[0], expected_output) + expected_output = {"deployment": "app2_g", "application": "app2"} + verify_metrics(start_metrics[1], expected_output) + + # Latency metrics + wait_for_condition( + lambda: len( + get_metric_dictionaries("serve_deployment_processing_latency_ms_count") + ) + == 2, + timeout=20, + ) + for metric_name in [ + "serve_deployment_processing_latency_ms_count", + "serve_deployment_processing_latency_ms_sum", + ]: + latency_metrics = get_metric_dictionaries(metric_name) + print(f"checking metric {metric_name}, {latency_metrics}") + assert len(latency_metrics) == 2 + expected_output1 = {"deployment": "app1_f", "application": "app1"} + expected_output2 = {"deployment": "app2_g", "application": "app2"} + verify_metrics(latency_metrics[0], expected_output1) + verify_metrics(latency_metrics[1], expected_output2) + + processing_queries = get_metric_dictionaries("serve_replica_processing_queries") + assert len(processing_queries) == 2 + expected_output1 = {"deployment": "app1_f", 
"application": "app1"} + expected_output2 = {"deployment": "app2_g", "application": "app2"} + verify_metrics(processing_queries[0], expected_output1) + verify_metrics(processing_queries[1], expected_output2) + + @serve.deployment + def h(): + return 1 / 0 + + serve.run(h.bind(), name="app3", route_prefix="/h") + assert 500 == requests.get("http://127.0.0.1:8000/h").status_code + wait_for_condition( + lambda: len(get_metric_dictionaries("serve_deployment_error_counter")) == 1, + timeout=20, + ) + err_requests = get_metric_dictionaries("serve_deployment_error_counter") + assert len(err_requests) == 1 + expected_output = {"route": "/h", "deployment": "app3_h", "application": "app3"} + verify_metrics(err_requests[0], expected_output) + + health_metrics = get_metric_dictionaries("serve_deployment_replica_healthy") + assert len(health_metrics) == 3 + expected_outputs = [ + {"deployment": "app1_f", "application": "app1"}, + {"deployment": "app2_g", "application": "app2"}, + {"deployment": "app3_h", "application": "app3"}, + ] + for i in range(len(health_metrics)): + verify_metrics(health_metrics[i], expected_outputs[i]) + class TestRequestContextMetrics: def _generate_metrics_summary(self, metrics): - """Generate "route" information from metrics. + """Generate "route", "application" information from metrics. Args: metrics: list of metrics, each item is a dictionary generated from get_metric_dictionaries func. - Return: return a dictionary, key is deployment name, value is a set - including all routes. + Return: return a Tuple[dictionary, dictionary] + First dictionary: key is deployment name, value is a set + including all routes. string is to indicate the applicationn name. + Second dictionary: key is the deployment name, value is application name. 
""" - metrics_summary = DefaultDict(set) + metrics_summary_route = DefaultDict(set) + metrics_summary_app = DefaultDict(str) + for request_metrcis in metrics: - metrics_summary[request_metrcis["deployment"]].add(request_metrcis["route"]) - return metrics_summary + metrics_summary_route[request_metrcis["deployment"]].add( + request_metrcis["route"] + ) + metrics_summary_app[request_metrcis["deployment"]] = request_metrcis[ + "application" + ] + return metrics_summary_route, metrics_summary_app + + def verify_metrics(self, metric, expected_output): + for key in expected_output: + assert metric[key] == expected_output[key] def test_request_context_pass_for_http_proxy(self, serve_start_shutdown): """Test HTTP proxy passing request context""" @@ -269,56 +446,46 @@ def h(): ) # Check replica qps & latency - qps_metrics = self._generate_metrics_summary( + qps_metrics_route, qps_metrics_app_name = self._generate_metrics_summary( get_metric_dictionaries("serve_deployment_request_counter") ) - print(qps_metrics) - assert qps_metrics["app1_f"] == {"/app1"} - assert qps_metrics["app2_g"] == {"/app2"} - qps_metrics = self._generate_metrics_summary( + print(qps_metrics_route) + assert qps_metrics_route["app1_f"] == {"/app1"} + assert qps_metrics_route["app2_g"] == {"/app2"} + assert qps_metrics_app_name["app1_f"] == "app1" + assert qps_metrics_app_name["app2_g"] == "app2" + qps_metrics_route, qps_metrics_app_name = self._generate_metrics_summary( get_metric_dictionaries("serve_deployment_error_counter") ) - assert qps_metrics["app3_h"] == {"/app3"} - - latency_metrics = self._generate_metrics_summary( - get_metric_dictionaries("serve_deployment_processing_latency_ms_sum") - ) - assert len(latency_metrics) == 3 - assert latency_metrics["app1_f"] == {"/app1"} - assert latency_metrics["app2_g"] == {"/app2"} - assert latency_metrics["app3_h"] == {"/app3"} + assert qps_metrics_route["app3_h"] == {"/app3"} + assert qps_metrics_app_name["app3_h"] == "app3" # Check http proxy qps & 
latency - qps_metrics = get_metric_dictionaries("serve_num_http_requests") - len(qps_metrics) == 3 - assert {metric["route"] for metric in qps_metrics} == { - "/app1", - "/app2", - "/app3", - } - - latency_metrics = get_metric_dictionaries("serve_http_request_latency_ms_sum") - assert {metric["route"] for metric in latency_metrics} == { - "/app1", - "/app2", - "/app3", - } - - # Check handle qps - qps_metrics = self._generate_metrics_summary( - get_metric_dictionaries("serve_handle_request_counter") - ) - assert qps_metrics["app1_f"] == {"/app1"} - assert qps_metrics["app2_g"] == {"/app2"} - assert qps_metrics["app3_h"] == {"/app3"} + for metric_name in [ + "serve_num_http_requests", + "serve_http_request_latency_ms_sum", + ]: + metrics = get_metric_dictionaries(metric_name) + assert {metric["route"] for metric in metrics} == { + "/app1", + "/app2", + "/app3", + } - # Check router qps - qps_metrics = self._generate_metrics_summary( - get_metric_dictionaries("serve_num_router_requests") - ) - assert qps_metrics["app1_f"] == {"/app1"} - assert qps_metrics["app2_g"] == {"/app2"} - assert qps_metrics["app3_h"] == {"/app3"} + for metric_name in [ + "serve_handle_request_counter", + "serve_num_router_requests", + "serve_deployment_processing_latency_ms_sum", + ]: + metrics_route, metrics_app_name = self._generate_metrics_summary( + get_metric_dictionaries(metric_name) + ) + assert metrics_route["app1_f"] == {"/app1"} + assert metrics_route["app2_g"] == {"/app2"} + assert metrics_route["app3_h"] == {"/app3"} + assert metrics_app_name["app1_f"] == "app1" + assert metrics_app_name["app2_g"] == "app2" + assert metrics_app_name["app3_h"] == "app3" def test_request_context_pass_for_handle_passing(self, serve_start_shutdown): """Test handle passing contexts between replicas""" @@ -348,7 +515,7 @@ async def app1(self): async def app2(self): return await (await self.handle2.remote()) - serve.run(G.bind(g1.bind(), g2.bind())) + serve.run(G.bind(g1.bind(), g2.bind()), name="app") 
resp = requests.get("http://127.0.0.1:8000/api") assert resp.text == '"ok1"' resp = requests.get("http://127.0.0.1:8000/api2") @@ -365,12 +532,18 @@ async def app2(self): == 4, timeout=20, ) - requests_metrics = self._generate_metrics_summary( + ( + requests_metrics_route, + requests_metrics_app_name, + ) = self._generate_metrics_summary( get_metric_dictionaries("serve_deployment_request_counter") ) - assert requests_metrics["G"] == {"/api", "/api2"} - assert requests_metrics["g1"] == {"/api"} - assert requests_metrics["g2"] == {"/api2"} + assert requests_metrics_route["app_G"] == {"/api", "/api2"} + assert requests_metrics_route["app_g1"] == {"/api"} + assert requests_metrics_route["app_g2"] == {"/api2"} + assert requests_metrics_app_name["app_G"] == "app" + assert requests_metrics_app_name["app_g1"] == "app" + assert requests_metrics_app_name["app_g2"] == "app" def test_customer_metrics_with_context(self, serve_start_shutdown): @serve.deployment @@ -382,6 +555,7 @@ def __init__(self): tag_keys=( "my_static_tag", "my_runtime_tag", + "route", ), ) self.counter.set_default_tags({"my_static_tag": "static_value"}) @@ -392,6 +566,7 @@ def __init__(self): tag_keys=( "my_static_tag", "my_runtime_tag", + "route", ), ) self.histogram.set_default_tags({"my_static_tag": "static_value"}) @@ -401,6 +576,7 @@ def __init__(self): tag_keys=( "my_static_tag", "my_runtime_tag", + "route", ), ) self.gauge.set_default_tags({"my_static_tag": "static_value"}) @@ -421,26 +597,42 @@ def __call__(self): lambda: len(get_metric_dictionaries("my_gauge")) == 1, timeout=20, ) + counter_metrics = get_metric_dictionaries("my_counter") assert len(counter_metrics) == 1 - counter_metrics[0]["my_static_tag"] == "static_value" - counter_metrics[0]["my_runtime_tag"] == "100" - counter_metrics[0]["replica"] == replica_tag - counter_metrics[0]["deployment"] == deployment_name - + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "100", + "replica": replica_tag, + "deployment": 
deployment_name, + "application": "app", + "route": "/app", + } + self.verify_metrics(counter_metrics[0], expected_metrics) + + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "300", + "replica": replica_tag, + "deployment": deployment_name, + "application": "app", + "route": "/app", + } gauge_metrics = get_metric_dictionaries("my_gauge") assert len(counter_metrics) == 1 - gauge_metrics[0]["my_static_tag"] == "static_value" - gauge_metrics[0]["my_runtime_tag"] == "300" - gauge_metrics[0]["replica"] == replica_tag - gauge_metrics[0]["deployment"] == deployment_name - + self.verify_metrics(gauge_metrics[0], expected_metrics) + + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "200", + "replica": replica_tag, + "deployment": deployment_name, + "application": "app", + "route": "/app", + } histogram_metrics = get_metric_dictionaries("my_histogram_sum") assert len(histogram_metrics) == 1 - histogram_metrics[0]["my_static_tag"] == "static_value" - histogram_metrics[0]["my_runtime_tag"] == "200" - histogram_metrics[0]["replica"] == replica_tag - histogram_metrics[0]["deployment"] == deployment_name + self.verify_metrics(histogram_metrics[0], expected_metrics) @pytest.mark.parametrize("use_actor", [False, True]) def test_serve_metrics_outside_serve(self, use_actor, serve_start_shutdown): @@ -541,20 +733,73 @@ async def __call__(self): lambda: len(get_metric_dictionaries("my_gauge")) == 1, timeout=20, ) + counter_metrics = get_metric_dictionaries("my_counter") assert len(counter_metrics) == 1 - counter_metrics[0]["my_static_tag"] == "static_value" - counter_metrics[0]["my_runtime_tag"] == "100" + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "100", + } + self.verify_metrics(counter_metrics[0], expected_metrics) gauge_metrics = get_metric_dictionaries("my_gauge") assert len(counter_metrics) == 1 - gauge_metrics[0]["my_static_tag"] == "static_value" - gauge_metrics[0]["my_runtime_tag"] == 
"300" + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "300", + } + self.verify_metrics(gauge_metrics[0], expected_metrics) histogram_metrics = get_metric_dictionaries("my_histogram_sum") assert len(histogram_metrics) == 1 - histogram_metrics[0]["my_static_tag"] == "static_value" - histogram_metrics[0]["my_runtime_tag"] == "200" + expected_metrics = { + "my_static_tag": "static_value", + "my_runtime_tag": "200", + } + self.verify_metrics(histogram_metrics[0], expected_metrics) + + +def test_multiplexed_metrics(serve_start_shutdown): + """Tests multiplexed API corresponding metrics.""" + + @serve.deployment + class Model: + @serve.multiplexed(max_num_models_per_replica=2) + async def get_model(self, model_id: str): + return model_id + + async def __call__(self, model_id: str): + await self.get_model(model_id) + return + + handle = serve.run(Model.bind(), name="app", route_prefix="/app") + handle.remote("model1") + handle.remote("model2") + # Trigger model eviction. 
+ handle.remote("model3") + expected_metrics = [ + "serve_multiplexed_model_load_latency_s", + "serve_multiplexed_model_unload_latency_s", + "serve_num_multiplexed_models", + "serve_multiplexed_models_load_counter", + "serve_multiplexed_models_unload_counter", + ] + + def verify_metrics(): + try: + resp = requests.get("http://127.0.0.1:9999").text + # Requests will fail if we are crashing the controller + except requests.ConnectionError: + return False + for metric in expected_metrics: + assert metric in resp + return True + + wait_for_condition( + verify_metrics, + timeout=20, + retry_interval_ms=1000, + ) def test_actor_summary(serve_instance): @@ -562,11 +807,11 @@ def test_actor_summary(serve_instance): def f(): pass - serve.run(f.bind()) + serve.run(f.bind(), name="app") actors = state_api.list_actors(filters=[("state", "=", "ALIVE")]) class_names = {actor["class_name"] for actor in actors} assert class_names.issuperset( - {"ServeController", "HTTPProxyActor", "ServeReplica:f"} + {"ServeController", "HTTPProxyActor", "ServeReplica:app_f"} ) diff --git a/python/ray/serve/tests/test_multiplex.py b/python/ray/serve/tests/test_multiplex.py new file mode 100644 index 000000000000..405f7e448c40 --- /dev/null +++ b/python/ray/serve/tests/test_multiplex.py @@ -0,0 +1,266 @@ +import pytest +from typing import List + +import ray +from ray import serve +from ray.serve.multiplex import _ModelMultiplexWrapper +from ray.serve.context import get_internal_replica_context +from ray._private.test_utils import async_wait_for_condition, wait_for_condition +from ray.serve._private.common import RunningReplicaInfo + + +@pytest.fixture() +def start_serve_with_context(): + serve.start() + ray.serve.context._set_internal_replica_context( + "fake_deployment", "fake_replica_tag", None, None, None + ) + yield + serve.shutdown() + + +class TestMultiplexWrapper: + def test_failed_to_get_replica_context(self): + async def model_load_func(model_id: str): + return model_id + + with 
pytest.raises( + RuntimeError, match="Fail to retrieve serve replica context" + ): + _ModelMultiplexWrapper(model_load_func, None, max_num_models_per_replica=2) + + @pytest.mark.asyncio + async def test_multiplex_wrapper(self, start_serve_with_context): + """Test multiplex wrapper with LRU caching.""" + + async def model_load_func(model_id: str): + return model_id + + multiplexer = _ModelMultiplexWrapper( + model_load_func, None, max_num_models_per_replica=2 + ) + + # Check the replica info pushed + def check_info_pushed(): + return multiplexer._push_multiplexed_replica_info is False + + # Load model1 + await multiplexer.load_model("1") + assert multiplexer.models == {"1": "1"} + assert multiplexer._push_multiplexed_replica_info + await async_wait_for_condition(check_info_pushed) + + # Load model2 + await multiplexer.load_model("2") + assert multiplexer.models == {"1": "1", "2": "2"} + assert multiplexer._push_multiplexed_replica_info + await async_wait_for_condition(check_info_pushed) + + # Load model3, model1 should be unloaded + await multiplexer.load_model("3") + assert multiplexer.models == {"2": "2", "3": "3"} + assert multiplexer._push_multiplexed_replica_info + await async_wait_for_condition(check_info_pushed) + + # reload model2, model2 should be moved to the end of the LRU cache + # _push_multiplexed_replica_info should be False. 
+ await multiplexer.load_model("2") + assert multiplexer.models == {"3": "3", "2": "2"} + assert multiplexer._push_multiplexed_replica_info is False + + # Load model4, model3 should be unloaded + await multiplexer.load_model("4") + assert multiplexer._push_multiplexed_replica_info + assert multiplexer.models == {"2": "2", "4": "4"} + + @pytest.mark.asyncio + async def test_bad_call_multiplexed_func(self, start_serve_with_context): + """Test bad call to multiplexed function""" + + async def model_load_func(model_id: str): + return model_id + + multiplexer = _ModelMultiplexWrapper( + model_load_func, None, max_num_models_per_replica=2 + ) + with pytest.raises(TypeError): + await multiplexer.load_model(1) + with pytest.raises(TypeError): + await multiplexer.load_model() + + @pytest.mark.asyncio + async def test_unload_model_call_del(self, start_serve_with_context): + class MyModel: + def __init__(self, model_id): + self.model_id = model_id + + def __del__(self): + raise Exception(f"{self.model_id} is dead") + + def __eq__(self, model): + return model.model_id == self.model_id + + async def model_load_func(model_id: str) -> MyModel: + return MyModel(model_id) + + multiplexer = _ModelMultiplexWrapper( + model_load_func, None, max_num_models_per_replica=1 + ) + await multiplexer.load_model("1") + assert multiplexer.models == {"1": MyModel("1")} + with pytest.raises(Exception, match="1 is dead"): + await multiplexer.load_model("2") + + +class TestBasicAPI: + def test_decorator_validation(self): + @serve.multiplexed + async def get_model(model: str): + return + + @serve.multiplexed(max_num_models_per_replica=1) + async def get_model2(model: str): + return + + @serve.deployment + class MyModel: + @serve.multiplexed + async def get_model(model: str): + return + + @serve.deployment + class MyModel2: + @serve.multiplexed(max_num_models_per_replica=1) + async def get_model(self, model: str): + return + + # multiplex can only be used with func or method. 
+ with pytest.raises(TypeError): + + @serve.deployment + @serve.multiplexed + class BadDecorator: + pass + + # max_num_models_per_replica must be an integer + with pytest.raises(TypeError): + + @serve.multiplexed(max_num_models_per_replica="1") + async def get_model3(model: str): + pass + + # max_num_models_per_replica must be positive + with pytest.raises(ValueError): + + @serve.multiplexed(max_num_models_per_replica=0) + async def get_model4(model: str): + pass + + # multiplexed function must be async def + with pytest.raises(TypeError): + + @serve.multiplexed + def get_model5(model: str): + pass + + with pytest.raises(TypeError): + + @serve.deployment + class MyModel3: + @serve.multiplexed + def get_model(self, model: str): + return + + # no model_id argument in multiplexed function + with pytest.raises(TypeError): + + @serve.multiplexed + def get_model6(): + pass + + with pytest.raises(TypeError): + + @serve.deployment + class MyModel4: + @serve.multiplexed + def get_model(self): + return + + def test_get_multiplexed_model_id(self): + """Test get_multiplexed_model_id() API""" + assert serve.get_multiplexed_model_id() == "" + ray.serve.context._serve_request_context.set( + ray.serve.context.RequestContext(multiplexed_model_id="1") + ) + assert serve.get_multiplexed_model_id() == "1" + + +def test_multiplexed_replica_info(): + """Test MultiplexedReplicaInfo is passed to the controller & router""" + + @serve.deployment + class MyModel: + @serve.multiplexed(max_num_models_per_replica=2) + async def get_model(self, model_id: str): + return + + async def __call__(self, model_id: str): + _ = await self.get_model(model_id) + context = get_internal_replica_context() + return (context.deployment, context.replica_tag) + + handle = serve.run(MyModel.bind()) + deployment, replica_tag = ray.get(handle.remote("model1")) + + def check_replica_information( + replicas: List[RunningReplicaInfo], + deployment: str, + replica_tag: str, + model_ids: List[str], + ): + for replica in 
replicas: + assert replica.deployment_name == deployment + assert replica.replica_tag == replica_tag + assert list(replica.multiplexed_model_ids) == model_ids + return True + + wait_for_condition( + check_replica_information, + replicas=handle.router._replica_set.in_flight_queries.keys(), + deployment=deployment, + replica_tag=replica_tag, + model_ids=[ + "model1", + ], + ) + + ray.get(handle.remote("model2")) + wait_for_condition( + check_replica_information, + replicas=handle.router._replica_set.in_flight_queries.keys(), + deployment=deployment, + replica_tag=replica_tag, + model_ids=[ + "model1", + "model2", + ], + ) + + # LRU remove the model1 + ray.get(handle.remote("model3")) + wait_for_condition( + check_replica_information, + replicas=handle.router._replica_set.in_flight_queries.keys(), + deployment=deployment, + replica_tag=replica_tag, + model_ids=[ + "model2", + "model3", + ], + ) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_persistence.py b/python/ray/serve/tests/test_persistence.py index 3ba7d592a58d..c0167fc02067 100644 --- a/python/ray/serve/tests/test_persistence.py +++ b/python/ray/serve/tests/test_persistence.py @@ -14,13 +14,13 @@ def test_new_driver(serve_instance): def driver(): return "OK!" -serve.run(driver.bind()) +serve.run(driver.bind(), name="app") """.format( ray._private.worker._global_node.address ) run_string_as_driver(script) - handle = serve.get_deployment("driver").get_handle() + handle = serve.get_deployment("app_driver").get_handle() assert ray.get(handle.remote()) == "OK!" 
diff --git a/python/ray/serve/tests/test_regression.py b/python/ray/serve/tests/test_regression.py index 23be1e9d1988..9f636fd58340 100644 --- a/python/ray/serve/tests/test_regression.py +++ b/python/ray/serve/tests/test_regression.py @@ -153,13 +153,13 @@ def test_handle_cache_out_of_scope(serve_instance): def f(): return "hi" - handle = serve.run(f.bind()) + handle = serve.run(f.bind(), name="app") handle_cache = get_global_client().handle_cache assert len(handle_cache) == initial_num_cached + 1 def sender_where_handle_goes_out_of_scope(): - f = serve.get_deployment("f").get_handle() + f = get_global_client().get_handle("app_f", missing_ok=True, sync=True) assert f is handle assert ray.get(f.remote()) == "hi" diff --git a/python/ray/serve/tests/test_runtime_env.py b/python/ray/serve/tests/test_runtime_env.py index 14c8c5e3552c..e6fe295fae4a 100644 --- a/python/ray/serve/tests/test_runtime_env.py +++ b/python/ray/serve/tests/test_runtime_env.py @@ -73,7 +73,7 @@ class Test: def __call__(self, *args): return open("hello").read() -handle = serve.run(Test.bind()) +handle = serve.run(Test.bind(), name="app") assert ray.get(handle.remote()) == "world" """ @@ -87,8 +87,8 @@ def __call__(self, *args): ray.init(address="auto", namespace="serve", job_config=job_config) -Test = serve.get_deployment("Test") -handle = serve.run(Test.bind()) +Test = serve.get_deployment("app_Test") +handle = serve.run(Test.bind(), name="app") assert ray.get(handle.remote()) == "world" Test.delete() """ diff --git a/python/ray/serve/tests/test_schema.py b/python/ray/serve/tests/test_schema.py index cfed7f823973..5c69af93080a 100644 --- a/python/ray/serve/tests/test_schema.py +++ b/python/ray/serve/tests/test_schema.py @@ -659,6 +659,18 @@ def test_deploy_empty_name(self): # Error message should be descriptive, mention name must be nonempty assert "name" in str(e.value) and "empty" in str(e.value) + def test_deploy_no_applications(self): + """Applications must be specified.""" + + 
deploy_config_dict = { + "http_options": { + "host": "127.0.0.1", + "port": 8000, + }, + } + with pytest.raises(ValidationError): + ServeDeploySchema.parse_obj(deploy_config_dict) + class TestServeStatusSchema: def get_valid_serve_status_schema(self): diff --git a/python/ray/serve/tests/test_standalone.py b/python/ray/serve/tests/test_standalone.py index 41004e4abda0..65ed21cd61db 100644 --- a/python/ray/serve/tests/test_standalone.py +++ b/python/ray/serve/tests/test_standalone.py @@ -40,7 +40,7 @@ ) from ray.serve.schema import ServeApplicationSchema -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors # Explicitly importing it here because it is a ray core tests utility ( # not in the tree) @@ -186,12 +186,12 @@ def test_single_app_shutdown_actors(ray_shutdown): def f(): pass - serve.run(f.bind()) + serve.run(f.bind(), name="app") actor_names = { "ServeController", "HTTPProxyActor", - "ServeReplica:f", + "ServeReplica:app_f", } def check_alive(): @@ -710,7 +710,7 @@ def check(): return False serve.start(detached=True) - serve.run(hello.bind()) + serve.run(hello.bind(), name="app") check() webui_url = ray_start_with_dashboard["webui_url"] @@ -731,7 +731,7 @@ def verify_snapshot(): snapshot = get_deployment_snapshot() assert len(snapshot) == 1 hello_deployment = list(snapshot.values())[0] - assert hello_deployment["name"] == "hello" + assert hello_deployment["name"] == "app_hello" assert hello_deployment["status"] == "RUNNING" diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index b4b483d53ad3..0c1b9b91b753 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -5,6 +5,7 @@ from contextlib import contextmanager from typing import Dict, Set from concurrent.futures.thread import ThreadPoolExecutor +from functools import partial import pytest import requests @@ -12,7 +13,7 @@ import ray import ray.actor import 
ray._private.state -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray import serve from ray._private.test_utils import ( @@ -22,8 +23,12 @@ from ray.exceptions import RayActorError from ray.serve.exceptions import RayServeException from ray.serve._private.client import ServeControllerClient -from ray.serve._private.common import ApplicationStatus, DeploymentStatus -from ray.serve._private.constants import SERVE_NAMESPACE +from ray.serve._private.common import ApplicationStatus, DeploymentStatus, ReplicaState +from ray.serve._private.constants import ( + SERVE_NAMESPACE, + SERVE_DEFAULT_APP_NAME, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, +) from ray.serve.context import get_global_client from ray.serve.schema import ( ServeApplicationSchema, @@ -76,13 +81,14 @@ def ray_instance(request): @contextmanager def start_and_shutdown_ray_cli(): - subprocess.check_output( - ["ray", "start", "--head"], - ) + subprocess.check_output(["ray", "stop", "--force"]) + wait_for_condition(_check_ray_stop, timeout=15) + subprocess.check_output(["ray", "start", "--head"]) + yield - subprocess.check_output( - ["ray", "stop", "--force"], - ) + + subprocess.check_output(["ray", "stop", "--force"]) + wait_for_condition(_check_ray_stop, timeout=15) @pytest.fixture(scope="function") @@ -133,9 +139,9 @@ def hello(*args, **kwargs): return "world" ray.init(num_gpus=3, namespace="serve") - serve.run(hello.bind()) + handle = serve.run(hello.bind()) - assert ray.get(hello.get_handle().remote()) == "world" + assert ray.get(handle.remote()) == "world" @pytest.mark.parametrize("detached", [True, False]) @@ -240,7 +246,6 @@ def controller_died(handle): def test_get_serve_status(shutdown_ray): - ray.init() @serve.deployment @@ -252,7 +257,10 @@ def f(*args): client = get_global_client() status_info_1 = client.get_serve_status() assert status_info_1.app_status.status == "RUNNING" - assert status_info_1.deployment_statuses[0].name == "f" + assert ( + 
status_info_1.deployment_statuses[0].name + == f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + ) assert status_info_1.deployment_statuses[0].status in {"UPDATING", "HEALTHY"} serve.shutdown() @@ -322,7 +330,6 @@ def generate_pid_based_deserializer(pid, raw_deserializer): """Cannot be deserialized by the process with specified pid.""" def deserializer(*args): - import os if os.getpid() == pid: @@ -366,14 +373,14 @@ def test_controller_recover_and_delete(shutdown_ray): def f(): pass - f.deploy() + serve.run(f.bind()) actors = list_actors( address=ray_context.address_info["address"], filters=[("state", "=", "ALIVE")] ) - # Try to delete the deployments and kill the controller right after - client.delete_deployments(["f"], blocking=False) + # Try to delete the application and kill the controller right after + serve.delete(SERVE_DEFAULT_APP_NAME, _blocking=False) ray.kill(client._controller, no_restart=False) # All replicas should be removed already or after the controller revives @@ -400,35 +407,93 @@ def f(): # The deployment should be deleted, meaning its state should not be stored # in the DeploymentStateManager. This can be checked by attempting to # retrieve the deployment's status through the controller. 
- assert client.get_serve_status().get_deployment_status("f") is None + assert ( + client.get_serve_status().get_deployment_status( + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + ) + is None + ) serve.shutdown() ray.shutdown() +def test_serve_stream_logs(start_and_shutdown_ray_cli_function): + """Test that serve logs show up across different drivers.""" + import tempfile + + file1 = """from ray import serve +@serve.deployment +class A: + def __call__(self): + return "Hello A" +serve.run(A.bind())""" + + file2 = """from ray import serve +@serve.deployment +class B: + def __call__(self): + return "Hello B" +serve.run(B.bind())""" + + with tempfile.NamedTemporaryFile() as f1, tempfile.NamedTemporaryFile() as f2: + f1.write(file1.encode("utf-8")) + f1.seek(0) + # Driver 1 (starts Serve controller) + output = subprocess.check_output(["python", f1.name], stderr=subprocess.STDOUT) + assert "Connecting to existing Ray cluster" in output.decode("utf-8") + assert "Adding 1 replica to deployment default_A" in output.decode("utf-8") + + f2.write(file2.encode("utf-8")) + f2.seek(0) + # Driver 2 (reconnects to the same Serve controller) + output = subprocess.check_output(["python", f2.name], stderr=subprocess.STDOUT) + assert "Connecting to existing Ray cluster" in output.decode("utf-8") + assert "Adding 1 replica to deployment default_B" in output.decode("utf-8") + + class TestDeployApp: @pytest.fixture(scope="function") def client(self): - subprocess.check_output(["ray", "stop", "--force"]) - wait_for_condition( - _check_ray_stop, - timeout=15, + with start_and_shutdown_ray_cli(): + wait_for_condition( + lambda: requests.get( + "http://localhost:52365/api/ray/version" + ).status_code + == 200, + timeout=15, + ) + ray.init(address="auto", namespace=SERVE_NAMESPACE) + yield serve.start(detached=True) + serve.shutdown() + ray.shutdown() + + def check_deployment_running(self, client: ServeControllerClient, name: str): + serve_status = client.get_serve_status() 
+ return ( + serve_status.get_deployment_status(name) is not None + and serve_status.app_status.status == ApplicationStatus.RUNNING + and serve_status.get_deployment_status(name).status + == DeploymentStatus.HEALTHY ) - subprocess.check_output(["ray", "start", "--head"]) - wait_for_condition( - lambda: requests.get("http://localhost:52365/api/ray/version").status_code - == 200, - timeout=15, + + def check_deployments_dead(self, deployment_names): + actor_names = [ + actor["class_name"] + for actor in list_actors( + filters=[("state", "=", "ALIVE")], + ) + ] + return all( + f"ServeReplica:{name}" not in actor_names for name in deployment_names ) - ray.init(address="auto", namespace=SERVE_NAMESPACE) - yield serve.start(detached=True) - serve.shutdown() - ray.shutdown() - subprocess.check_output(["ray", "stop", "--force"]) - wait_for_condition( - _check_ray_stop, - timeout=15, + + def get_num_replicas(self, client: ServeControllerClient, deployment_name: str): + replicas = ray.get( + client._controller._dump_replica_states_for_testing.remote(deployment_name) ) + running_replicas = replicas.get([ReplicaState.RUNNING]) + return len(running_replicas) def get_test_config(self) -> Dict: return {"import_path": "ray.serve.tests.test_config_files.pizza.serve_dag"} @@ -509,7 +574,6 @@ def test_deploy_multi_app(self, client: ServeControllerClient): self.check_multi_app() def test_deploy_app_with_overriden_config(self, client: ServeControllerClient): - config = self.get_test_config() config["deployments"] = [ { @@ -1067,9 +1131,11 @@ def test_controller_recover_and_deploy(self, client: ServeControllerClient): deployment_timestamp = client.get_serve_status().app_status.deployment_timestamp # Delete all deployments, but don't update config - client.delete_deployments( - ["Router", "Multiplier", "Adder", "create_order", "DAGDriver"] - ) + deployments = [ + f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{name}" + for name in ["Router", "Multiplier", "Adder", 
"create_order", "DAGDriver"] + ] + client.delete_deployments(deployments) ray.kill(client._controller, no_restart=False) @@ -1094,80 +1160,274 @@ def test_controller_recover_and_deploy(self, client: ServeControllerClient): assert client.get_serve_status().app_status.deployment_timestamp == 0 @pytest.mark.parametrize( - "field_to_update,option_to_update,config_update", - [ - ("import_path", "", False), - ("runtime_env", "", False), - ("deployments", "num_replicas", True), - ("deployments", "autoscaling_config", True), - ("deployments", "user_config", True), - ("deployments", "ray_actor_options", False), - ], + "field_to_update", + ["import_path", "runtime_env", "ray_actor_options"], ) - def test_deploy_config_update( - self, - client: ServeControllerClient, - field_to_update: str, - option_to_update: str, - config_update: bool, + def test_deploy_config_update_heavyweight( + self, client: ServeControllerClient, field_to_update: str ): - """ - Check that replicas stay alive when lightweight config updates are made and - replicas are torn down when code updates are made. 
- """ - - def deployment_running(): - serve_status = client.get_serve_status() - return ( - serve_status.get_deployment_status("f") is not None - and serve_status.app_status.status == ApplicationStatus.RUNNING - and serve_status.get_deployment_status("f").status - == DeploymentStatus.HEALTHY - ) - + """Check that replicas are torn down when code updates are made.""" + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" config_template = { "import_path": "ray.serve.tests.test_config_files.pid.node", "deployments": [ { "name": "f", "autoscaling_config": None, - "user_config": None, + "user_config": {"name": "alice"}, "ray_actor_options": {"num_cpus": 0.1}, }, ], } client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) - wait_for_condition(deployment_running, timeout=15) - pid1 = requests.get("http://localhost:8000/f").text + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + pid1, _ = requests.get("http://localhost:8000/f").json() if field_to_update == "import_path": config_template[ "import_path" - ] = "ray.serve.tests.test_config_files.pid.bnode" + ] = "ray.serve.tests.test_config_files.pid.dup_node" elif field_to_update == "runtime_env": config_template["runtime_env"] = {"env_vars": {"test_var": "test_val"}} - elif field_to_update == "deployments": - updated_options = { - "num_replicas": 2, - "autoscaling_config": {"max_replicas": 2}, - "user_config": {"name": "bob"}, - "ray_actor_options": {"num_cpus": 0.2}, - } - config_template["deployments"][0][option_to_update] = updated_options[ - option_to_update - ] + elif field_to_update == "ray_actor_options": + config_template["deployments"][0]["ray_actor_options"] = {"num_cpus": 0.2} client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) - wait_for_condition(deployment_running, timeout=15) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) # This assumes that Serve implements round-robin 
routing for its replicas. As # long as that doesn't change, this test shouldn't be flaky; however if that # routing ever changes, this test could become mysteriously flaky pids = [] for _ in range(4): - pids.append(requests.get("http://localhost:8000/f").text) - assert (pid1 in pids) == config_update + pids.append(requests.get("http://localhost:8000/f").json()[0]) + assert pid1 not in pids + + def test_update_config_user_config(self, client: ServeControllerClient): + """Check that replicas stay alive when user config is updated.""" + + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + config_template = { + "import_path": "ray.serve.tests.test_config_files.pid.node", + "deployments": [{"name": "f", "user_config": {"name": "alice"}}], + } + + # Deploy first time + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + + # Query + pid1, res = requests.get("http://localhost:8000/f").json() + assert res == "alice" + + # Redeploy with updated option + config_template["deployments"][0]["user_config"] = {"name": "bob"} + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + + # This assumes that Serve implements round-robin routing for its replicas. 
As + # long as that doesn't change, this test shouldn't be flaky; however if that + # routing ever changes, this test could become mysteriously flaky + # Query + pids = [] + for _ in range(4): + pid, res = requests.get("http://localhost:8000/f").json() + assert res == "bob" + pids.append(pid) + assert pid1 in pids + + def test_update_config_graceful_shutdown_timeout( + self, client: ServeControllerClient + ): + """Check that replicas stay alive when graceful_shutdown_timeout_s is updated""" + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + config_template = { + "import_path": "ray.serve.tests.test_config_files.pid.node", + "deployments": [{"name": "f", "graceful_shutdown_timeout_s": 1000}], + } + + # Deploy first time + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + handle = client.get_handle(name) + + # Start off with signal ready, and send query + ray.get(handle.send.remote()) + pid1 = ray.get(handle.remote())[0] + print("PID of replica after first deployment:", pid1) + + # Redeploy with shutdown timeout set to 5 seconds + config_template["deployments"][0]["graceful_shutdown_timeout_s"] = 5 + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + + pid2 = ray.get(handle.remote())[0] + assert pid1 == pid2 + print("PID of replica after redeployment:", pid2) + + # Send blocking query + handle.send.remote(clear=True) + handle.remote() + # Try to delete deployment, should be blocked until the timeout at 5 seconds + client.delete_deployments([name], blocking=False) + # Replica should be dead within 10 second timeout, which means + # graceful_shutdown_timeout_s was successfully updated lightweightly + wait_for_condition(partial(self.check_deployments_dead, ["f"])) + + def test_update_config_max_concurrent_queries(self, 
client: ServeControllerClient): + """Check that replicas stay alive when max_concurrent_queries is updated.""" + + url = "http://localhost:8000/f" + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + config_template = { + "import_path": "ray.serve.tests.test_config_files.pid.async_node", + "deployments": [{"name": "f", "max_concurrent_queries": 1000}], + } + + # Deploy first time + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + handle = client.get_handle(name) + # Block on calls + ray.get(handle.send.remote(clear=True)) + + with ThreadPoolExecutor() as pool: + # Send 10 queries + futs = [pool.submit(partial(requests.get, url)) for _ in range(10)] + wait_for_condition(lambda: 10 == ray.get(handle.get_counter.remote())) + + # Unblock + ray.get(handle.send.remote()) + pids = [fut.result().json()[0] for fut in futs] + pid1 = pids[0] + # Check all returned pids are the same, meaning requests were served by the + # same replica + assert all(pid == pid1 for pid in pids) + + # Redeploy with max concurrent queries set to 2 + config_template["deployments"][0]["max_concurrent_queries"] = 2 + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + + # Re-block + ray.get(handle.send.remote(clear=True)) + + with ThreadPoolExecutor() as pool: + # Send 3 queries + futs = [pool.submit(partial(requests.get, url)) for _ in range(3)] + # Only 2 out of the 3 queries should have been sent to the replica because + # max concurrent queries is 2 + time.sleep(10) + assert ray.get(handle.get_counter.remote()) < 103 + + # Unblock + ray.get(handle.send.remote()) + pids = [fut.result().json()[0] for fut in futs] + pid2 = pids[0] + assert all(pid == pid2 for pid in pids) + + # Check that it's the same replica, it didn't get teared down + assert pid1 == 
pid2 + + def test_update_config_health_check_period(self, client: ServeControllerClient): + """Check that replicas stay alive when max_concurrent_queries is updated.""" + + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + config_template = { + "import_path": "ray.serve.tests.test_config_files.pid.async_node", + "deployments": [{"name": "f", "health_check_period_s": 100}], + } + + # Deploy first time, wait for replica running and deployment healthy + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + handle = client.get_handle(name) + pid1 = ray.get(handle.remote())[0] + # Health check counter shouldn't increase beyond any initial health checks done + # upon replica actor startup + initial_counter = ray.get(handle.get_counter.remote(health_check=True)) + time.sleep(5) + assert initial_counter == ray.get(handle.get_counter.remote(health_check=True)) + + # Redeploy with health check period reduced to 1 second + config_template["deployments"][0]["health_check_period_s"] = 0.1 + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + # health check counter should now very quickly increase + wait_for_condition( + lambda: ray.get(handle.get_counter.remote(health_check=True)) >= 30, + retry_interval_ms=1000, + timeout=5, + ) + + # Check that it's the same replica, it didn't get teared down + pid2 = ray.get(handle.remote())[0] + assert pid1 == pid2 + + def test_update_config_health_check_timeout(self, client: ServeControllerClient): + """Check that replicas stay alive when max_concurrent_queries is updated.""" + + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + # Deploy with a very long initial health_check_timeout_s + # Also set small health_check_period_s to make test run faster + config_template = { + 
"import_path": "ray.serve.tests.test_config_files.pid.async_node", + "deployments": [ + { + "name": "f", + "health_check_period_s": 1, + "health_check_timeout_s": 1000, + } + ], + } + + # Deploy first time, wait for replica running and deployment healthy + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + handle = client.get_handle(name) + pid1 = ray.get(handle.remote())[0] + + # Redeploy with health check timeout reduced to 1 second + config_template["deployments"][0]["health_check_timeout_s"] = 1 + client.deploy_apps(ServeApplicationSchema.parse_obj(config_template)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + # Check that it's the same replica, it didn't get teared down + # (needs to be done before the tests below because the replica will be marked + # unhealthy then stopped and restarted) + pid2 = ray.get(handle.remote())[0] + assert pid1 == pid2 + + # Block in health check + ray.get(handle.send.remote(clear=True, health_check=True)) + wait_for_condition( + lambda: client.get_serve_status().get_deployment_status(name).status + == DeploymentStatus.UNHEALTHY + ) def test_deploy_separate_runtime_envs(self, client: ServeControllerClient): """Deploy two applications with separate runtime envs.""" @@ -1383,7 +1643,9 @@ def check_app_status(): assert info_valid def test_deploy_nonexistent_deployment(self, client: ServeControllerClient): - """Remove an application from a config, it should reach a deleting state.""" + """Apply a config that lists a deployment that doesn't exist in the application. + The error message should be descriptive. 
+ """ config = ServeDeploySchema.parse_obj(self.get_test_deploy_config()) # Change names to invalid names that don't contain "deployment" or "application" @@ -1402,6 +1664,52 @@ def check_app_message(): wait_for_condition(check_app_message) + def test_deploy_with_no_applications(self, client: ServeControllerClient): + """Deploy an empty list of applications, serve should just be started.""" + + config = ServeDeploySchema.parse_obj({"applications": []}) + client.deploy_apps(config) + + def serve_running(): + ServeInstanceDetails.parse_obj( + ray.get(client._controller.get_serve_instance_details.remote()) + ) + actors = list_actors( + filters=[ + ("ray_namespace", "=", SERVE_NAMESPACE), + ("state", "=", "ALIVE"), + ] + ) + actor_names = [actor["class_name"] for actor in actors] + return "ServeController" in actor_names and "HTTPProxyActor" in actor_names + + wait_for_condition(serve_running) + + def test_deployments_not_listed_in_config(self, client: ServeControllerClient): + """Apply a config without the app's deployments listed. The deployments should + not redeploy. 
+ """ + + name = f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}f" + config = {"import_path": "ray.serve.tests.test_config_files.pid.node"} + client.deploy_apps(ServeApplicationSchema(**config)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + pid1, _ = requests.get("http://localhost:8000/f").json() + + # Redeploy the same config (with no deployments listed) + client.deploy_apps(ServeApplicationSchema(**config)) + wait_for_condition( + partial(self.check_deployment_running, client, name), timeout=15 + ) + + # It should be the same replica actor + pids = [] + for _ in range(4): + pids.append(requests.get("http://localhost:8000/f").json()[0]) + assert all(pid == pid1 for pid in pids) + class TestServeRequestProcessingTimeoutS: @pytest.mark.parametrize( diff --git a/python/ray/serve/tests/test_standalone3.py b/python/ray/serve/tests/test_standalone3.py index 9170bb4fe700..47b7173b187c 100644 --- a/python/ray/serve/tests/test_standalone3.py +++ b/python/ray/serve/tests/test_standalone3.py @@ -22,6 +22,7 @@ from ray.exceptions import RayActorError from ray.serve._private.constants import ( SYNC_HANDLE_IN_DAG_FEATURE_FLAG_ENV_KEY, + SERVE_DEFAULT_APP_NAME, ) from ray.serve.context import get_global_client from ray.tests.conftest import call_ray_stop_only # noqa: F401 @@ -196,6 +197,50 @@ def verify_metrics(): serve.shutdown() +@pytest.mark.parametrize( + "ray_instance", + [], + indirect=True, +) +def test_replica_health_metric(ray_instance): + """Test replica health metrics""" + + @serve.deployment(num_replicas=2) + def f(): + return "hello" + + serve.run(f.bind()) + + def count_live_replica_metrics(): + resp = requests.get("http://127.0.0.1:9999").text + resp = resp.split("\n") + count = 0 + for metrics in resp: + if "# HELP" in metrics or "# TYPE" in metrics: + continue + if "serve_deployment_replica_healthy" in metrics: + if "1.0" in metrics: + count += 1 + return count + + wait_for_condition( + lambda: 
count_live_replica_metrics() == 2, timeout=120, retry_interval_ms=500 + ) + + # Add more replicas + serve.run(f.options(num_replicas=10).bind()) + wait_for_condition( + lambda: count_live_replica_metrics() == 10, timeout=120, retry_interval_ms=500 + ) + + # delete the application + serve.delete(SERVE_DEFAULT_APP_NAME) + wait_for_condition( + lambda: count_live_replica_metrics() == 0, timeout=120, retry_interval_ms=500 + ) + serve.shutdown() + + def test_shutdown_remote(start_and_shutdown_ray_cli_function): """Check that serve.shutdown() works on a remote Ray cluster.""" diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 26623a66b6c0..8cb7a0beed60 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -151,6 +151,7 @@ py_test_module_list( "test_runtime_env_env_vars.py", "test_runtime_env_packaging.py", "test_runtime_env_plugin.py", + "test_runtime_env_setup_func.py", "test_runtime_env_strong_type.py", "test_runtime_env_fork_process.py", "test_serialization.py", @@ -204,6 +205,7 @@ py_test_module_list( "test_top_level_api.py", "test_unhandled_error.py", "test_utils.py", + "test_widgets.py", ], size = "small", tags = ["exclusive", "small_size_python_tests", "team:core"], @@ -355,7 +357,7 @@ py_test_module_list( "test_runtime_env_working_dir_remote_uri.py" ], size = "large", - tags = ["exclusive", "large_size_python_tests_shard_2", "team:serve"], + tags = ["exclusive", "large_size_python_tests_shard_2", "team:core"], deps = ["//:ray_lib", ":conftest"], data = ["pip_install_test-0.5-py3-none-any.whl"], ) @@ -369,7 +371,7 @@ py_test_module_list( "test_runtime_env_conda_and_pip_5.py", ], size = "large", - tags = ["exclusive", "post_wheel_build", "team:serve"], + tags = ["exclusive", "post_wheel_build", "team:core"], deps = ["//:ray_lib", ":conftest"], ) @@ -377,7 +379,7 @@ py_test( name = "test_runtime_env_complicated", size = "large", srcs = ["test_runtime_env_complicated.py"], - tags = ["exclusive", "post_wheel_build", "team:serve"], + tags 
= ["exclusive", "post_wheel_build", "team:core"], deps = ["//:ray_lib", ":conftest"], data = ["//python/ray/experimental/packaging/example_pkg"], ) @@ -386,7 +388,7 @@ py_test( name = "test_actor_group", size = "medium", srcs = ["test_actor_group.py"], - tags = ["exclusive", "medium_size_python_tests_a_to_j", "team:serve"], + tags = ["exclusive", "medium_size_python_tests_a_to_j", "team:core"], deps = ["//:ray_lib", ":conftest"] ) @@ -452,7 +454,7 @@ py_test( "test_runtime_env_validation_1_schema.json", "test_runtime_env_validation_2_schema.json", ], - tags = ["exclusive", "small_size_python_tests", "team:serve"], + tags = ["exclusive", "small_size_python_tests", "team:core"], deps = ["//:ray_lib", ":conftest"], ) @@ -460,7 +462,7 @@ py_test( name = "test_runtime_env_ray_minimal", size = "medium", srcs = ["test_runtime_env_ray_minimal.py"], - tags = ["exclusive", "medium_size_python_tests_k_to_z", "team:serve"], + tags = ["exclusive", "medium_size_python_tests_k_to_z", "team:core"], deps = ["//:ray_lib", ":conftest"], ) diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index cdc049ca860f..3af9928cf784 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -146,21 +146,42 @@ def get_default_fixture_ray_kwargs(): return ray_kwargs -@contextmanager -def _setup_redis(request): - # Setup external Redis and env var for initialization. 
- redis_ports = [] - for _ in range(redis_replicas()): - # max port for redis cluster - port = 55536 - while port >= 55535: - with socket.socket() as s: - s.bind(("", 0)) - port = s.getsockname()[1] - print("Picking port", port) - redis_ports.append(port) +def is_process_listen_to_port(pid, port): + retry_num = 10 + interval_time = 0.5 + for _ in range(retry_num): + try: + proc = psutil.Process(pid) + for conn in proc.connections(): + if conn.status == "LISTEN" and conn.laddr.port == port: + return True + except Exception: + pass + finally: + time.sleep(interval_time) + print( + f"Process({pid}) has not listened to port {port} " + + f"for more than {retry_num * interval_time}s." + ) + return False + + +def start_redis(db_dir): + retry_num = 0 + while True: + is_need_restart = False + # Setup external Redis and env var for initialization. + redis_ports = [] + for _ in range(redis_replicas()): + # max port for redis cluster + port = 55536 + while port >= 55535: + with socket.socket() as s: + s.bind(("", 0)) + port = s.getsockname()[1] + print("Picking port", port) + redis_ports.append(port) - with tempfile.TemporaryDirectory() as tmpdirname: processes = [] enable_tls = "RAY_REDIS_CA_CERT" in os.environ leader_port = None @@ -174,12 +195,30 @@ def _setup_redis(request): enable_tls=enable_tls, replica_of=leader_port, leader_id=leader_id, - db_dir=tmpdirname, + db_dir=db_dir, ) if leader_port is None: leader_port = port leader_id = node_id processes.append(proc) + # Check if th redis has started successfully and is listening on the port. + if not is_process_listen_to_port(proc.process.pid, port): + is_need_restart = True + break + + if is_need_restart: + retry_num += 1 + for proc in processes: + proc.process.kill() + + if retry_num > 5: + raise RuntimeError("Failed to start redis after {retry_num} attempts.") + print( + "Retry to start redis because the process failed to " + + f"listen to the port({port}), retry num:{retry_num}." 
+ ) + continue + if redis_replicas() > 1: import redis @@ -189,6 +228,13 @@ def _setup_redis(request): scheme = "rediss://" if enable_tls else "" address_str = f"{scheme}127.0.0.1:{redis_ports[-1]}" + return address_str, processes + + +@contextmanager +def _setup_redis(request): + with tempfile.TemporaryDirectory() as tmpdirname: + address_str, processes = start_redis(tmpdirname) old_addr = os.environ.get("RAY_REDIS_ADDRESS") os.environ["RAY_REDIS_ADDRESS"] = address_str import uuid @@ -1143,3 +1189,23 @@ def enable_syncer_test(request, monkeypatch): yield monkeypatch.delenv("RAY_use_ray_syncer") ray._raylet.Config.initialize("") + + +@pytest.fixture(scope="function") +def temp_file(request): + with tempfile.NamedTemporaryFile("r+b") as fp: + yield fp + + +@pytest.fixture(scope="module") +def random_ascii_file(request): + import random + import string + + file_size = getattr(request, "param", 1 << 10) + + with tempfile.NamedTemporaryFile(mode="r+b") as fp: + fp.write("".join(random.choices(string.ascii_letters, k=file_size)).encode()) + fp.flush() + + yield fp diff --git a/python/ray/tests/gcp/test_gcp_node_provider.py b/python/ray/tests/gcp/test_gcp_node_provider.py index da27cc3be02e..7c551455a29f 100644 --- a/python/ray/tests/gcp/test_gcp_node_provider.py +++ b/python/ray/tests/gcp/test_gcp_node_provider.py @@ -35,6 +35,33 @@ def __init__(self, provider_config: dict, cluster_name: str): assert create_node_return_value == expected_return_value +def test_terminate_nodes(): + mock_node_config = {"machineType": "n2-standard-8"} + node_type = GCPNodeType.COMPUTE.value + id1, id2 = f"instance-id1-{node_type}", f"instance-id2-{node_type}" + terminate_node_ids = [id1, id2] + mock_resource = MagicMock() + mock_resource.create_instances.return_value = [ + ({"dict": 1}, id1), + ({"dict": 2}, id2), + ] + mock_resource.delete_instance.return_value = "test" + expected_terminate_nodes_result_len = 2 + + def __init__(self, provider_config: dict, cluster_name: str): + self.lock 
= RLock() + self.cached_nodes: Dict[str, GCPNode] = {} + self.resources: Dict[GCPNodeType, GCPResource] = {} + self.resources[GCPNodeType.COMPUTE] = mock_resource + + with patch.object(GCPNodeProvider, "__init__", __init__): + node_provider = GCPNodeProvider({}, "") + node_provider.create_node(mock_node_config, {}, 1) + create_results = node_provider.terminate_nodes(terminate_node_ids) + + assert len(create_results) == expected_terminate_nodes_result_len + + @pytest.mark.parametrize( "test_case", [ diff --git a/python/ray/tests/ludwig/ludwig_test_utils.py b/python/ray/tests/ludwig/ludwig_test_utils.py index 95a7341e0f11..069d431655ad 100644 --- a/python/ray/tests/ludwig/ludwig_test_utils.py +++ b/python/ray/tests/ludwig/ludwig_test_utils.py @@ -521,7 +521,7 @@ def create_data_set_to_use(data_format, raw_data): # support for writing to a fwf dataset based on this stackoverflow posting: # https://stackoverflow.com/questions/16490261/python-pandas-write-dataframe-to-fixed-width-file-to-fwf - from tabulate import tabulate + from ray._private.thirdparty.tabulate.tabulate import tabulate def to_fwf(df, fname): content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain") diff --git a/python/ray/tests/test_actor_advanced.py b/python/ray/tests/test_actor_advanced.py index 5f18194be475..3f692e444dfa 100644 --- a/python/ray/tests/test_actor_advanced.py +++ b/python/ray/tests/test_actor_advanced.py @@ -7,7 +7,7 @@ import ray import ray._private.gcs_utils as gcs_utils -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors import ray.cluster_utils from ray._private.test_utils import ( SignalActor, @@ -768,8 +768,9 @@ def f(self): # Verify an exception is thrown. 
a = Actor.remote() - with pytest.raises(ray.exceptions.RayActorError): + with pytest.raises(ray.exceptions.RayActorError) as excinfo: ray.get(a.f.remote()) + assert excinfo.value.actor_id == a._actor_id.hex() # Test an actor can be restarted successfully # afte it dies in its constructor. @@ -1302,7 +1303,7 @@ def verify_cached_dead_actor_cleaned(): driver = """ import ray -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors ray.init("auto") @ray.remote diff --git a/python/ray/tests/test_actor_failures.py b/python/ray/tests/test_actor_failures.py index 7e82a4975156..86e14eaee994 100644 --- a/python/ray/tests/test_actor_failures.py +++ b/python/ray/tests/test_actor_failures.py @@ -1,3 +1,4 @@ +import atexit import asyncio import collections import numpy as np @@ -8,6 +9,7 @@ import time import ray +from ray.actor import exit_actor import ray.cluster_utils from ray._private.test_utils import ( wait_for_condition, @@ -712,6 +714,7 @@ def create_actor(self): ray.exceptions.RayActorError, match="it was killed by `ray.kill" ) as exc_info: ray.get(a.check_alive.remote()) + assert exc_info.value.actor_id == a._actor_id.hex() print(exc_info._excinfo[1]) # Test actor killed because of worker failure. @@ -723,6 +726,7 @@ def create_actor(self): match=("The actor is dead because its worker process has died"), ) as exc_info: ray.get(a.check_alive.remote()) + assert exc_info.value.actor_id == a._actor_id.hex() print(exc_info._excinfo[1]) # Test acator killed because of owner failure. @@ -734,6 +738,7 @@ def create_actor(self): match="The actor is dead because its owner has died", ) as exc_info: ray.get(a.check_alive.remote()) + assert exc_info.value.actor_id == a._actor_id.hex() print(exc_info._excinfo[1]) # Test actor killed because the node is dead. 
@@ -746,6 +751,7 @@ def create_actor(self): match="The actor is dead because its node has died.", ) as exc_info: ray.get(a.check_alive.remote()) + assert exc_info.value.actor_id == a._actor_id.hex() print(exc_info._excinfo[1]) @@ -790,6 +796,145 @@ def foo(): ray.get(ref) +def test_exit_actor(shutdown_only, tmp_path): + """ + Verify TypeError is raised when exit_actor is not used + inside an actor. + """ + with pytest.raises( + TypeError, match="exit_actor API is called on a non-actor worker" + ): + exit_actor() + + @ray.remote + def f(): + exit_actor() + + with pytest.raises( + TypeError, match="exit_actor API is called on a non-actor worker" + ): + ray.get(f.remote()) + + """ + Verify the basic case. + """ + + @ray.remote + class Actor: + def exit(self): + exit_actor() + + @ray.remote + class AsyncActor: + async def exit(self): + exit_actor() + + a = Actor.remote() + ray.get(a.__ray_ready__.remote()) + with pytest.raises(ray.exceptions.RayActorError) as exc_info: + ray.get(a.exit.remote()) + assert "exit_actor()" in str(exc_info.value) + + b = AsyncActor.remote() + ray.get(b.__ray_ready__.remote()) + with pytest.raises(ray.exceptions.RayActorError) as exc_info: + ray.get(b.exit.remote()) + assert "exit_actor()" in str(exc_info.value) + + """ + Verify atexit handler is called correctly. 
+ """ + sync_temp_file = tmp_path / "actor.log" + async_temp_file = tmp_path / "async_actor.log" + sync_temp_file.touch() + async_temp_file.touch() + + @ray.remote + class Actor: + def __init__(self): + def f(): + print("atexit handler") + with open(sync_temp_file, "w") as f: + f.write("Actor\n") + + atexit.register(f) + + def exit(self): + exit_actor() + + @ray.remote + class AsyncActor: + def __init__(self): + def f(): + print("atexit handler") + with open(async_temp_file, "w") as f: + f.write("Async Actor\n") + + atexit.register(f) + + async def exit(self): + exit_actor() + + a = Actor.remote() + ray.get(a.__ray_ready__.remote()) + b = AsyncActor.remote() + ray.get(b.__ray_ready__.remote()) + with pytest.raises(ray.exceptions.RayActorError): + ray.get(a.exit.remote()) + with pytest.raises(ray.exceptions.RayActorError): + ray.get(b.exit.remote()) + + def verify(): + with open(async_temp_file) as f: + assert f.readlines() == ["Async Actor\n"] + with open(sync_temp_file) as f: + assert f.readlines() == ["Actor\n"] + return True + + wait_for_condition(verify) + + +def test_exit_actor_queued(shutdown_only): + """Verify after exit_actor is called the queued tasks won't execute.""" + + @ray.remote + class RegressionSync: + def f(self): + import time + + time.sleep(1) + exit_actor() + + def ping(self): + pass + + @ray.remote + class RegressionAsync: + async def f(self): + await asyncio.sleep(1) + exit_actor() + + def ping(self): + pass + + # Test async case. + # https://github.com/ray-project/ray/issues/32376 + # If we didn't fix this issue, this will segfault. + a = RegressionAsync.remote() + a.f.remote() + refs = [a.ping.remote() for _ in range(10000)] + with pytest.raises(ray.exceptions.RayActorError) as exc_info: + ray.get(refs) + assert " Worker unexpectedly exits" not in str(exc_info.value) + + # Test a sync case. 
+ a = RegressionSync.remote() + a.f.remote() + with pytest.raises(ray.exceptions.RayActorError) as exc_info: + ray.get([a.ping.remote() for _ in range(10000)]) + assert " Worker unexpectedly exits" not in str(exc_info.value) + + if __name__ == "__main__": import pytest diff --git a/python/ray/tests/test_actor_pool.py b/python/ray/tests/test_actor_pool.py index 63fdc7b31071..a4c04276b553 100644 --- a/python/ray/tests/test_actor_pool.py +++ b/python/ray/tests/test_actor_pool.py @@ -1,6 +1,7 @@ import asyncio import sys import time +from unittest.mock import MagicMock import pytest import ray @@ -79,6 +80,26 @@ def double(self, x): index += 1 +def test_map_eager(init): + """Verify that submit is called eagerly when map is called. + + If the results are directly yielded, then the submit calls are not + executed until the results are consumed. + """ + + @ray.remote + class MyActor: + def f(self, x): + pass + + actor = MyActor.remote() + pool = ActorPool([actor]) + pool.submit = MagicMock() + + pool.map(lambda a, v: a.f.remote(v), range(1)) + pool.submit.assert_called() + + def test_map_unordered(init): @ray.remote class MyActor: diff --git a/python/ray/tests/test_actor_state_metrics.py b/python/ray/tests/test_actor_state_metrics.py index a0069c161f74..e53b06c9df99 100644 --- a/python/ray/tests/test_actor_state_metrics.py +++ b/python/ray/tests/test_actor_state_metrics.py @@ -6,7 +6,7 @@ import ray -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray._private.test_utils import ( raw_metrics, wait_for_condition, diff --git a/python/ray/tests/test_advanced_2.py b/python/ray/tests/test_advanced_2.py index f1900ec46e16..8bb2838c61e6 100644 --- a/python/ray/tests/test_advanced_2.py +++ b/python/ray/tests/test_advanced_2.py @@ -520,21 +520,6 @@ def test(): assert cluster_resources == {} -def test_ray_get_timeout_zero(monkeypatch): - # Check that ray.get(timeout=0) raises warnings on change of behavior. 
- # Removed when https://github.com/ray-project/ray/issues/28465 is resolved. - with pytest.warns(UserWarning): - ray.get(ray.put(1), timeout=0) - - with monkeypatch.context() as m: - m.setenv("RAY_WARN_RAY_GET_TIMEOUT_ZERO", "0") - import warnings - - with warnings.catch_warnings(): - warnings.simplefilter("error") - ray.get(ray.put(1), timeout=0) - - if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) diff --git a/python/ray/tests/test_advanced_9.py b/python/ray/tests/test_advanced_9.py index 369370bac091..b61e5aac9216 100644 --- a/python/ray/tests/test_advanced_9.py +++ b/python/ray/tests/test_advanced_9.py @@ -280,7 +280,7 @@ def get_gcs_num_of_connections(): time.sleep(10) - curr_fds = get_gcs_num_of_connections() + fds_without_workers = get_gcs_num_of_connections() @ray.remote class A: @@ -289,27 +289,27 @@ def ready(self): return "WORLD" num_of_actors = 10 - a = [A.remote() for _ in range(num_of_actors)] - print(ray.get([t.ready.remote() for t in a])) + actors = [A.remote() for _ in range(num_of_actors)] + print(ray.get([t.ready.remote() for t in actors])) - # Kill the actor - del a + # Kill the actors + del actors - # TODO(clarng):remove this once prestart works with actors. - # ray_start_cluster defaults to one cpu, which prestarts one worker. - FD_PER_WORKER = 2 # Make sure the # of fds opened by the GCS dropped. - wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER == curr_fds) + # This assumes worker processes are not created after the actor worker + # processes die. + wait_for_condition(lambda: get_gcs_num_of_connections() <= fds_without_workers) + num_fds_after_workers_die = get_gcs_num_of_connections() n = cluster.add_node(wait=True) # Make sure the # of fds opened by the GCS increased. 
- wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER > curr_fds) + wait_for_condition(lambda: get_gcs_num_of_connections() > num_fds_after_workers_die) cluster.remove_node(n) # Make sure the # of fds opened by the GCS dropped. - wait_for_condition(lambda: get_gcs_num_of_connections() + FD_PER_WORKER == curr_fds) + wait_for_condition(lambda: get_gcs_num_of_connections() <= fds_without_workers) @pytest.mark.parametrize( diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index 17ce2b08ca88..a7cc9890d094 100644 --- a/python/ray/tests/test_asyncio.py +++ b/python/ray/tests/test_asyncio.py @@ -348,6 +348,31 @@ async def async_thread_id(self): assert sync_id == async_id +def test_asyncio_actor_shutdown_when_non_async_method_mixed(ray_start_regular_shared): + # It is a regression test. + # https://github.com/ray-project/ray/issues/32376 + # Make sure the core worker doesn't crash when + # exit_actor is used when async & regular actor tasks + # are executed. 
+ @ray.remote + class A: + async def f(self): + await asyncio.sleep(1) + ray.actor.exit_actor() + + def ping(self): + pass + + a = A.remote() + a.f.remote() + + with pytest.raises( + ray.exceptions.RayActorError, + match=("exit_actor"), + ): + ray.get([a.ping.remote() for _ in range(10000)]) + + if __name__ == "__main__": import pytest diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index b60464f72792..4f30b6c932b7 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -362,6 +362,25 @@ class A: ray.remote(_metadata={"data": 1})(f) ray.remote(_metadata={"data": 1})(A) + # Check invalid resource quantity + with pytest.raises( + ValueError, + match=( + "The precision of the fractional quantity of resource num_gpus" + " cannot go beyond 0.0001" + ), + ): + ray.remote(num_gpus=0.0000001)(f) + + with pytest.raises( + ValueError, + match=( + "The precision of the fractional quantity of resource custom_resource" + " cannot go beyond 0.0001" + ), + ): + ray.remote(resources={"custom_resource": 0.0000001})(f) + def test_options(): """General test of option keywords in Ray.""" diff --git a/python/ray/tests/test_basic_2.py b/python/ray/tests/test_basic_2.py index b83280fcecb6..7c78747148f0 100644 --- a/python/ray/tests/test_basic_2.py +++ b/python/ray/tests/test_basic_2.py @@ -384,6 +384,10 @@ def test_get_with_timeout(ray_start_regular_shared): with pytest.raises(TimeoutError): ray.get(result_id, timeout=0.1) + # timeout of 0 should raise an error + with pytest.raises(GetTimeoutError): + ray.get(result_id, timeout=0) + # Check that a subsequent get() returns early. 
ray.get(signal.send.remote()) start = time.time() diff --git a/python/ray/tests/test_basic_3.py b/python/ray/tests/test_basic_3.py index 14c4323b1c49..318cc5c70cd8 100644 --- a/python/ray/tests/test_basic_3.py +++ b/python/ray/tests/test_basic_3.py @@ -85,13 +85,21 @@ def f(block, accepted_resources): result_ids = [] for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)): resource_set = {"CPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_cpus=rand1)) + result_ids.append( + f._remote([False, resource_set], num_cpus=resource_set["CPU"]) + ) resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_gpus=rand1)) + result_ids.append( + f._remote([False, resource_set], num_gpus=resource_set["GPU"]) + ) resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], resources={"Custom": rand1})) + result_ids.append( + f._remote( + [False, resource_set], resources={"Custom": resource_set["Custom"]} + ) + ) resource_set = { "CPU": int(rand1 * 10000) / 10000, @@ -101,17 +109,17 @@ def f(block, accepted_resources): result_ids.append( f._remote( [False, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3}, + num_cpus=resource_set["CPU"], + num_gpus=resource_set["GPU"], + resources={"Custom": resource_set["Custom"]}, ) ) result_ids.append( f._remote( [True, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3}, + num_cpus=resource_set["CPU"], + num_gpus=resource_set["GPU"], + resources={"Custom": resource_set["Custom"]}, ) ) assert all(ray.get(result_ids)) diff --git a/python/ray/tests/test_basic_4.py b/python/ray/tests/test_basic_4.py index 43eb63e5ee95..3918239b840f 100644 --- a/python/ray/tests/test_basic_4.py +++ b/python/ray/tests/test_basic_4.py @@ -6,7 +6,6 @@ from pathlib import Path import os -import numpy as np import pytest from unittest import mock @@ 
-21,7 +20,7 @@ def test_actor_scheduling(shutdown_only): - ray.init() + ray.init(num_cpus=1) @ray.remote class A: @@ -33,7 +32,7 @@ def get(self): a = A.remote() a.run_fail.remote() - with pytest.raises(Exception): + with pytest.raises(ray.exceptions.RayActorError, match="exit_actor"): ray.get([a.get.remote()]) @@ -59,11 +58,7 @@ def slow_function(): # Flood a large scale lease worker requests. for i in range(10000): - # Use random cpu resources to make sure that all tasks are sent - # to the raylet. Because core worker will cache tasks with the - # same resource shape. - num_cpus = 0.24 + np.random.uniform(0, 0.01) - slow_function.options(num_cpus=num_cpus).remote() + slow_function.options(num_cpus=0.25).remote() # Check "debug_state.txt" to ensure no extra workers were started. session_dir = ray._private.worker.global_worker.node.address_info["session_dir"] diff --git a/python/ray/tests/test_basic_5.py b/python/ray/tests/test_basic_5.py index 20aee63e2983..c3847ad8b1be 100644 --- a/python/ray/tests/test_basic_5.py +++ b/python/ray/tests/test_basic_5.py @@ -15,6 +15,7 @@ from ray._private.test_utils import ( run_string_as_driver, wait_for_pid_to_exit, + client_test_enabled, ) logger = logging.getLogger(__name__) @@ -144,7 +145,7 @@ def pid(self): assert "Traceback" not in log -@pytest.mark.skipif(sys.platform == "win32", reason="Flaky on windows") +@pytest.mark.skip("flaky test") def test_run_on_all_workers(call_ray_start, tmp_path): # This test is to ensure run_function_on_all_workers are executed # on all workers. 
@@ -226,6 +227,9 @@ def sys_path(): subprocess.check_call(["python", "-m", "package.module2"]) +# This will be fixed on Windows once the import thread is removed, see +# https://github.com/ray-project/ray/pull/30895 +@pytest.mark.skipif(sys.platform == "win32", reason="Currently fails on Windows.") def test_worker_kv_calls(monkeypatch, shutdown_only): monkeypatch.setenv("TEST_RAY_COLLECT_KV_FREQUENCY", "1") ray.init() @@ -362,6 +366,7 @@ def verify_imports(latch): ray.get(futures) +@pytest.mark.skipif(client_test_enabled(), reason="only server mode") def test_gcs_port_env(): try: with unittest.mock.patch.dict(os.environ): diff --git a/python/ray/tests/test_cancel.py b/python/ray/tests/test_cancel.py index 00d8299e1a07..1c6d33d4d61f 100644 --- a/python/ray/tests/test_cancel.py +++ b/python/ray/tests/test_cancel.py @@ -18,7 +18,7 @@ ) from ray._private.utils import DeferSigint from ray._private.test_utils import SignalActor, wait_for_condition -from ray.experimental.state.api import list_tasks +from ray.util.state import list_tasks def valid_exceptions(use_force): @@ -508,7 +508,6 @@ def inner(): @ray.remote(num_cpus=1) def outer(): - x = [inner.remote()] print(x) while True: diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index 1a9c813a8674..865099a5ab1f 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -44,7 +44,7 @@ import ray.scripts.scripts as scripts from ray._private.test_utils import wait_for_condition from ray.cluster_utils import cluster_not_supported -from ray.experimental.state.api import list_nodes +from ray.util.state import list_nodes import psutil @@ -193,7 +193,6 @@ def _debug_check_line_by_line(result, expected_lines): if i < len(expected_lines): print("!!! 
ERROR: Expected extra lines (regex):") for line in expected_lines[i:]: - print(repr(line)) assert False diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 1589c8d1a44d..4ee54ed1eac6 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -65,11 +65,11 @@ def test_client_context_manager(call_ray_start_shared, connect_to_client): call_ray_start_shared ), enable_client_mode(): # Client mode is on. - assert client_mode_should_convert(auto_init=True) + assert client_mode_should_convert() # We're connected to Ray client. assert ray.util.client.ray.is_connected() else: - assert not client_mode_should_convert(auto_init=True) + assert not client_mode_should_convert() assert not ray.util.client.ray.is_connected() @@ -108,20 +108,20 @@ def run(self): def test_client_mode_hook_thread_safe(call_ray_start_shared): with ray_start_client_server_for_address(call_ray_start_shared): with enable_client_mode(): - assert client_mode_should_convert(auto_init=True) + assert client_mode_should_convert() lock = threading.Lock() lock.acquire() q = queue.Queue() def disable(): with disable_client_hook(): - q.put(client_mode_should_convert(auto_init=True)) + q.put(client_mode_should_convert()) lock.acquire() - q.put(client_mode_should_convert(auto_init=True)) + q.put(client_mode_should_convert()) t = threading.Thread(target=disable) t.start() - assert client_mode_should_convert(auto_init=True) + assert client_mode_should_convert() lock.release() t.join() assert q.get() is False, "Threaded disable_client_hook failed to disable" diff --git a/python/ray/tests/test_client_builder.py b/python/ray/tests/test_client_builder.py index 318c337132bf..64edd7cb9365 100644 --- a/python/ray/tests/test_client_builder.py +++ b/python/ray/tests/test_client_builder.py @@ -14,7 +14,7 @@ run_string_as_driver_nonblocking, wait_for_condition, ) -from ray.experimental.state.api import list_workers +from ray.util.state import list_workers 
@pytest.mark.parametrize( diff --git a/python/ray/tests/test_client_library_integration.py b/python/ray/tests/test_client_library_integration.py index 133a84b956de..b0e6fa602b0a 100644 --- a/python/ray/tests/test_client_library_integration.py +++ b/python/ray/tests/test_client_library_integration.py @@ -14,11 +14,11 @@ def test_rllib_integration(ray_start_regular): # Confirming the behavior of this context manager. # (Client mode hook not yet enabled.) - assert not client_mode_should_convert(auto_init=True) + assert not client_mode_should_convert() # Need to enable this for client APIs to be used. with enable_client_mode(): # Confirming mode hook is enabled. - assert client_mode_should_convert(auto_init=True) + assert client_mode_should_convert() config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy() # Run locally. @@ -38,11 +38,11 @@ def test_rllib_integration_tune(ray_start_regular): with ray_start_client_server(): # Confirming the behavior of this context manager. # (Client mode hook not yet enabled.) - assert not client_mode_should_convert(auto_init=True) + assert not client_mode_should_convert() # Need to enable this for client APIs to be used. with enable_client_mode(): # Confirming mode hook is enabled. 
- assert client_mode_should_convert(auto_init=True) + assert client_mode_should_convert() tune.run( "DQN", config={"env": "CartPole-v1"}, stop={"training_iteration": 2} ) diff --git a/python/ray/tests/test_client_proxy.py b/python/ray/tests/test_client_proxy.py index f23646263964..c52106dd70b8 100644 --- a/python/ray/tests/test_client_proxy.py +++ b/python/ray/tests/test_client_proxy.py @@ -272,9 +272,9 @@ def test_prepare_runtime_init_req_no_modification(): ), ) req, new_config = proxier.prepare_runtime_init_req(init_req) - assert new_config.serialize() == job_config.serialize() + assert new_config._serialize() == job_config._serialize() assert isinstance(req, ray_client_pb2.DataRequest) - assert pickle.loads(req.init.job_config).serialize() == new_config.serialize() + assert pickle.loads(req.init.job_config)._serialize() == new_config._serialize() assert json.loads(req.init.ray_init_kwargs) == {"log_to_driver": False} @@ -301,7 +301,7 @@ def modify_namespace(job_config: JobConfig): req, new_config = proxier.prepare_runtime_init_req(init_req) assert new_config.ray_namespace == "test_value" - assert pickle.loads(req.init.job_config).serialize() == new_config.serialize() + assert pickle.loads(req.init.job_config)._serialize() == new_config._serialize() assert json.loads(req.init.ray_init_kwargs) == {"log_to_driver": False} diff --git a/python/ray/tests/test_client_reconnect.py b/python/ray/tests/test_client_reconnect.py index e49726c62ec4..58df0027da52 100644 --- a/python/ray/tests/test_client_reconnect.py +++ b/python/ray/tests/test_client_reconnect.py @@ -367,17 +367,17 @@ def fail_every_three(_): @ray.remote def large_result(): - # 1024x1024x128 float64 matrix (1024 MiB). With 64MiB chunk size, + # 1024x1024x6 float64 matrix (48 MiB). With 5MiB chunk size, # it will take at least 16 chunks to transfer this object. 
Since # the failure is injected every 3 chunks, this transfer can only # work if the chunked get request retries at the last received chunk # (instead of starting from the beginning each retry) - return np.random.random((1024, 1024, 128)) + return np.random.random((1024, 1024, 6)) with start_middleman_server(on_task_response=fail_every_three): started = True result = ray.get(large_result.remote()) - assert result.shape == (1024, 1024, 128) + assert result.shape == (1024, 1024, 6) def test_disconnects_during_large_async_get(): @@ -398,12 +398,12 @@ def fail_every_three(_): @ray.remote def large_result(): - # 1024x1024x128 float64 matrix (1024 MiB). With 64MiB chunk size, + # 1024x1024x6 float64 matrix (48 MiB). With 5MiB chunk size, # it will take at least 16 chunks to transfer this object. Since # the failure is injected every 3 chunks, this transfer can only # work if the chunked get request retries at the last received chunk # (instead of starting from the beginning each retry) - return np.random.random((1024, 1024, 128)) + return np.random.random((1024, 1024, 6)) with start_middleman_server(on_data_response=fail_every_three): started = True @@ -412,7 +412,7 @@ async def get_large_result(): return await large_result.remote() result = get_or_create_event_loop().run_until_complete(get_large_result()) - assert result.shape == (1024, 1024, 128) + assert result.shape == (1024, 1024, 6) def test_disconnect_during_large_put(): @@ -433,10 +433,10 @@ def fail_halfway(_): with start_middleman_server(on_data_request=fail_halfway): started = True - objref = ray.put(np.random.random((1024, 1024, 128))) + objref = ray.put(np.random.random((1024, 1024, 6))) assert i > 8 # Check that the failure was injected result = ray.get(objref) - assert result.shape == (1024, 1024, 128) + assert result.shape == (1024, 1024, 6) def test_disconnect_during_large_schedule(): @@ -461,10 +461,10 @@ def f(a): with start_middleman_server(on_data_request=fail_halfway): started = True - a = 
np.random.random((1024, 1024, 128)) + a = np.random.random((1024, 1024, 6)) result = ray.get(f.remote(a)) assert i > 8 # Check that the failure was injected - assert result == (1024, 1024, 128) + assert result == (1024, 1024, 6) def test_valid_actor_state(): diff --git a/python/ray/tests/test_component_failures_2.py b/python/ray/tests/test_component_failures_2.py index e39bdc99a34a..35fc1b385183 100644 --- a/python/ray/tests/test_component_failures_2.py +++ b/python/ray/tests/test_component_failures_2.py @@ -127,7 +127,7 @@ def get_node_info(): cluster.head_node.node_ip_address, ) - assert get_node_info().raylet_socket_name == cluster.head_node.raylet_socket_name + assert get_node_info()["raylet_socket_name"] == cluster.head_node.raylet_socket_name cluster.head_node.kill_raylet() wait_for_condition( @@ -137,7 +137,7 @@ def get_node_info(): get_node_info() node2 = cluster.add_node() - assert get_node_info().raylet_socket_name == node2.raylet_socket_name + assert get_node_info()["raylet_socket_name"] == node2.raylet_socket_name if __name__ == "__main__": diff --git a/python/ray/tests/test_dashboard.py b/python/ray/tests/test_dashboard.py index bef335b519e4..dad7ed17c7ea 100644 --- a/python/ray/tests/test_dashboard.py +++ b/python/ray/tests/test_dashboard.py @@ -98,7 +98,6 @@ def dashboard_available(): indirect=True, ) def test_port_conflict(listen_port, call_ray_stop_only, shutdown_only): - try: subprocess.check_output( [ @@ -151,6 +150,7 @@ def test_dashboard(shutdown_only): conflict_port = 34567 +configured_test_port = 34568 def run_tasks_without_runtime_env(): @@ -195,16 +195,50 @@ def f(): def test_dashboard_agent_grpc_port_conflict(listen_port, call_ray_start): address = call_ray_start ray.init(address=address) + # Tasks without runtime env still work when dashboard agent grpc port conflicts. run_tasks_without_runtime_env() # Tasks with runtime env couldn't work. 
with pytest.raises( ray.exceptions.RuntimeEnvSetupError, - match="the grpc service of agent is invalid", + match="Ray agent couldn't be started due to the port conflict", ): run_tasks_with_runtime_env() +@pytest.mark.parametrize( + "call_ray_start", + [f"ray start --head --num-cpus=1 --dashboard-grpc-port={configured_test_port}"], + indirect=True, +) +def test_configured_dashboard_grpc_port(call_ray_start): + address = call_ray_start + addresses = ray.init(address=address) + assert addresses.dashboard_url == "127.0.0.1:8265" + + +@pytest.mark.parametrize( + "listen_port", + [conflict_port], + indirect=True, +) +def test_dashboard_grpc_port_conflict(listen_port, call_ray_stop_only, shutdown_only): + try: + subprocess.check_output( + [ + "ray", + "start", + "--head", + "--dashboard-grpc-port", + f"{conflict_port}", + "--include-dashboard=True", + ], + stderr=subprocess.PIPE, + ) + except subprocess.CalledProcessError as e: + assert f"Failed to bind to address 0.0.0.0:{conflict_port}".encode() in e.stderr + + @pytest.mark.skipif( sys.platform == "win32", reason="`runtime_env` with `pip` not supported on Windows." 
) diff --git a/python/ray/tests/test_environ.py b/python/ray/tests/test_environ.py index ec407799feb5..e074a761f0a5 100644 --- a/python/ray/tests/test_environ.py +++ b/python/ray/tests/test_environ.py @@ -1,6 +1,8 @@ import os import pytest +import unittest import ray +from ray._private.utils import update_envs @pytest.mark.skipif("sys.platform != 'linux'") @@ -31,6 +33,34 @@ def get_os_environ(self): assert len(actor_os_environ) > 0 +def test_update_envs(): + with unittest.mock.patch.dict(os.environ): + env_vars = { + "PATH": "/test/lib/path:${PATH}", + "LD_LIBRARY_PATH": "/test/path1:${LD_LIBRARY_PATH}:./test/path2", + "DYLD_LIBRARY_PATH": "${DYLD_LIBRARY_PATH}:/test/path", + "LD_PRELOAD": "", + } + old_path = os.environ["PATH"] + os.environ["LD_LIBRARY_PATH"] = "./" + os.environ["DYLD_LIBRARY_PATH"] = "/lib64" + os.environ["LD_PRELOAD"] = "/lib:/usr/local/lib" + update_envs(env_vars) + assert os.environ["PATH"] == "/test/lib/path:" + old_path + assert os.environ["LD_LIBRARY_PATH"] == "/test/path1:./:./test/path2" + assert os.environ["DYLD_LIBRARY_PATH"] == "/lib64:/test/path" + assert os.environ["LD_PRELOAD"] == env_vars["LD_PRELOAD"] + + # Test the empty string scenario + os.environ["LD_LIBRARY_PATH"] = "" + del os.environ["DYLD_LIBRARY_PATH"] + del os.environ["LD_PRELOAD"] + update_envs(env_vars) + assert os.environ["LD_LIBRARY_PATH"] == "/test/path1::./test/path2" + assert os.environ["DYLD_LIBRARY_PATH"] == ":/test/path" + assert os.environ["LD_PRELOAD"] == env_vars["LD_PRELOAD"] + + if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tests/test_exit_observability.py b/python/ray/tests/test_exit_observability.py index 114ec1d504e5..eea3b9470016 100644 --- a/python/ray/tests/test_exit_observability.py +++ b/python/ray/tests/test_exit_observability.py @@ -8,7 +8,7 @@ import ray from ray._private.test_utils import run_string_as_driver, wait_for_condition -from ray.experimental.state.api import list_workers, list_nodes +from ray.util.state 
import list_workers, list_nodes from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index 71bb7a98dd9a..93f1c734ee0a 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -10,7 +10,6 @@ import ray._private.gcs_utils as gcs_utils import ray._private.ray_constants as ray_constants import ray._private.utils -from ray._private.gcs_pubsub import GcsPublisher from ray._private.test_utils import ( SignalActor, convert_actor_state, @@ -69,7 +68,7 @@ def interceptor(e): def test_publish_error_to_driver(ray_start_regular, error_pubsub): address_info = ray_start_regular - gcs_publisher = GcsPublisher(address=address_info["gcs_address"]) + gcs_publisher = ray._raylet.GcsPublisher(address=address_info["gcs_address"]) error_message = "Test error message" ray._private.utils.publish_error_to_driver( diff --git a/python/ray/tests/test_failure_4.py b/python/ray/tests/test_failure_4.py index f56f2e802b0d..d040953b9493 100644 --- a/python/ray/tests/test_failure_4.py +++ b/python/ray/tests/test_failure_4.py @@ -7,7 +7,7 @@ import psutil import pytest from grpc._channel import _InactiveRpcError -from ray.experimental.state.api import list_tasks +from ray.util.state import list_tasks from ray._private.state_api_test_utils import verify_failed_task import ray @@ -700,6 +700,25 @@ def sleeper(): assert raylet["NodeManagerAddress"] in message +def test_accessing_actor_after_cluster_crashed(shutdown_only): + ray.init() + + @ray.remote + class A: + def f(self): + return + + a = A.remote() + + ray.get(a.f.remote()) + + ray.shutdown() + ray.init() + with pytest.raises(Exception) as exc_info: + ray.get(a.f.remote()) + assert "It might be dead or it's from a different cluster" in exc_info.value.args[0] + + if __name__ == "__main__": import os diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index 
3f70356db8f6..72caad2f0f6e 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -17,6 +17,9 @@ wait_for_pid_to_exit, run_string_as_driver, ) +from ray._private.gcs_pubsub import ( + GcsErrorSubscriber, +) import psutil @@ -649,6 +652,45 @@ def pid(self): ray.get_actor("A") +@pytest.mark.parametrize( + "ray_start_regular_with_external_redis", + [ + generate_system_config_map( + gcs_failover_worker_reconnect_timeout=20, + gcs_rpc_server_reconnect_timeout_s=60, + gcs_server_request_timeout_seconds=10, + ) + ], + indirect=True, +) +@pytest.mark.skip( + reason="python publisher and subscriber doesn't handle gcs server failover" +) +def test_publish_and_subscribe_error_info(ray_start_regular_with_external_redis): + address_info = ray_start_regular_with_external_redis + gcs_server_addr = address_info["gcs_address"] + + subscriber = GcsErrorSubscriber(address=gcs_server_addr) + subscriber.subscribe() + + publisher = ray._raylet.GcsPublisher(address=gcs_server_addr) + print("sending error message 1") + publisher.publish_error(b"aaa_id", "", "test error message 1") + + ray._private.worker._global_node.kill_gcs_server() + ray._private.worker._global_node.start_gcs_server() + + print("sending error message 2") + publisher.publish_error(b"bbb_id", "", "test error message 2") + print("done") + + (key_id, err) = subscriber.poll() + assert key_id == b"bbb_id" + assert err.error_message == "test error message 2" + + subscriber.close() + + @pytest.fixture def redis_replicas(monkeypatch): monkeypatch.setenv("TEST_EXTERNAL_REDIS_REPLICAS", "3") diff --git a/python/ray/tests/test_gcs_ha_e2e.py b/python/ray/tests/test_gcs_ha_e2e.py index 8b12e7a91e5d..53c57167a1d2 100644 --- a/python/ray/tests/test_gcs_ha_e2e.py +++ b/python/ray/tests/test_gcs_ha_e2e.py @@ -3,11 +3,34 @@ import threading from time import sleep from ray._private.test_utils import wait_for_condition -from pytest_docker_tools import container, fetch, network +from 
pytest_docker_tools import container, fetch, network, volume from pytest_docker_tools import wrappers from http.client import HTTPConnection +# If you need to debug these tests, comment in the volume +# mounts in the head node and worker node containers below and use +# the repro-ci.py script to spin up an instance. The test +# setup is a little intricate, as it uses docker-in-docker. +# You need to ssh into the host machine, find the +# docker-in-docker container with +# +# docker ps +# +# Log into the container with +# +# docker exec -it <container_id> sh +# +# And run +# +# mkdir -p /tmp/ray +# chmod 777 /tmp/ray +# +# Now you can re-run the test and the logs will show +# up in /tmp/ray in the docker-in-docker container. +# Good luck! + + class Container(wrappers.Container): def ready(self): self._container.reload() @@ -36,6 +59,11 @@ def client(self): port = self.ports["8000/tcp"][0] return HTTPConnection(f"localhost:{port}") + def print_logs(self): + for (name, content) in self.get_files("/tmp"): + print(f"===== log start: {name} ====") + print(content.decode()) + gcs_network = network(driver="bridge") @@ -47,6 +75,9 @@ def client(self): command=("redis-server --save 60 1 --loglevel" " warning"), ) +head_node_vol = volume() +worker_node_vol = volume() + head_node = container( image="ray_ci:v1", name="gcs", @@ -63,11 +94,15 @@ def client(self): "--node-manager-port", "9379", ], + volumes={"{head_node_vol.name}": {"bind": "/tmp", "mode": "rw"}}, environment={"RAY_REDIS_ADDRESS": "{redis.ips.primary}:6379"}, wrapper_class=Container, ports={ "8000/tcp": None, }, + # volumes={ + # "/tmp/ray/": {"bind": "/tmp/ray/", "mode": "rw"} + # }, ) worker_node = container( @@ -84,11 +119,15 @@ def client(self): "--node-manager-port", "9379", ], + volumes={"{worker_node_vol.name}": {"bind": "/tmp", "mode": "rw"}}, environment={"RAY_REDIS_ADDRESS": "{redis.ips.primary}:6379"}, wrapper_class=Container, ports={ "8000/tcp": None, }, + # volumes={ + # "/tmp/ray/": {"bind": "/tmp/ray/", "mode": 
"rw"} + # }, ) diff --git a/python/ray/tests/test_gcs_pubsub.py b/python/ray/tests/test_gcs_pubsub.py index b9a4eddee7a4..71d4ae802f26 100644 --- a/python/ray/tests/test_gcs_pubsub.py +++ b/python/ray/tests/test_gcs_pubsub.py @@ -3,8 +3,8 @@ import threading import re +import ray from ray._private.gcs_pubsub import ( - GcsPublisher, GcsErrorSubscriber, GcsLogSubscriber, GcsFunctionKeySubscriber, @@ -24,14 +24,16 @@ def test_publish_and_subscribe_error_info(ray_start_regular): subscriber = GcsErrorSubscriber(address=gcs_server_addr) subscriber.subscribe() - publisher = GcsPublisher(address=gcs_server_addr) - err1 = ErrorTableData(error_message="test error message 1") - err2 = ErrorTableData(error_message="test error message 2") - publisher.publish_error(b"aaa_id", err1) - publisher.publish_error(b"bbb_id", err2) + publisher = ray._raylet.GcsPublisher(address=gcs_server_addr) + publisher.publish_error(b"aaa_id", "", "test error message 1") + publisher.publish_error(b"bbb_id", "", "test error message 2") - assert subscriber.poll() == (b"aaa_id", err1) - assert subscriber.poll() == (b"bbb_id", err2) + (key_id1, err1) = subscriber.poll() + assert key_id1 == b"aaa_id" + assert err1.error_message == "test error message 1" + (key_id2, err2) = subscriber.poll() + assert key_id2 == b"bbb_id" + assert err2.error_message == "test error message 2" subscriber.close() @@ -63,7 +65,7 @@ def test_publish_and_subscribe_logs(ray_start_regular): subscriber = GcsLogSubscriber(address=gcs_server_addr) subscriber.subscribe() - publisher = GcsPublisher(address=gcs_server_addr) + publisher = ray._raylet.GcsPublisher(address=gcs_server_addr) log_batch = { "ip": "127.0.0.1", "pid": 1234, @@ -114,7 +116,7 @@ def test_publish_and_subscribe_function_keys(ray_start_regular): subscriber = GcsFunctionKeySubscriber(address=gcs_server_addr) subscriber.subscribe() - publisher = GcsPublisher(address=gcs_server_addr) + publisher = ray._raylet.GcsPublisher(address=gcs_server_addr) 
publisher.publish_function_key(b"111") publisher.publish_function_key(b"222") @@ -196,9 +198,9 @@ def receive_logs(): t2 = threading.Thread(target=receive_logs) t2.start() - publisher = GcsPublisher(address=gcs_server_addr) + publisher = ray._raylet.GcsPublisher(address=gcs_server_addr) for i in range(0, num_messages): - publisher.publish_error(b"msg_id", ErrorTableData(error_message=f"error {i}")) + publisher.publish_error(b"msg_id", "", f"error {i}") publisher.publish_logs( { "ip": "127.0.0.1", diff --git a/python/ray/tests/test_global_state.py b/python/ray/tests/test_global_state.py index 17899b74b6f5..facc69ad9b81 100644 --- a/python/ray/tests/test_global_state.py +++ b/python/ray/tests/test_global_state.py @@ -168,15 +168,11 @@ def test_node_name_cluster(ray_start_cluster): global_state_accessor = make_global_state_accessor(head_context) node_table = global_state_accessor.get_node_table() assert len(node_table) == 2 - for node_data in node_table: - node = gcs_utils.GcsNodeInfo.FromString(node_data) - if ( - ray._private.utils.binary_to_hex(node.node_id) - == head_context.address_info["node_id"] - ): - assert node.node_name == "head_node" + for node in node_table: + if node["NodeID"] == head_context.address_info["node_id"]: + assert node["NodeName"] == "head_node" else: - assert node.node_name == "worker_node" + assert node["NodeName"] == "worker_node" global_state_accessor.disconnect() ray.shutdown() @@ -188,9 +184,8 @@ def test_node_name_init(): new_head_context = ray.init(_node_name="new_head_node", include_dashboard=False) global_state_accessor = make_global_state_accessor(new_head_context) - node_data = global_state_accessor.get_node_table()[0] - node = gcs_utils.GcsNodeInfo.FromString(node_data) - assert node.node_name == "new_head_node" + node = global_state_accessor.get_node_table()[0] + assert node["NodeName"] == "new_head_node" ray.shutdown() @@ -198,9 +193,8 @@ def test_no_node_name(): # Test that starting ray with no node name will result in a 
node_name=ip_address new_head_context = ray.init(include_dashboard=False) global_state_accessor = make_global_state_accessor(new_head_context) - node_data = global_state_accessor.get_node_table()[0] - node = gcs_utils.GcsNodeInfo.FromString(node_data) - assert node.node_name == ray.util.get_node_ip_address() + node = global_state_accessor.get_node_table()[0] + assert node["NodeName"] == ray.util.get_node_ip_address() ray.shutdown() diff --git a/python/ray/tests/test_memory_pressure.py b/python/ray/tests/test_memory_pressure.py index d19aa502aac1..421141e87b77 100644 --- a/python/ray/tests/test_memory_pressure.py +++ b/python/ray/tests/test_memory_pressure.py @@ -16,7 +16,7 @@ from ray._private.utils import get_used_memory from ray._private.state_api_test_utils import verify_failed_task -from ray.experimental.state.state_manager import StateDataSourceClient +from ray.util.state.state_manager import StateDataSourceClient memory_usage_threshold = 0.65 diff --git a/python/ray/tests/test_metrics.py b/python/ray/tests/test_metrics.py index 4e629da92441..a9357461d0aa 100644 --- a/python/ray/tests/test_metrics.py +++ b/python/ray/tests/test_metrics.py @@ -100,6 +100,131 @@ def verify(): wait_for_condition(verify) +def get_owner_info(node_ids): + node_addrs = { + n["NodeID"]: (n["NodeManagerAddress"], n["NodeManagerPort"]) + for n in ray.nodes() + } + # Force a global gc to clean up the object store. 
+ ray._private.internal_api.global_gc() + owner_stats = {n: 0 for n in node_ids} + primary_copy_stats = {n: 0 for n in node_ids} + + for node_id in node_ids: + node_stats = ray._private.internal_api.node_stats( + node_addrs[node_id][0], node_addrs[node_id][1], False + ) + owner_stats[node_id] = sum( + [stats.num_owned_objects for stats in node_stats.core_workers_stats] + ) + primary_copy_stats[ + node_id + ] = node_stats.store_stats.num_object_store_primary_copies + + print(owner_stats) + print(node_ids) + owner_stats = [owner_stats.get(node_id, 0) for node_id in node_ids] + primary_copy_stats = [primary_copy_stats.get(node_id, 0) for node_id in node_ids] + print("owner_stats", owner_stats) + print("primary_copy_stats", primary_copy_stats) + + return owner_stats, primary_copy_stats + + +def test_node_object_metrics(ray_start_cluster, monkeypatch): + NUM_NODES = 3 + cluster = ray_start_cluster + for i in range(NUM_NODES): + cluster.add_node(True, resources={f"node_{i}": 1}) + if i == 0: + ray.init(address=cluster.address) + node_ids = [] + + for i in range(NUM_NODES): + + @ray.remote(resources={f"node_{i}": 1}) + def get_node_id(): + return ray.get_runtime_context().get_node_id() + + node_ids.append(ray.get(get_node_id.remote())) + + # Object store stats + # x is owned by node_0 + # x is stored at node_0 + x = ray.put([1]) # noqa: F841 + wait_for_condition(lambda: get_owner_info(node_ids) == ([1, 0, 0], [1, 0, 0])) + + # Test nested with put + @ray.remote(resources={"node_1": 1}) + def big_obj(): + # b is owned by node_1 + # b is stored at node_1 + b = ray.put([1] * 1024 * 1024 * 10) + return b + + # Object store stats + # big_obj is owned by node_0 + # big_obj is stored in memory (no primary copy) + big_obj_ref = big_obj.remote() # noqa: F841 + wait_for_condition(lambda: get_owner_info(node_ids) == ([2, 1, 0], [1, 1, 0])) + + # Test nested with task (small output) + @ray.remote(resources={"node_1": 1}) + def nest_task(s): + @ray.remote(resources={"node_2": 1}) + 
def task(): + return [1] * s + + # t is owned by node_1 + # if s is small, + # then it's is stored in memory of node_1 (no primary copy) + # else it's stored in object store of node_1 + t = task.remote() + return t + + # nest_ref is owned by node_0 + # nest_ref is stored in memory (no primary copy) + nest_ref = nest_task.remote(1) # noqa: F841 + wait_for_condition(lambda: get_owner_info(node_ids) == ([3, 2, 0], [1, 1, 0])) + + big_nest = nest_task.remote(1024 * 1024 * 10) # noqa: F841 + + wait_for_condition(lambda: get_owner_info(node_ids) == ([4, 3, 0], [1, 1, 1])) + + # Test with assigned owned + @ray.remote(resources={"node_2": 0.5}, num_cpus=0) + class A: + def ready(self): + return + + def gen(self): + return ray.put(10) + + # actor is owned by node_0 + # actor is not an object, so no object store copies + actor = A.remote() # noqa: F841 + ray.get(actor.ready.remote()) + # o is owned by actor (node_2) + # o is stored in object store of node_0 + o = ray.put(1, _owner=actor) # noqa: F841 + wait_for_condition(lambda: get_owner_info(node_ids) == ([5, 3, 1], [2, 1, 1])) + + # Test with detached owned + # detached actor is owned by GCS. 
So it's not counted in the owner stats + detached_actor = A.options(lifetime="detached", name="A").remote() + ray.get(detached_actor.ready.remote()) + for i in range(3): + assert get_owner_info(node_ids) == ([5, 3, 1], [2, 1, 1]) + import time + + time.sleep(1) + # gen_obj is owned by node_0 + # the inner object is owned by A (node_2) + # the inner object is stored in object store of node_2 + gen_obj = detached_actor.gen.remote() # noqa: F841 + wait_for_condition(lambda: get_owner_info(node_ids) == ([6, 3, 2], [2, 1, 2])) + + def test_multi_node_metrics_export_port_discovery(ray_start_cluster): NUM_NODES = 3 cluster = ray_start_cluster diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 1a3fc1e62677..06166b5c3941 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -12,7 +12,7 @@ import pytest import ray -from ray.experimental.state.api import list_nodes +from ray.util.state import list_nodes from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter from ray._private.ray_constants import PROMETHEUS_SERVICE_DISCOVERY_FILE from ray._private.test_utils import ( diff --git a/python/ray/tests/test_node_manager.py b/python/ray/tests/test_node_manager.py index 88fcef336742..7e3b1c5b8427 100644 --- a/python/ray/tests/test_node_manager.py +++ b/python/ray/tests/test_node_manager.py @@ -1,5 +1,5 @@ import ray -from ray.experimental.state.api import list_workers +from ray.util.state import list_workers from ray._private.test_utils import ( get_load_metrics_report, run_string_as_driver, @@ -9,7 +9,7 @@ ) import pytest import os -from ray.experimental.state.api import list_objects +from ray.util.state import list_objects import subprocess from ray._private.utils import get_num_cpus import time diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index 3350783effae..b8eecc32bbbc 100644 --- a/python/ray/tests/test_object_manager.py 
+++ b/python/ray/tests/test_object_manager.py @@ -588,7 +588,7 @@ def test_pull_bundle_deadlock(ray_start_cluster): @ray.remote(num_cpus=0) def get_node_id(): - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() worker_node_1_id = ray.get( get_node_id.options(resources={"worker_node_1": 0.1}).remote() diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 50d85d4f9ef1..11fd750bfdfa 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -319,7 +319,7 @@ def test_spill_objects_automatically(fs_only_object_spilling_config, shutdown_on index = random.choice(list(range(buffer_length))) ref = replay_buffer[index] solution = solution_buffer[index] - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, solution) assert_no_thrashing(address["address"]) @@ -359,7 +359,7 @@ def test_unstable_spill_objects_automatically(unstable_spilling_config, shutdown index = random.choice(list(range(buffer_length))) ref = replay_buffer[index] solution = solution_buffer[index] - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, solution) assert_no_thrashing(address["address"]) @@ -397,7 +397,7 @@ def test_slow_spill_objects_automatically(slow_spilling_config, shutdown_only): index = random.choice(list(range(buffer_length))) ref = replay_buffer[index] solution = solution_buffer[index] - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, solution) assert_no_thrashing(address["address"]) diff --git a/python/ray/tests/test_object_spilling_2.py b/python/ray/tests/test_object_spilling_2.py index 9781952574f6..29b3e00ac13c 100644 --- a/python/ray/tests/test_object_spilling_2.py +++ b/python/ray/tests/test_object_spilling_2.py @@ -77,7 +77,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config, 
shutdown_o # Do random sampling. for _ in range(200): ref = random.choice(replay_buffer) - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, arr) # After all, make sure all objects are killed without race condition. @@ -126,7 +126,7 @@ def create_objects(self): # Do random sampling. for _ in range(200): ref = random.choice(self.replay_buffer) - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, arr) a = Actor.remote() @@ -288,7 +288,7 @@ def test_fusion_objects(fs_only_object_spilling_config, shutdown_only): index = random.choice(list(range(buffer_length))) ref = replay_buffer[index] solution = solution_buffer[index] - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, solution) is_test_passing = False diff --git a/python/ray/tests/test_object_spilling_3.py b/python/ray/tests/test_object_spilling_3.py index 0ae983c5f420..b286df27f949 100644 --- a/python/ray/tests/test_object_spilling_3.py +++ b/python/ray/tests/test_object_spilling_3.py @@ -315,7 +315,7 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): if random.randint(0, 9) < 5: for _ in range(5): ref = random.choice(replay_buffer) - sample = ray.get(ref, timeout=0) + sample = ray.get(ref, timeout=None) assert np.array_equal(sample, arr) assert_no_thrashing(address["address"]) diff --git a/python/ray/tests/test_out_of_disk_space.py b/python/ray/tests/test_out_of_disk_space.py index 8ef155e75766..b92d4c80fceb 100644 --- a/python/ray/tests/test_out_of_disk_space.py +++ b/python/ray/tests/test_out_of_disk_space.py @@ -10,7 +10,7 @@ import pytest import ray -from ray.experimental.state.api import list_cluster_events +from ray.util.state import list_cluster_events def calculate_capacity_threshold(disk_capacity_in_bytes): diff --git a/python/ray/tests/test_placement_group_2.py b/python/ray/tests/test_placement_group_2.py index 4356090bc0c4..c9b367e429f9 
100644 --- a/python/ray/tests/test_placement_group_2.py +++ b/python/ray/tests/test_placement_group_2.py @@ -89,6 +89,9 @@ def test_pending_placement_group_wait(ray_start_cluster, connect_to_client): assert len(ready) == 0 table = ray.util.placement_group_table(placement_group) assert table["state"] == "PENDING" + for i in range(3): + assert len(table["bundles_to_node_id"][i]) == 0 + with pytest.raises(ray.exceptions.GetTimeoutError): ray.get(placement_group.ready(), timeout=0.1) @@ -115,11 +118,24 @@ def test_placement_group_wait(ray_start_cluster, connect_to_client): assert len(ready) == 1 table = ray.util.placement_group_table(placement_group) assert table["state"] == "CREATED" - pg = ray.get(placement_group.ready()) assert pg.bundle_specs == placement_group.bundle_specs assert pg.id.binary() == placement_group.id.binary() + @ray.remote + def get_node_id(): + return ray.get_runtime_context().get_node_id() + + for i in range(2): + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_bundle_index=i, + ) + node_id = ray.get( + get_node_id.options(scheduling_strategy=scheduling_strategy).remote() + ) + assert node_id == table["bundles_to_node_id"][i] + @pytest.mark.parametrize("connect_to_client", [False, True]) def test_schedule_placement_group_when_node_add(ray_start_cluster, connect_to_client): diff --git a/python/ray/tests/test_placement_group_3.py b/python/ray/tests/test_placement_group_3.py index f43ddeb9809e..fdde1d874136 100644 --- a/python/ray/tests/test_placement_group_3.py +++ b/python/ray/tests/test_placement_group_3.py @@ -458,6 +458,19 @@ def f(): assert len(gpu_ids_res) == 2 +@pytest.mark.parametrize( + "ray_start_cluster", + [ + generate_system_config_map( + use_ray_syncer=True, + ), + generate_system_config_map( + use_ray_syncer=False, + ), + ], + indirect=True, +) +@pytest.mark.repeat(3) def test_actor_scheduling_not_block_with_placement_group(ray_start_cluster): """Tests the scheduling of 
lots of actors will not be blocked when using placement groups. diff --git a/python/ray/tests/test_placement_group_failover.py b/python/ray/tests/test_placement_group_failover.py index 3bbe88536443..b8a7841eec48 100755 --- a/python/ray/tests/test_placement_group_failover.py +++ b/python/ray/tests/test_placement_group_failover.py @@ -2,9 +2,7 @@ import sys import ray import ray.cluster_utils -from ray._private.test_utils import ( - get_other_nodes, -) +from ray._private.test_utils import get_other_nodes, wait_for_condition MB = 1024 * 1024 @@ -58,5 +56,72 @@ def test_placement_group_failover_when_two_nodes_die(monkeypatch, ray_start_clus ray.get(object_ref, timeout=5) +def test_gcs_restart_when_placement_group_failover( + ray_start_cluster_head_with_external_redis, +): + @ray.remote(num_cpus=1) + class Actor(object): + def __init__(self): + self.n = 0 + + def value(self): + return self.n + + cluster = ray_start_cluster_head_with_external_redis + num_nodes = 3 + nodes = [] + for _ in range(num_nodes - 1): + nodes.append(cluster.add_node(num_cpus=1)) + + # Make sure the placement group is ready. + bundles = [{"CPU": 1, "memory": 100 * MB} for _ in range(num_nodes)] + placement_group = ray.util.placement_group( + name="name", strategy="STRICT_SPREAD", bundles=bundles + ) + assert placement_group.wait(5000) + actors = [] + for i in range(num_nodes): + actor = Actor.options( + placement_group=placement_group, + placement_group_bundle_index=i, + max_restarts=-1, + ).remote() + object_ref = actor.value.remote() + ray.get(object_ref, timeout=5) + actors.append(actor) + + # Simulate a node dead. + other_nodes = get_other_nodes(cluster, exclude_head=True) + cluster.remove_node(other_nodes[0]) + + # Make sure placement group state change to rescheduling. 
+ def _check_pg_whether_be_reschedule(): + table = ray.util.placement_group_table(placement_group) + return table["state"] == "RESCHEDULING" + + wait_for_condition( + _check_pg_whether_be_reschedule, timeout=5, retry_interval_ms=1000 + ) + + # Simulate gcs restart. + cluster.head_node.kill_gcs_server() + cluster.head_node.start_gcs_server() + + cluster.add_node(num_cpus=1) + cluster.wait_for_nodes() + + # Check placement gorup reschedule success after gcs server restart. + def _check_actor_with_pg_is_ready(): + try: + for actor in actors: + object_ref = actor.value.remote() + ray.get(object_ref, timeout=5) + return True + except Exception: + return False + + wait_for_condition(_check_actor_with_pg_is_ready, timeout=5, retry_interval_ms=1000) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_ray_init_2.py b/python/ray/tests/test_ray_init_2.py index ddf0e66782b2..3f37dd701308 100644 --- a/python/ray/tests/test_ray_init_2.py +++ b/python/ray/tests/test_ray_init_2.py @@ -294,7 +294,7 @@ def test_ray_init_from_workers(ray_start_cluster): node_info = ray._private.services.get_node_to_connect_for_driver( cluster.gcs_address, "127.0.0.3" ) - assert node_info.node_manager_port == node2.node_manager_port + assert node_info["node_manager_port"] == node2.node_manager_port def test_default_resource_not_allowed_error(shutdown_only): diff --git a/python/ray/tests/test_runtime_env.py b/python/ray/tests/test_runtime_env.py index 74eb3d51fb9e..fdb05f6f79eb 100644 --- a/python/ray/tests/test_runtime_env.py +++ b/python/ray/tests/test_runtime_env.py @@ -219,7 +219,7 @@ def test_container_option_serialize(runtime_env_class): container={"image": "ray:latest", "run_options": ["--name=test"]} ) job_config = ray.job_config.JobConfig(runtime_env=runtime_env) - job_config_serialized = job_config.serialize() + job_config_serialized = job_config._serialize() # job_config_serialized is JobConfig protobuf serialized string, # 
job_config.runtime_env_info.serialized_runtime_env # has container_option info diff --git a/python/ray/tests/test_runtime_env_env_vars.py b/python/ray/tests/test_runtime_env_env_vars.py index 69daa429341b..43dd6ff56f57 100644 --- a/python/ray/tests/test_runtime_env_env_vars.py +++ b/python/ray/tests/test_runtime_env_env_vars.py @@ -300,6 +300,25 @@ def get_options(val): assert pid7 == pid1 +def test_appendable_environ(ray_start_regular): + @ray.remote + def get_env(key): + return os.environ.get(key) + + custom_env = os.path.pathsep + "/usr/local/bin" + remote_env = ray.get( + get_env.options( + runtime_env={ + "env_vars": { + "PATH": "${PATH}" + custom_env, + } + } + ).remote("PATH") + ) + assert remote_env.endswith(custom_env) + assert len(remote_env) > len(custom_env) + + if __name__ == "__main__": import pytest diff --git a/python/ray/tests/test_runtime_env_ray_minimal.py b/python/ray/tests/test_runtime_env_ray_minimal.py index 82d7e77b62b8..2b3d68e51286 100644 --- a/python/ray/tests/test_runtime_env_ray_minimal.py +++ b/python/ray/tests/test_runtime_env_ray_minimal.py @@ -14,15 +14,16 @@ import pytest import ray +from ray.exceptions import RuntimeEnvSetupError -def _test_task_and_actor(capsys): +def _test_task_and_actor(): @ray.remote def f(): return 1 - # with pytest.raises(RuntimeEnvSetupError): - assert ray.get(f.options(runtime_env={"pip": ["requests"]}).remote()) == 1 + with pytest.raises(RuntimeEnvSetupError, match="install virtualenv"): + ray.get(f.options(runtime_env={"pip": ["requests"]}).remote()) @ray.remote class A: @@ -30,7 +31,9 @@ def task(self): return 1 a = A.options(runtime_env={"pip": ["requests"]}).remote() - assert ray.get(a.task.remote()) == 1 + + with pytest.raises(RuntimeEnvSetupError, match="install virtualenv"): + ray.get(a.task.remote()) @pytest.mark.skipif( @@ -45,9 +48,9 @@ def task(self): ["ray start --head --ray-client-server-port 25553 --port 0"], indirect=True, ) -def test_ray_client_task_actor(call_ray_start, capsys): +def 
test_ray_client_task_actor(call_ray_start): ray.init("ray://localhost:25553") - _test_task_and_actor(capsys) + _test_task_and_actor() @pytest.mark.skipif( @@ -57,9 +60,9 @@ def test_ray_client_task_actor(call_ray_start, capsys): os.environ.get("RAY_MINIMAL") != "1", reason="This test is only run in CI with a minimal Ray installation.", ) -def test_task_actor(shutdown_only, capsys): +def test_task_actor(shutdown_only): ray.init() - _test_task_and_actor(capsys) + _test_task_and_actor() @pytest.mark.skipif( @@ -69,14 +72,15 @@ def test_task_actor(shutdown_only, capsys): os.environ.get("RAY_MINIMAL") != "1", reason="This test is only run in CI with a minimal Ray installation.", ) -def test_ray_init(shutdown_only, capsys): +def test_ray_init(shutdown_only): ray.init(runtime_env={"pip": ["requests"]}) @ray.remote def f(): return 1 - assert ray.get(f.remote()) == 1 + with pytest.raises(RuntimeEnvSetupError, match="install virtualenv"): + ray.get(f.remote()) @pytest.mark.skipif( @@ -92,7 +96,9 @@ def f(): indirect=True, ) def test_ray_client_init(call_ray_start): - ray.init("ray://localhost:25552", runtime_env={"pip": ["requests"]}) + with pytest.raises(ConnectionAbortedError) as excinfo: + ray.init("ray://localhost:25552", runtime_env={"pip": ["requests"]}) + assert "install virtualenv" in str(excinfo.value) if __name__ == "__main__": diff --git a/python/ray/tests/test_runtime_env_setup_func.py b/python/ray/tests/test_runtime_env_setup_func.py new file mode 100644 index 000000000000..32c0cf88bb07 --- /dev/null +++ b/python/ray/tests/test_runtime_env_setup_func.py @@ -0,0 +1,152 @@ +import threading +import os +import sys +import logging + +import pytest + +import ray + + +def test_setup_func_basic(shutdown_only): + def configure_logging(level: int): + logger = logging.getLogger("") + logger.setLevel(level) + + ray.init( + num_cpus=1, + runtime_env={ + "worker_setup_hook": lambda: configure_logging(logging.DEBUG), + "env_vars": {"ABC": "123"}, + }, + ) + + @ray.remote + 
def f(level): + logger = logging.getLogger("") + assert logging.getLevelName(logger.getEffectiveLevel()) == level + return True + + @ray.remote + class Actor: + def __init__(self, level): + logger = logging.getLogger("") + assert logging.getLevelName(logger.getEffectiveLevel()) == level + + def ready(self): + return True + + def get_env_var(self, key): + return os.getenv(key) + + # Test basic. + for _ in range(10): + assert ray.get(f.remote("DEBUG")) + a = Actor.remote("DEBUG") + assert ray.get(a.__ray_ready__.remote()) + + # Make sure env var is not overwritten. + assert ray.get(a.get_env_var.remote("ABC")) == "123" + + # Test override. + # TODO(sang) + # ray.get( + # f.options( + # runtime_env={ + # "worker_setup_hook": lambda: configure_logging(logging.INFO)} + # ).remote("INFO")) + # a = Actor.optinos( + # runtime_env={"worker_setup_hook": lambda: configure_logging(logging.INFO)} + # ).remote("INFO") + # assert ray.get(a.__ray_ready__.remote()) + + +def test_setup_func_failure(shutdown_only): + """ + Verify when deserilization failed, it raises an exception. + """ + + class CustomClass: + """ + Custom class that can serialize but canont deserialize. + It is used to test deserialization failure. + """ + + def __getstate__(self): + # This method is called during serialization + return self.__dict__ + + def __setstate__(self, state): + # This method is called during deserialization + raise RuntimeError("Deserialization not allowed") + + c = CustomClass() + + def setup(): + print(c) + + ray.init( + num_cpus=1, + runtime_env={ + "worker_setup_hook": setup, + }, + ) + + @ray.remote + class A: + pass + + a = A.remote() + # TODO(sang): Maybe we should raise RuntimeEnvSetupError? + # It is pretty difficult now. 
See + # https://github.com/ray-project/ray/pull/34738#discussion_r1189553716 + with pytest.raises(ray.exceptions.RayActorError) as e: + ray.get(a.__ray_ready__.remote()) + assert "Deserialization not allowed" in str(e.value) + + """ + Verify when the serialization fails, ray.init fails. + """ + ray.shutdown() + lock = threading.Lock() + + with pytest.raises(ray.exceptions.RuntimeEnvSetupError) as e: + ray.init( + num_cpus=0, + runtime_env={ + "worker_setup_hook": lambda: print(lock), + }, + ) + assert "Failed to export the setup function." in str(e.value) + + """ + Verify when the setup hook failed, it raises an exception. + """ + ray.shutdown() + + def setup_func(): + raise ValueError("Setup Failed") + + ray.init( + num_cpus=1, + runtime_env={ + "worker_setup_hook": setup_func, + }, + ) + + @ray.remote + class A: + pass + + a = A.remote() + with pytest.raises(ray.exceptions.RayActorError) as e: + ray.get(a.__ray_ready__.remote()) + assert "Setup Failed" in str(e.value) + assert "Failed to execute the setup hook method." 
in str(e.value) + + +if __name__ == "__main__": + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_runtime_env_working_dir_remote_uri.py b/python/ray/tests/test_runtime_env_working_dir_remote_uri.py index 6494dd96c33c..9f5a59cbd855 100644 --- a/python/ray/tests/test_runtime_env_working_dir_remote_uri.py +++ b/python/ray/tests/test_runtime_env_working_dir_remote_uri.py @@ -97,7 +97,7 @@ def check_and_get_node_id(self): import test_module test_module.one() - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() num_cpus = int(ray.available_resources()["CPU"]) actors = [A.remote() for _ in range(num_cpus)] diff --git a/python/ray/tests/test_scheduling_2.py b/python/ray/tests/test_scheduling_2.py index b62edf36c6f6..0dcdde5c2ecd 100644 --- a/python/ray/tests/test_scheduling_2.py +++ b/python/ray/tests/test_scheduling_2.py @@ -568,7 +568,7 @@ def test_demand_report_for_node_affinity_scheduling_strategy( @ray.remote(num_cpus=1) def f(sleep_s): time.sleep(sleep_s) - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() worker_node_id = ray.get(f.remote(0)) @@ -713,13 +713,13 @@ def test_data_locality_spilled_objects( def f(): return ( np.zeros(50 * 1024 * 1024, dtype=np.uint8), - ray.runtime_context.get_runtime_context().node_id, + ray.runtime_context.get_runtime_context().get_node_id(), ) @ray.remote def check_locality(x): _, node_id = x - assert node_id == ray.runtime_context.get_runtime_context().node_id + assert node_id == ray.runtime_context.get_runtime_context().get_node_id() # Check locality works when dependent task is already submitted by the time # the upstream task finishes. 
diff --git a/python/ray/tests/test_state_api.py b/python/ray/tests/test_state_api.py index 64882421c856..c154ef9f52cf 100644 --- a/python/ray/tests/test_state_api.py +++ b/python/ray/tests/test_state_api.py @@ -8,6 +8,9 @@ from unittest.mock import MagicMock import pytest +from ray.util.state import get_job +from ray.dashboard.modules.job.pydantic_models import JobDetails +from ray.util.state.common import Humanify from ray._private.gcs_utils import GcsAioClient import yaml from click.testing import CliRunner @@ -62,7 +65,7 @@ StateAPIManager, _convert_filters_type, ) -from ray.experimental.state.api import ( +from ray.util.state import ( get_actor, get_node, get_objects, @@ -84,7 +87,7 @@ StateApiClient, ) from ray._private.event.event_logger import get_event_id -from ray.experimental.state.common import ( +from ray.util.state.common import ( DEFAULT_LIMIT, DEFAULT_RPC_TIMEOUT, ActorState, @@ -102,16 +105,16 @@ state_column, ) from ray.dashboard.utils import ray_address_to_api_server_url -from ray.experimental.state.exception import DataSourceUnavailable, RayStateApiException -from ray.experimental.state.state_cli import ( +from ray.util.state.exception import DataSourceUnavailable, RayStateApiException +from ray.util.state.state_cli import ( AvailableFormat, format_list_api_output, _parse_filter, summary_state_cli_group, ) -from ray.experimental.state.state_cli import ray_get -from ray.experimental.state.state_cli import ray_list -from ray.experimental.state.state_manager import IdToIpMap, StateDataSourceClient +from ray.util.state.state_cli import ray_get +from ray.util.state.state_cli import ray_list +from ray.util.state.state_manager import IdToIpMap, StateDataSourceClient from ray.job_submission import JobSubmissionClient from ray.runtime_env import RuntimeEnv @@ -915,7 +918,7 @@ async def test_api_manager_list_tasks(state_api_manager): ] result = await state_api_manager.list_tasks(option=create_api_options()) 
data_source_client.get_all_task_info.assert_any_await( - timeout=DEFAULT_RPC_TIMEOUT, job_id=None, exclude_driver=True + timeout=DEFAULT_RPC_TIMEOUT, filters=[], exclude_driver=True ) data = result.result data = data @@ -1533,8 +1536,14 @@ async def test_state_data_source_client(ray_start_cluster): entrypoint="ls", ) result = await client.get_job_info() - assert list(result.keys())[0] == job_id - assert isinstance(result, dict) + assert isinstance(result[0], JobDetails) + found_job = False + for job in result: + if job.type != "DRIVER": + assert job.submission_id == job_id + found_job = True + assert found_job, result + assert isinstance(result, list) """ Test tasks @@ -1696,6 +1705,18 @@ def ready(self): assert result.total == 6 +def test_humanify(): + raw_bytes = 1024 + assert Humanify.memory(raw_bytes) == "1.000 KiB" + raw_bytes *= 1024 + assert Humanify.memory(raw_bytes) == "1.000 MiB" + raw_bytes *= 1024 + assert Humanify.memory(raw_bytes) == "1.000 GiB" + timestamp = 1610000000 + assert "1970-01" in Humanify.timestamp(timestamp) + assert Humanify.duration(timestamp) == "18 days, 15:13:20" + + @pytest.mark.asyncio async def test_state_data_source_client_limit_distributed_sources(ray_start_cluster): cluster = ray_start_cluster @@ -2172,8 +2193,9 @@ def verify(): sys.platform == "win32", reason="Failed on Windows", ) -def test_list_jobs(shutdown_only): +def test_list_get_jobs(shutdown_only): ray.init() + # Test submission job client = JobSubmissionClient( f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}" ) @@ -2185,13 +2207,50 @@ def test_list_jobs(shutdown_only): def verify(): job_data = list_jobs()[0] print(job_data) - job_id_from_api = job_data["job_id"] - correct_state = job_data["status"] == "SUCCEEDED" - correct_id = job_id == job_id_from_api - return correct_state and correct_id + job_id_from_api = job_data["submission_id"] + assert job_data["status"] == "SUCCEEDED" + assert job_id == job_id_from_api + return True + + 
wait_for_condition(verify) + + # Test driver jobs + script = """ + +import ray + +ray.init("auto") + +@ray.remote +def f(): + pass + +ray.get(f.remote()) +""" + run_string_as_driver(script) + + def verify(): + jobs = list_jobs(filters=[("type", "=", "DRIVER")]) + assert len(jobs) == 2, "1 test driver + 1 script run above" + for driver_job in jobs: + assert driver_job["driver_info"] is not None + + sub_jobs = list_jobs(filters=[("type", "=", "SUBMISSION")]) + assert len(sub_jobs) == 1 + assert sub_jobs[0]["submission_id"] is not None + return True + + wait_for_condition(verify) + + # Test GET api + def verify(): + job = get_job(id=job_id) + assert job["submission_id"] == job_id + assert job["entrypoint"] == "ls" + assert job["status"] == "SUCCEEDED" + return True wait_for_condition(verify) - print(list_jobs()) @pytest.mark.skipif( @@ -2282,7 +2341,7 @@ def g(dep): def impossible(): pass - out = [f.remote() for _ in range(2)] # noqa + out = [f.options(name=f"f_{i}").remote() for i in range(2)] # noqa g_out = g.remote(f.remote()) # noqa im = impossible.remote() # noqa @@ -2350,6 +2409,9 @@ def verify(): for task in tasks: assert task["job_id"] == job_id + tasks = list_tasks(filters=[("name", "=", "f_0")]) + assert len(tasks) == 1 + return True wait_for_condition(verify) @@ -2540,7 +2602,6 @@ def verify(): for task in tasks: assert task["job_id"] == job_id for task in tasks: - print(task) assert task["actor_id"] == actor_id # Actor.__init__: 1 finished # Actor.call: 1 running, 9 waiting for execution (queued). @@ -2590,6 +2651,10 @@ def verify(): == 1 ) + # Filters with actor id. 
+ assert len(list_tasks(filters=[("actor_id", "=", actor_id)])) == 11 + assert len(list_tasks(filters=[("actor_id", "!=", actor_id)])) == 0 + return True wait_for_condition(verify) diff --git a/python/ray/tests/test_state_api_2.py b/python/ray/tests/test_state_api_2.py index 9cf126870fae..cbb654565aac 100644 --- a/python/ray/tests/test_state_api_2.py +++ b/python/ray/tests/test_state_api_2.py @@ -9,7 +9,7 @@ import pytest from ray._private.profiling import chrome_tracing_dump -from ray.experimental.state.api import ( +from ray.util.state import ( get_actor, list_tasks, list_actors, @@ -252,6 +252,92 @@ def get_actor(self, name): wait_for_condition(_verify_repr_name, id=a._actor_id.hex(), name="inner") +def test_experimental_import_deprecation(): + with pytest.warns(DeprecationWarning): + from ray.experimental.state.api import list_tasks # noqa: F401 + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.common import DEFAULT_RPC_TIMEOUT # noqa: F401 + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.custom_types import ACTOR_STATUS # noqa: F401 + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.exception import RayStateApiException # noqa: F401 + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.state_cli import ray_get # noqa: F401 + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.state_manager import ( # noqa: F401 + StateDataSourceClient, + ) + + with pytest.warns(DeprecationWarning): + from ray.experimental.state.util import convert_string_to_type # noqa: F401 + + +def test_actor_task_with_repr_name(): + @ray.remote + class ReprActor: + def __init__(self, x) -> None: + self.x = x + + def __repr__(self) -> str: + return self.x + + def f(self): + pass + + a = ReprActor.remote(x="repr-name-a") + ray.get(a.f.remote()) + + def verify(): + tasks = list_tasks(detail=True, filters=[("type", "=", "ACTOR_TASK")]) + assert len(tasks) == 1, tasks + assert tasks[0].name 
== "repr-name-a.f" + assert tasks[0].func_or_class_name == "ReprActor.f" + return True + + wait_for_condition(verify) + + b = ReprActor.remote(x="repr-name-b") + ray.get(b.f.options(name="custom-name").remote()) + + def verify(): + tasks = list_tasks( + detail=True, + filters=[("actor_id", "=", b._actor_id.hex()), ("type", "=", "ACTOR_TASK")], + ) + assert len(tasks) == 1, tasks + assert tasks[0].name == "custom-name" + assert tasks[0].func_or_class_name == "ReprActor.f" + return True + + wait_for_condition(verify) + + @ray.remote + class Actor: + def f(self): + pass + + c = Actor.remote() + ray.get(c.f.remote()) + + def verify(): + tasks = list_tasks( + detail=True, + filters=[("actor_id", "=", c._actor_id.hex()), ("type", "=", "ACTOR_TASK")], + ) + + assert len(tasks) == 1, tasks + assert tasks[0].name == "Actor.f" + assert tasks[0].func_or_class_name == "Actor.f" + return True + + wait_for_condition(verify) + + if __name__ == "__main__": import sys diff --git a/python/ray/tests/test_state_api_log.py b/python/ray/tests/test_state_api_log.py index d821799982f2..f484f943e6b9 100644 --- a/python/ray/tests/test_state_api_log.py +++ b/python/ray/tests/test_state_api_log.py @@ -1,13 +1,19 @@ import json import os import sys +import asyncio from typing import List +import urllib from unittest.mock import MagicMock import pytest -from ray.experimental.state.state_cli import logs_state_cli_group +from ray.util.state.state_cli import logs_state_cli_group +from ray.util.state import list_jobs import requests from click.testing import CliRunner +import grpc + +from pathlib import Path import ray from ray._private.test_utils import ( @@ -15,18 +21,35 @@ wait_for_condition, wait_until_server_available, ) + +from ray._private.ray_constants import ( + LOG_PREFIX_TASK_ATTEMPT_START, + LOG_PREFIX_TASK_ATTEMPT_END, +) from ray._raylet import ActorID, NodeID, TaskID, WorkerID from ray.core.generated.common_pb2 import Address -from ray.core.generated.gcs_pb2 import ActorTableData 
+from ray.core.generated.gcs_service_pb2 import GetTaskEventsReply from ray.core.generated.reporter_pb2 import ListLogsReply, StreamLogReply +from ray.core.generated.gcs_pb2 import ( + ActorTableData, + TaskEvents, + TaskStateUpdate, +) from ray.dashboard.modules.actor.actor_head import actor_table_data_to_dict -from ray.dashboard.modules.log.log_agent import tail as tail_file +from ray.dashboard.modules.log.log_agent import ( + find_offset_of_content_in_file, + find_end_offset_file, + find_end_offset_next_n_lines_from_offset, + find_start_offset_last_n_lines_from_offset, + LogAgentV1Grpc, +) +from ray.dashboard.modules.log.log_agent import _stream_log_in_chunk from ray.dashboard.modules.log.log_manager import LogsManager from ray.dashboard.tests.conftest import * # noqa -from ray.experimental.state.api import get_log, list_logs, list_nodes, list_workers -from ray.experimental.state.common import GetLogOptions -from ray.experimental.state.exception import DataSourceUnavailable -from ray.experimental.state.state_manager import StateDataSourceClient +from ray.util.state import get_log, list_logs, list_nodes, list_workers +from ray.util.state.common import GetLogOptions +from ray.util.state.exception import DataSourceUnavailable +from ray.util.state.state_manager import StateDataSourceClient if sys.version_info >= (3, 8, 0): from unittest.mock import AsyncMock @@ -37,6 +60,19 @@ ASYNCMOCK_MIN_PYTHON_VER = (3, 8) +def generate_task_event(task_id, node_id, attempt_number, worker_id): + task_event = TaskEvents( + task_id=task_id.binary(), + attempt_number=attempt_number, + job_id=b"", + state_updates=TaskStateUpdate( + node_id=node_id.binary(), worker_id=worker_id.binary() + ), + ) + + return task_event + + def generate_actor_data(id, node_id, worker_id): if worker_id: worker_id = worker_id.binary() @@ -57,34 +93,307 @@ def generate_actor_data(id, node_id, worker_id): # Unit Tests (Log Agent) - - -@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect 
on Windows.") -def test_logs_tail(): +def _read_file(fp, start, end): + """Help func to read a file with offsets""" + fp.seek(start, 0) + if end == -1: + return fp.read() + return fp.read(end - start) + + +async def _stream_log(context, fp, start, end): + """Help func to stream a log with offsets""" + result = bytearray() + async for chunk_res in _stream_log_in_chunk( + context=context, + file=fp, + start_offset=start, + end_offset=end, + keep_alive_interval_sec=-1, + ): + result += chunk_res.data + return result + + +TEST_LINE_TEMPLATE = "{}-test-line" + + +def _write_lines_and_get_offset_at_index( + f, num_lines, start_offset=0, trailing_new_line=True +): """ - Unit test for tail + Write multiple lines into a file, and record offsets + + Args: + f: a binary file object that's writable + num_lines: Number of lines to write + start_offset: The offset to start writing + trailing_new_line: True if a '\n' is added at the end of the + lines. + + Return: + offsets: A list of offsets of the lines. + offset_end: The offset of the end of file. 
""" - TOTAL_LINES = 1000 - FILE_NAME = "test_file.txt" - try: - with open(FILE_NAME, "w") as f: - for i in range(TOTAL_LINES): - # Check this works with unicode - f.write(f"Message 日志 {i:4}\n") - file = open(FILE_NAME, "rb") - text, byte_pos = tail_file(file, 100) - assert byte_pos == TOTAL_LINES * len( - "Message 日志 1000\n".encode(encoding="utf-8") + f.seek(start_offset, 0) + + offsets = [] + for i in range(num_lines): + offsets.append(f.tell()) + if i == num_lines - 1 and not trailing_new_line: + # Last line no newline + line = TEST_LINE_TEMPLATE.format(i) + else: + line = TEST_LINE_TEMPLATE.format(i) + "\n" + f.write(line.encode("utf-8")) + + f.flush() + f.seek(0, 2) + offset_end = f.tell() + + return offsets, offset_end + + +@pytest.mark.parametrize("new_line", [True, False]) +@pytest.mark.parametrize("block_size", [4, 16, 256]) +def test_find_start_offset_last_n_lines_from_offset(new_line, temp_file, block_size): + file = temp_file + o, end_file = _write_lines_and_get_offset_at_index( + file, num_lines=50, start_offset=0, trailing_new_line=new_line + ) + # Test the function with different offsets and number of lines to find + assert find_start_offset_last_n_lines_from_offset(file, o[3], 1, block_size) == o[2] + assert ( + find_start_offset_last_n_lines_from_offset(file, o[10], 10, block_size) == o[0] + ) + + # Test end of file last 1 line + assert find_start_offset_last_n_lines_from_offset(file, -1, 1, block_size) == o[-1] + + # Test end of file no line + assert ( + find_start_offset_last_n_lines_from_offset(file, -1, 0, block_size) == end_file + ) + + # Test no line from middle of file + assert ( + find_start_offset_last_n_lines_from_offset(file, o[30], 0, block_size) == o[30] + ) + + # Test more lines than file + assert ( + find_start_offset_last_n_lines_from_offset(file, o[30], 100, block_size) == o[0] + ) + + # Test offsets in the middle of a line + assert ( + find_start_offset_last_n_lines_from_offset(file, o[2] + 1, 1, block_size) + == o[2] + ) + assert 
( + find_start_offset_last_n_lines_from_offset(file, o[2] - 1, 1, block_size) + == o[1] + ) + + +def test_find_end_offset_next_n_lines_from_offset(temp_file): + file = temp_file + o, end_file = _write_lines_and_get_offset_at_index( + file, num_lines=10, start_offset=0 + ) + # Test the function with different offsets and number of lines to find + assert find_end_offset_next_n_lines_from_offset(file, o[3], 1) == o[4] + assert find_end_offset_next_n_lines_from_offset(file, o[3], 2) == o[5] + assert find_end_offset_next_n_lines_from_offset(file, 0, 1) == o[1] + + # Test end of file + assert find_end_offset_next_n_lines_from_offset(file, o[3], 999) == end_file + + # Test offset diff + assert find_end_offset_next_n_lines_from_offset(file, 1, 1) == o[1] + assert find_end_offset_next_n_lines_from_offset(file, o[1] - 1, 1) == o[1] + + +def test_find_offset_of_content_in_file(temp_file): + file = temp_file + o, end_file = _write_lines_and_get_offset_at_index(file, num_lines=10) + + assert ( + find_offset_of_content_in_file( + file, TEST_LINE_TEMPLATE.format(0).encode("utf-8") + ) + == o[0] + ) + + assert ( + find_offset_of_content_in_file( + file, TEST_LINE_TEMPLATE.format(3).encode("utf-8"), o[1] + 1 + ) + == o[3] + ) + + assert ( + find_offset_of_content_in_file( + file, TEST_LINE_TEMPLATE.format(4).encode("utf-8"), o[1] - 1 + ) + == o[4] + ) + + # Not found + assert ( + find_offset_of_content_in_file( + file, TEST_LINE_TEMPLATE.format(1000).encode("utf-8"), o[1] - 1 ) - lines = text.decode("utf-8").split("\n") - assert len(lines) == 100 - assert lines[0] == "Message 日志 900" - assert lines[99] == "Message 日志 999" - except Exception as e: - raise e - finally: - if os.path.exists(FILE_NAME): - os.remove(FILE_NAME) + == -1 + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("random_ascii_file", [1 << 20], indirect=True) +@pytest.mark.parametrize( + "start_offset,end_offset", + [ + (0, 1 << 20), + (1 << 20, 1 << 20), + (0, 0), + (0, 1), + (1 << 16, 1 << 20), + (1024, 
2042), + ], +) +async def test_stream_log_in_chunk(random_ascii_file, start_offset, end_offset): + """Test streaming of a file from different offsets""" + test_file = random_ascii_file + context = MagicMock(grpc.aio.ServicerContext) + context.done.return_value = False + + expected_file_content = _read_file(test_file, start_offset, end_offset) + actual_log_content = await _stream_log(context, test_file, start_offset, end_offset) + + assert ( + expected_file_content == actual_log_content + ), "Non-matching content from log streamed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "lines_to_tail,total_lines", + [(0, 100), (100, 100), (10, 100), (1, 100), (99, 100)], +) +@pytest.mark.parametrize("trailing_new_line", [True, False]) +async def test_log_tails(lines_to_tail, total_lines, trailing_new_line, temp_file): + """Test tailing a file works""" + _write_lines_and_get_offset_at_index( + temp_file, + total_lines, + trailing_new_line=trailing_new_line, + ) + test_file = temp_file + context = MagicMock(grpc.aio.ServicerContext) + context.done.return_value = False + start_offset = find_start_offset_last_n_lines_from_offset( + test_file, offset=-1, n=lines_to_tail + ) + + actual_data = await _stream_log(context, test_file, start_offset, -1) + expected_data = _read_file(test_file, start_offset, -1) + + assert actual_data == expected_data, "Non-matching data from stream log" + + all_lines = actual_data.decode("utf-8") + assert all_lines.count("\n") == ( + lines_to_tail if trailing_new_line or lines_to_tail == 0 else lines_to_tail - 1 + ), "Non-matching number of lines tailed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "lines_to_tail,total_lines", + [(0, 5), (5, 5), (2, 5), (1, 5), (4, 5)], +) +async def test_log_tails_with_appends(lines_to_tail, total_lines, temp_file): + """Test tailing a log file that grows at the same time""" + _write_lines_and_get_offset_at_index(temp_file, total_lines) + test_file = temp_file + context = 
MagicMock(grpc.aio.ServicerContext) + context.done.return_value = False + start_offset = find_start_offset_last_n_lines_from_offset( + test_file, offset=-1, n=lines_to_tail + ) + + actual_data = await _stream_log(context, test_file, start_offset, -1) + + end_offset = find_end_offset_file(test_file) + expected_data = _read_file(test_file, start_offset, end_offset) + assert actual_data == expected_data, "Non-matching data from stream log" + + all_lines = actual_data.decode("utf-8") + assert all_lines.count("\n") == lines_to_tail, "Non-matching number of lines tailed" + + # Modify the file with append here + num_new_lines = 2 + _write_lines_and_get_offset_at_index( + temp_file, num_new_lines, start_offset=end_offset + ) + + # Tail again should read the new lines written + start_offset = find_start_offset_last_n_lines_from_offset( + test_file, offset=-1, n=lines_to_tail + num_new_lines + ) + + expected_data = _read_file(test_file, start_offset, -1) + actual_data = await _stream_log(context, test_file, start_offset, -1) + + assert ( + actual_data == expected_data + ), "Non-matching data from stream log after append" + + all_lines = actual_data.decode("utf-8") + assert ( + all_lines.count("\n") == lines_to_tail + num_new_lines + ), "Non-matching number of lines tailed after append" + + +@pytest.mark.asyncio +async def test_log_agent_find_task_log_offsets(temp_file): + log_file_content = "" + task_id = "taskid1234" + attempt_number = 0 + # Previous data + for i in range(3): + log_file_content += TEST_LINE_TEMPLATE.format(i) + "\n" + # Task's logs + log_file_content += f"{LOG_PREFIX_TASK_ATTEMPT_START}{task_id}-{attempt_number}\n" + expected_start = len(log_file_content) + for i in range(10): + log_file_content += TEST_LINE_TEMPLATE.format(i) + "\n" + expected_end = len(log_file_content) + log_file_content += f"{LOG_PREFIX_TASK_ATTEMPT_END}{task_id}-{attempt_number}\n" + + # Next data + for i in range(3): + log_file_content += TEST_LINE_TEMPLATE.format(i) + "\n" + + # 
Write to files + temp_file.write(log_file_content.encode("utf-8")) + + # Test all task logs + start_offset, end_offset = await LogAgentV1Grpc._find_task_log_offsets( + task_id, attempt_number, -1, temp_file + ) + assert start_offset == expected_start + assert end_offset == expected_end + + # Test tailing last X lines + num_tail = 3 + start_offset, end_offset = await LogAgentV1Grpc._find_task_log_offsets( + task_id, attempt_number, num_tail, temp_file + ) + assert end_offset == expected_end + exclude_tail_content = "" + for i in range(10 - num_tail): + exclude_tail_content += TEST_LINE_TEMPLATE.format(i) + "\n" + assert start_offset == expected_start + len(exclude_tail_content) # Unit Tests (LogsManager) @@ -241,17 +550,33 @@ def get_actor_fn(id): """ Test task id is given. """ - with pytest.raises(NotImplementedError): - task_id = TaskID(b"2" * 24) - log_file_name, n = await logs_manager.resolve_filename( - node_id=node_id.hex(), - log_filename=None, - actor_id=None, - task_id=task_id, - pid=None, - get_actor_fn=lambda _: generate_actor_data(actor_id, node_id, worker_id), - timeout=10, - ) + task_id = TaskID(b"2" * 24) + logs_client = logs_manager.data_source_client + logs_client.get_all_task_info = AsyncMock() + logs_client.get_all_task_info.return_value = GetTaskEventsReply( + events_by_task=[ + generate_task_event(task_id, node_id, attempt_number=1, worker_id=worker_id) + ] + ) + logs_manager.list_logs.return_value = { + "worker_out": [f"worker-{worker_id.hex()}-123-123.out"], + "worker_err": [], + } + + # Expect resolved file. + filename, n = await logs_manager.resolve_filename(task_id=task_id, attempt_number=1) + # Default out file. 
See generate_task_event() for filename + assert filename == f"worker-{worker_id.hex()}-123-123.out" + assert n == node_id.hex() + + # Wrong task attempt + with pytest.raises(FileNotFoundError): + await logs_manager.resolve_filename(task_id=task_id, attempt_number=0) + + # No task found + logs_client.get_all_task_info.return_value = GetTaskEventsReply(events_by_task=[]) + with pytest.raises(FileNotFoundError): + await logs_manager.resolve_filename(task_id=TaskID(b"1" * 24), attempt_number=1) """ Test pid is given. @@ -386,6 +711,8 @@ async def test_logs_manager_stream_log(logs_manager): lines=10, interval=None, timeout=30, + task_id=None, + attempt_number=0, ) # Test pid, media_type = "stream", node_ip @@ -413,6 +740,8 @@ async def test_logs_manager_stream_log(logs_manager): lines=10, interval=0.5, timeout=None, + task_id=None, + attempt_number=0, ) # Currently cannot test actor_id with AsyncMock. @@ -455,6 +784,8 @@ async def test_logs_manager_keepalive_no_timeout(logs_manager): lines=10, interval=None, timeout=None, + task_id=None, + attempt_number=0, ) @@ -566,7 +897,8 @@ def verify_basic(): lines = [] for line in stream_response.iter_lines(): lines.append(line.decode("utf-8")) - return len(lines) == 5 or len(lines) == 6 + assert len(lines) == 5 or len(lines) == 6 + return True wait_for_condition(verify_basic) @@ -586,24 +918,22 @@ def getpid(self): # Test stream and fetching by actor id stream_response = requests.get( webui_url - + "/api/v0/logs/stream?&lines=2" + + "/api/v0/logs/stream?&lines=-1" + f"&actor_id={actor._ray_actor_id.hex()}", stream=True, ) if stream_response.status_code != 200: raise ValueError(stream_response.content.decode("utf-8")) stream_iterator = stream_response.iter_content(chunk_size=None) - # NOTE: Prefix 1 indicates the stream has succeeded. 
- assert ( - next(stream_iterator).decode("utf-8") - == "1:actor_name:Actor\n" + test_log_text.format("XXXXXX") + "\n" - ) + actual_output = next(stream_iterator).decode("utf-8") + assert "actor_name:Actor\n" in actual_output + assert test_log_text.format("XXXXXX") in actual_output streamed_string = "" for i in range(5): strings = [] - for j in range(100): - strings.append(test_log_text.format(f"{100*i + j:06d}")) + for j in range(3): + strings.append(test_log_text.format(f"{3*i + j:06d}")) ray.get(actor.write_log.remote(strings)) @@ -612,7 +942,7 @@ def getpid(self): string += s + "\n" streamed_string += string # NOTE: Prefix 1 indicates the stream has succeeded. - assert next(stream_iterator).decode("utf-8") == "1" + string + assert string in next(stream_iterator).decode("utf-8") del stream_response # Test tailing log by actor id @@ -624,7 +954,8 @@ def getpid(self): + actor._ray_actor_id.hex(), ).content.decode("utf-8") # NOTE: Prefix 1 indicates the stream has succeeded. - assert file_response == "1" + "\n".join(streamed_string.split("\n")[-(LINES + 1) :]) + for line in streamed_string.split("\n")[-(LINES + 1) :]: + assert line in file_response # Test query by pid & node_ip instead of actor id. node_ip = list(ray.nodes())[0]["NodeManagerAddress"] @@ -635,7 +966,8 @@ def getpid(self): + f"&pid={pid}", ).content.decode("utf-8") # NOTE: Prefix 1 indicates the stream has succeeded. - assert file_response == "1" + "\n".join(streamed_string.split("\n")[-(LINES + 1) :]) + for line in streamed_string.split("\n")[-(LINES + 1) :]: + assert line in file_response def test_log_list(ray_start_cluster): @@ -671,6 +1003,79 @@ def verify(): e.match(f"Given node id {node_id} is not available") +@pytest.mark.skipif( + sys.platform == "win32", reason="Job submission is failing on windows." 
+) +def test_log_job(ray_start_with_dashboard): + assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True + webui_url = ray_start_with_dashboard["webui_url"] + webui_url = format_web_url(webui_url) + node_id = list_nodes()[0]["node_id"] + + # Submit a job + from ray.job_submission import JobSubmissionClient + + JOB_LOG = "test-job-log" + client = JobSubmissionClient(webui_url) + entrypoint = f"python -c \"print('{JOB_LOG}')\"" + job_id = client.submit_job(entrypoint=entrypoint) + + def job_done(): + jobs = list_jobs(filters=[("submission_id", "=", job_id)]) + assert len(jobs) == 1 + assert jobs[0].status == "SUCCEEDED" + return True + + wait_for_condition(job_done) + + def verify(): + logs = "".join(get_log(submission_id=job_id, node_id=node_id)) + assert JOB_LOG + "\n" == logs + + return True + + wait_for_condition(verify) + + +def test_log_get_subdir(ray_start_with_dashboard): + assert ( + wait_until_server_available(ray_start_with_dashboard.address_info["webui_url"]) + is True + ) + webui_url = ray_start_with_dashboard.address_info["webui_url"] + webui_url = format_web_url(webui_url) + node_id = list_nodes()[0]["node_id"] + + log_dir = ray._private.worker.global_worker.node.get_logs_dir_path() + subdir = "test_subdir" + file = "test_#file.log" + path = Path(log_dir) / subdir / file + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("test log") + + # HTTP endpoint + def verify(): + # Direct logs stream + response = requests.get( + webui_url + + f"/api/v0/logs/file?node_id={node_id}" + + f"&filename={urllib.parse.quote('test_subdir/test_#file.log')}" + ) + assert response.status_code == 200, response.reason + assert "test log" in response.text + return True + + wait_for_condition(verify) + + # get log SDK + def verify(): + logs = "".join(get_log(node_id=node_id, filename="test_subdir/test_#file.log")) + assert "test log" in logs + return True + + wait_for_condition(verify) + + def test_log_get(ray_start_cluster): cluster = 
ray_start_cluster cluster.add_node(num_cpus=0) @@ -734,10 +1139,6 @@ def verify(): wait_for_condition(verify) - with pytest.raises(NotImplementedError): - for _ in get_log(task_id=123, tail=10): - pass - del a """ Test log suffix selection for worker/actor @@ -851,6 +1252,99 @@ def verify(): wait_for_condition(verify) + # Test running task logs + @ray.remote + def sleep_task(out_msg): + print(out_msg, end="", file=sys.stdout) + import time + + time.sleep(10) + + expected_out = "This is a test log from stdout\n" + task = sleep_task.remote(expected_out) + + def verify(): + lines = get_log(task_id=task.task_id().hex()) + assert expected_out == "".join(lines) + + return True + + wait_for_condition(verify) + + # Test get log by multiple task id + @ray.remote + def task_log(): + out_msg = "This is a test log from stdout\n" + print(out_msg, end="", file=sys.stdout) + err_msg = "THIS IS A TEST LOG FROM STDERR\n" + print(err_msg, end="", file=sys.stderr) + + return out_msg, err_msg + + # Run some other tasks before and after to make sure task + # log only outputs the task's log. + ray.get(task_log.remote()) + task = task_log.remote() + expected_out, expected_err = ray.get(task) + ray.get(task_log.remote()) + + def verify(): + lines = get_log(task_id=task.task_id().hex()) + assert expected_out == "".join(lines) + + # Test suffix + lines = get_log(task_id=task.task_id().hex(), suffix="err") + assert expected_err == "".join(lines) + + return True + + wait_for_condition(verify) + + # Test actor task logs with interleaving logs. 
+ @ray.remote + class Actor: + async def print_log(self, x, out_msg): + for _ in range(3): + print(out_msg, end="", file=sys.stdout) + await asyncio.sleep(1) + + actor = Actor.options(max_concurrency=2).remote() + out_msg = "[{name}]: This is a test log from stdout\n" + task_a = actor.print_log.remote("a", out_msg.format(name="a")) + task_b = actor.print_log.remote("b", out_msg.format(name="b")) + ray.get([task_a, task_b]) + + def verify(): + lines = get_log(task_id=task_a.task_id().hex()) + actual_output = "".join(lines) + assert actual_output.count(out_msg.format(name="a")) == 3 + + lines = get_log(task_id=task_b.task_id().hex()) + actual_output = "".join(lines) + assert actual_output.count(out_msg.format(name="b")) == 3 + + return True + + wait_for_condition(verify) + + # Test task logs tail with lines. + expected_out = [f"task-{i}\n" for i in range(5)] + + @ray.remote + def f(): + print("".join(expected_out), end="", file=sys.stdout) + + t = f.remote() + ray.get(t) + + def verify(): + lines = get_log(task_id=t.task_id().hex(), tail=2) + actual_output = "".join(lines) + assert actual_output == "".join(expected_out[-2:]) + return True + + wait_for_condition(verify) + def test_log_cli(shutdown_only): ray.init(num_cpus=1) diff --git a/python/ray/tests/test_state_api_summary.py b/python/ray/tests/test_state_api_summary.py index 3cab23083134..a90b8ab7f8e9 100644 --- a/python/ray/tests/test_state_api_summary.py +++ b/python/ray/tests/test_state_api_summary.py @@ -7,7 +7,7 @@ import sys from dataclasses import asdict -from ray.experimental.state.api import ( +from ray.util.state import ( summarize_tasks, summarize_actors, summarize_objects, @@ -28,7 +28,7 @@ generate_actor_data, generate_object_info, ) -from ray.experimental.state.common import ( +from ray.util.state.common import ( DEFAULT_RPC_TIMEOUT, SummaryApiOptions, Link, @@ -39,9 +39,9 @@ from ray.core.generated.gcs_service_pb2 import GetAllActorInfoReply from ray.core.generated.gcs_pb2 import ActorTableData 
from click.testing import CliRunner -from ray.experimental.state.state_cli import summary_state_cli_group +from ray.util.state.state_cli import summary_state_cli_group from ray.dashboard.state_aggregator import StateAPIManager -from ray.experimental.state.state_manager import StateDataSourceClient +from ray.util.state.state_manager import StateDataSourceClient @pytest.fixture diff --git a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py index 99784d780be9..9a64b61cea62 100644 --- a/python/ray/tests/test_task_events.py +++ b/python/ray/tests/test_task_events.py @@ -13,14 +13,14 @@ from ray.runtime_env import RuntimeEnv import ray -from ray.experimental.state.common import ListApiOptions, StateResource +from ray.util.state.common import ListApiOptions, StateResource from ray._private.test_utils import ( raw_metrics, run_string_as_driver, run_string_as_driver_nonblocking, wait_for_condition, ) -from ray.experimental.state.api import StateApiClient, list_tasks +from ray.util.state import StateApiClient, list_tasks from ray._private.worker import RayContext diff --git a/python/ray/tests/test_task_events_2.py b/python/ray/tests/test_task_events_2.py index ec5e0c36e149..2374abe3550e 100644 --- a/python/ray/tests/test_task_events_2.py +++ b/python/ray/tests/test_task_events_2.py @@ -11,12 +11,12 @@ verify_tasks_running_or_terminated, verify_failed_task, ) -from ray.experimental.state.common import ListApiOptions, StateResource +from ray.util.state.common import ListApiOptions, StateResource from ray._private.test_utils import ( run_string_as_driver_nonblocking, wait_for_condition, ) -from ray.experimental.state.api import ( +from ray.util.state import ( StateApiClient, list_actors, list_tasks, @@ -485,7 +485,6 @@ def test_fault_tolerance_nested_actors_failed(shutdown_only): def test_fault_tolerance_chained_task_fail( shutdown_only, exit_type, actor_or_normal_tasks ): - ray.init(_system_config=_SYSTEM_CONFIG) def sleep_or_fail(pid_actor=None, 
exit_type=None): diff --git a/python/ray/tests/test_traceback.py b/python/ray/tests/test_traceback.py index f5383782b4e4..be9d309351ea 100644 --- a/python/ray/tests/test_traceback.py +++ b/python/ray/tests/test_traceback.py @@ -67,7 +67,7 @@ def clean_noqa(ex): ) def test_actor_creation_stacktrace(ray_start_regular): """Test the actor creation task stacktrace.""" - expected_output = """The actor died because of an error raised in its creation task, ray::A.__init__() (pid=XXX, ip=YYY, repr=ZZZ) # noqa + expected_output = """The actor died because of an error raised in its creation task, ray::A.__init__() (pid=XXX, ip=YYY, actor_id={actor_id}, repr=ZZZ) # noqa File "FILE", line ZZ, in __init__ g(3) File "FILE", line ZZ, in g @@ -85,12 +85,14 @@ def __init__(self): def ping(self): pass + a = A.remote() try: - a = A.remote() ray.get(a.ping.remote()) except RayActorError as ex: print(ex) - assert clean_noqa(expected_output) == scrub_traceback(str(ex)) + assert clean_noqa( + expected_output.format(actor_id=a._actor_id.hex()) + ) == scrub_traceback(str(ex)) @pytest.mark.skipif( @@ -128,7 +130,7 @@ def f(): ) def test_actor_task_stacktrace(ray_start_regular): """Test the actor task stacktrace.""" - expected_output = """ray::A.f() (pid=XXX, ip=YYY, repr=ZZZ) # noqa + expected_output = """ray::A.f() (pid=XXX, ip=YYY, actor_id={actor_id}, repr=ZZZ) # noqa File "FILE", line ZZ, in f return g(c) File "FILE", line ZZ, in g @@ -151,7 +153,9 @@ def f(self): ray.get(a.f.remote()) except ValueError as ex: print(ex) - assert clean_noqa(expected_output) == scrub_traceback(str(ex)) + assert clean_noqa( + expected_output.format(actor_id=a._actor_id.hex()) + ) == scrub_traceback(str(ex)) @pytest.mark.skipif( diff --git a/python/ray/tests/test_usage_stats.py b/python/ray/tests/test_usage_stats.py index ba45c29ce387..3786519c1aff 100644 --- a/python/ray/tests/test_usage_stats.py +++ b/python/ray/tests/test_usage_stats.py @@ -9,6 +9,7 @@ import requests import pytest from jsonschema import 
validate +from http.server import BaseHTTPRequestHandler, HTTPServer import ray import ray._private.usage.usage_constants as usage_constants @@ -42,6 +43,7 @@ "min_workers": {"type": ["null", "integer"]}, "max_workers": {"type": ["null", "integer"]}, "head_node_instance_type": {"type": ["null", "string"]}, + "libc_version": {"type": ["null", "string"]}, "worker_node_instance_types": { "type": ["null", "array"], "items": {"type": "string"}, @@ -966,10 +968,6 @@ def test_usage_lib_get_cluster_config_to_report( assert cluster_config_to_report.cloud_provider == "kuberay" -@pytest.mark.skipif( - sys.platform == "win32", - reason="Test depends on runtime env feature not supported on Windows.", -) # TODO(https://github.com/ray-project/ray/issues/33486) @pytest.mark.skipif( sys.version_info >= (3, 11, 0), @@ -983,8 +981,7 @@ def test_usage_lib_report_data( m.setenv("RAY_USAGE_STATS_REPORT_URL", "http://127.0.0.1:8000") cluster = ray_start_cluster cluster.add_node(num_cpus=0) - # Runtime env is required to run this test in minimal installation test. - ray.init(address=cluster.address, runtime_env={"pip": ["ray[serve]"]}) + ray.init(address=cluster.address) """ Make sure the generated data is following the schema. """ @@ -1024,43 +1021,33 @@ def test_usage_lib_report_data( Make sure report usage data works as expected """ - @ray.remote(num_cpus=0) - class ServeInitator: - def __init__(self): - # Start the ray serve server to verify requests are sent - # to the right place. 
- from ray import serve - - serve.start() - - @serve.deployment(ray_actor_options={"num_cpus": 0}) - async def usage(request): - body = await request.json() - if body == asdict(d): - return True - else: - return False + class UsageStatsServer(BaseHTTPRequestHandler): + expected_data = None - usage.deploy() + def do_POST(self): + content_length = int(self.headers["Content-Length"]) + post_data = self.rfile.read(content_length) + if json.loads(post_data) == self.expected_data: + self.send_response(200) + else: + self.send_response(400) + self.send_header("Content-type", "text/html") + self.end_headers() - def ready(self): - pass + @ray.remote(num_cpus=0) + def run_usage_stats_server(expected_data): + UsageStatsServer.expected_data = expected_data + server = HTTPServer(("127.0.0.1", 8000), UsageStatsServer) + server.serve_forever() - # We need to start a serve with runtime env to make this test - # work with minimal installation. - s = ServeInitator.remote() - ray.get(s.ready.remote()) + run_usage_stats_server.remote(asdict(d)) # Query our endpoint over HTTP. 
- r = client.report_usage_data("http://127.0.0.1:8000/usage", d) - r.raise_for_status() - assert json.loads(r.text) is True + wait_for_condition( + lambda: client.report_usage_data("http://127.0.0.1:8000", d), timeout=30 + ) -@pytest.mark.skipif( - sys.platform == "win32", - reason="Test depends on runtime env feature not supported on Windows.", -) # TODO(https://github.com/ray-project/ray/issues/33486) @pytest.mark.skipif( sys.version_info >= (3, 11, 0), @@ -1086,7 +1073,7 @@ def test_usage_report_e2e( with monkeypatch.context() as m: m.setenv("HOME", str(tmp_path)) m.setenv("RAY_USAGE_STATS_ENABLED", "1") - m.setenv("RAY_USAGE_STATS_REPORT_URL", "http://127.0.0.1:8000/usage") + m.setenv("RAY_USAGE_STATS_REPORT_URL", "http://127.0.0.1:8000") m.setenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", "1") m.setenv("RAY_USAGE_STATS_EXTRA_TAGS", "extra_k1=extra_v1") cluster = ray_start_cluster @@ -1130,32 +1117,25 @@ def get_payload(self): reporter = StatusReporter.remote() - @ray.remote(num_cpus=0, runtime_env={"pip": ["ray[serve]"]}) - class ServeInitiator: - def __init__(self): - # This is used in the worker process - # so it won't be tracked as library usage. - from ray import serve - - serve.start() + class UsageStatsServer(BaseHTTPRequestHandler): + reporter = None - # Usage report should be sent to the URL every 1 second. 
- @serve.deployment(ray_actor_options={"num_cpus": 0}) - async def usage(request): - body = await request.json() - reporter.reported.remote() - reporter.report_payload.remote(body) - return True + def do_POST(self): + content_length = int(self.headers["Content-Length"]) + post_data = self.rfile.read(content_length) + self.reporter.reported.remote() + self.reporter.report_payload.remote(json.loads(post_data)) + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() - usage.deploy() - - def ready(self): - pass + @ray.remote(num_cpus=0) + def run_usage_stats_server(reporter): + UsageStatsServer.reporter = reporter + server = HTTPServer(("127.0.0.1", 8000), UsageStatsServer) + server.serve_forever() - # We need to start a serve with runtime env to make this test - # work with minimal installation. - s = ServeInitiator.remote() - ray.get(s.ready.remote()) + run_usage_stats_server.remote(reporter) """ Verify the usage stats are reported to the server. @@ -1174,6 +1154,16 @@ def ready(self): assert payload["python_version"] == python_version assert payload["schema_version"] == "0.1" assert payload["os"] == sys.platform + if sys.platform != "linux": + payload["libc_version"] is None + else: + import platform + + assert ( + payload["libc_version"] + == f"{platform.libc_ver()[0]}:{platform.libc_ver()[1]}" + ) + assert payload["source"] == "OSS" assert payload["cloud_provider"] == "aws" assert payload["min_workers"] is None @@ -1202,9 +1192,6 @@ def ready(self): "_test2": "extra_v3", "dashboard_metrics_grafana_enabled": "False", "dashboard_metrics_prometheus_enabled": "False", - "serve_num_deployments": "1", - "serve_num_gpu_deployments": "0", - "serve_api_version": "v1", "actor_num_created": "0", "pg_num_created": "0", "num_actor_creation_tasks": "0", @@ -1217,15 +1204,14 @@ def ready(self): if os.environ.get("RAY_MINIMAL") != "1": expected_payload["tune_scheduler"] = "FIFOScheduler" expected_payload["tune_searcher"] = 
"BasicVariantGenerator" + expected_payload["air_storage_configuration"] = "driver" assert payload["extra_usage_tags"] == expected_payload assert payload["total_num_nodes"] == 1 assert payload["total_num_running_jobs"] == 1 if os.environ.get("RAY_MINIMAL") == "1": - # Since we start a serve actor for mocking a server using runtime env. - assert set(payload["library_usages"]) == {"serve"} + assert set(payload["library_usages"]) == set() else: - # Serve is recorded due to our mock server. - assert set(payload["library_usages"]) == {"rllib", "train", "tune", "serve"} + assert set(payload["library_usages"]) == {"rllib", "train", "tune"} validate(instance=payload, schema=schema) """ Verify the usage_stats.json is updated. @@ -1405,8 +1391,8 @@ def verify(): @pytest.mark.skipif( - sys.platform == "win32", - reason="Test depends on runtime env feature not supported on Windows.", + os.environ.get("RAY_MINIMAL") == "1", + reason="This test is not supposed to work for minimal installation.", ) # TODO(https://github.com/ray-project/ray/issues/33486) @pytest.mark.skipif( @@ -1442,10 +1428,7 @@ def objective(*args): tune.run(objective) - # Use a runtime env to run tests in minimal installation. 
- a = ActorWithLibImport.options( - runtime_env={"pip": ["ray[rllib]", "ray[tune]"]} - ).remote() + a = ActorWithLibImport.remote() ray.get(a.ready.remote()) """ diff --git a/python/ray/tests/test_widgets.py b/python/ray/tests/test_widgets.py new file mode 100644 index 000000000000..ad95d2b3c9e4 --- /dev/null +++ b/python/ray/tests/test_widgets.py @@ -0,0 +1,103 @@ +from unittest import mock + +import pytest +from ray.widgets.util import ensure_notebook_deps, repr_fallback_if_colab + + +@mock.patch("importlib.import_module") +def test_ensure_notebook_dep_missing(mock_import_module, caplog): + """Test that missing notebook dependencies trigger a warning.""" + + class MockDep: + __version__ = "8.0.0" + + def raise_import_error(*args): + raise ImportError + + mock_import_module.return_value = MockDep() + mock_import_module.side_effect = raise_import_error + + class DummyObject: + @ensure_notebook_deps(["somedep", "8"]) + def dummy_ipython_display(self): + return + + DummyObject().dummy_ipython_display() + + assert "Missing packages:" in caplog.records[-1].msg + + +@mock.patch("importlib.import_module") +def test_ensure_notebook_dep_outdated(mock_import_module, caplog): + """Test that outdated notebook dependencies trigger a warning.""" + + class MockDep: + __version__ = "7.0.0" + + mock_import_module.return_value = MockDep() + + class DummyObject: + @ensure_notebook_deps(["somedep", "8"]) + def dummy_ipython_display(): + return + + DummyObject().dummy_ipython_display() + + assert "Outdated packages:" in caplog.records[-1].msg + + +@mock.patch("importlib.import_module") +def test_ensure_notebook_valid(mock_import_module, caplog): + """Test that valid notebook dependencies don't trigger a warning.""" + + class MockDep: + __version__ = "8.0.0" + + mock_import_module.return_value = MockDep() + + class DummyObject: + @ensure_notebook_deps(["somedep", "8"]) + def dummy_ipython_display(self): + return + + DummyObject().dummy_ipython_display() + + assert len(caplog.records) 
== 0 + + +@pytest.mark.parametrize( + "kernel", + [ + ("google.colab.kernel"), + ("normal.ipython.kernel"), + ], +) +def test_repr_fallback_if_colab(kernel): + """Test that the mimebundle is correctly stripped if run in google colab.""" + pytest.importorskip("IPython", reason="IPython is not installed.") + with mock.patch("IPython.get_ipython") as mock_get_ipython: + mock_get_ipython.return_value = kernel + + class DummyObject: + @repr_fallback_if_colab + def _repr_mimebundle_(self, **kwargs): + return { + "fancy/mimetype": "A fancy repr", + "text/plain": "A simple repr", + } + + obj = DummyObject() + result = obj._repr_mimebundle_() + + assert "text/plain" in result + if "google.colab" in kernel: + assert len(result) == 1 + else: + assert len(result) == 2 + assert "fancy/mimetype" + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_worker_capping.py b/python/ray/tests/test_worker_capping.py index 9fc61ed9f0c2..128ea8f14765 100644 --- a/python/ray/tests/test_worker_capping.py +++ b/python/ray/tests/test_worker_capping.py @@ -211,7 +211,7 @@ def get(self): @ray.remote def get_node_id(): - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() @ray.remote def func(i, counter): @@ -220,7 +220,7 @@ def func(i, counter): while True: time.sleep(1) else: - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() refs = [func.remote(i, counter) for i in range(2)] diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index e2a244455ace..2d51df828f8b 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -344,46 +344,6 @@ py_test( deps = [":train_lib"] ) -py_test( - name = "test_huggingface_checkpoint", - size = "small", - srcs = ["tests/test_huggingface_checkpoint.py"], - tags = ["team:ml", "exclusive", "ray_air"], - deps = [":train_lib", ":conftest"] -) - -py_test( - name = "test_huggingface_gpu", - size = "medium", 
- srcs = ["tests/test_huggingface_gpu.py"], - tags = ["team:ml", "exclusive", "gpu_only"], - deps = [":train_lib", ":conftest"] -) - -py_test( - name = "test_huggingface_predictor", - size = "medium", - srcs = ["tests/test_huggingface_predictor.py"], - tags = ["team:ml", "exclusive", "ray_air"], - deps = [":train_lib", ":conftest"] -) - -py_test( - name = "test_huggingface_trainer", - size = "large", - srcs = ["tests/test_huggingface_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], - deps = [":train_lib"] -) - -py_test( - name = "test_huggingface_trainer_steps", - size = "large", - srcs = ["tests/test_huggingface_trainer_steps.py"], - tags = ["team:ml", "exclusive", "ray_air"], - deps = [":train_lib"] -) - py_test( name = "test_mosaic_trainer", size = "medium", @@ -412,7 +372,7 @@ py_test( name = "test_lightning_checkpoint", size = "medium", srcs = ["tests/test_lightning_checkpoint.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -420,7 +380,7 @@ py_test( name = "test_lightning_trainer_restore", size = "medium", srcs = ["tests/test_lightning_trainer_restore.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -428,7 +388,7 @@ py_test( name = "test_lightning_trainer", size = "large", srcs = ["tests/test_lightning_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -436,7 +396,7 @@ py_test( name = "test_lightning_predictor", size = "medium", srcs = ["tests/test_lightning_predictor.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -568,6 +528,46 @@ py_test( deps = [":train_lib"] ) +py_test( + name = "test_transformers_checkpoint", + size = "small", + srcs = 
["tests/test_transformers_checkpoint.py"], + tags = ["team:ml", "exclusive", "ray_air"], + deps = [":train_lib", ":conftest"] +) + +py_test( + name = "test_transformers_gpu", + size = "medium", + srcs = ["tests/test_transformers_gpu.py"], + tags = ["team:ml", "exclusive", "gpu_only"], + deps = [":train_lib", ":conftest"] +) + +py_test( + name = "test_transformers_predictor", + size = "medium", + srcs = ["tests/test_transformers_predictor.py"], + tags = ["team:ml", "exclusive", "ray_air"], + deps = [":train_lib", ":conftest"] +) + +py_test( + name = "test_transformers_trainer_steps", + size = "large", + srcs = ["tests/test_transformers_trainer_steps.py"], + tags = ["team:ml", "exclusive", "ray_air"], + deps = [":train_lib"] +) + +py_test( + name = "test_transformers_trainer", + size = "large", + srcs = ["tests/test_transformers_trainer.py"], + tags = ["team:ml", "exclusive", "ray_air"], + deps = [":train_lib"] +) + py_test( name = "test_tune", size = "large", diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 15c99aa9ed6e..be940111f1c4 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -5,6 +5,7 @@ import ray from ray._private.ray_constants import env_integer +from ray.air.config import CheckpointConfig from ray.exceptions import RayActorError from ray.train._internal.dataset_spec import RayDatasetSpec from ray.air.checkpoint import Checkpoint @@ -71,6 +72,7 @@ def __init__( num_gpus_per_worker: float = 0, additional_resources_per_worker: Optional[Dict[str, float]] = None, max_retries: int = 3, + checkpoint_config: Optional[CheckpointConfig] = None, ): self._backend_config = backend_config self._backend = backend_config.backend_cls() @@ -91,6 +93,13 @@ def __init__( self.worker_group = InactiveWorkerGroup() self.dataset_shards = None + self._checkpoint_keep_all_ranks = ( + checkpoint_config and checkpoint_config._checkpoint_keep_all_ranks 
+ ) + self._checkpoint_upload_from_workers = ( + checkpoint_config and checkpoint_config._checkpoint_upload_from_workers + ) + def start( self, initialization_hook: Optional[Callable[[], None]] = None, @@ -120,6 +129,13 @@ def start( # TODO remove if self._trial_info and self._trial_info.driver_ip: self.worker_group._move_workers_with_ip_to_front(self._trial_info.driver_ip) + + worker_locs = [ + f"{w.metadata.pid} ({w.metadata.node_ip})" + for w in self.worker_group.workers + ] + logger.info(f"Starting distributed worker processes: {worker_locs}") + try: if initialization_hook: self._initialization_hook = initialization_hook @@ -334,7 +350,7 @@ def start_training( Args: train_func: The training function to run on each worker. - dataset_spec: A specification for the Ray Dataset to be + dataset_spec: A specification for the Dataset to be passed to the training workers, and the logic on how to shard the Ray Dataset. checkpoint: The checkpoint data that @@ -359,6 +375,8 @@ def initialize_session( checkpoint, dataset_shard, encode_data_fn, + checkpoint_keep_all_ranks, + checkpoint_upload_from_workers, ): try: init_session( @@ -374,6 +392,8 @@ def initialize_session( encode_data_fn=encode_data_fn, detailed_autofilled_metrics=use_detailed_autofilled_metrics, enable_lazy_checkpointing=use_lazy_checkpointing, + checkpoint_keep_all_ranks=checkpoint_keep_all_ranks, + checkpoint_upload_from_workers=(checkpoint_upload_from_workers), ) except ValueError: raise TrainBackendError( @@ -409,6 +429,10 @@ def initialize_session( dataset_shard=self.dataset_shards[index], checkpoint=checkpoint, encode_data_fn=self._backend._encode_data, + checkpoint_keep_all_ranks=self._checkpoint_keep_all_ranks, + checkpoint_upload_from_workers=( + self._checkpoint_upload_from_workers + ), ) ) @@ -475,8 +499,19 @@ def get_next(): "`session.report()` are called the " "same number of times on all workers." 
) + return results + def _set_checkpoint_uri(self, uri: str): + """Tell remote sessions where to upload the chekcpoint.""" + + def set_uri(): + session = _get_session("_set_checkpoint_uri") + session._set_checkpoint_uri(uri) + + futures = self.worker_group.execute_async(set_uri) + self.get_with_failure_handling(futures) + def pause_reporting(self): """Disable workers from enqueuing results from ``session.report()``. diff --git a/python/ray/train/_internal/checkpoint.py b/python/ray/train/_internal/checkpoint.py index a85f05d7c915..83b92b819cb7 100644 --- a/python/ray/train/_internal/checkpoint.py +++ b/python/ray/train/_internal/checkpoint.py @@ -12,6 +12,7 @@ from ray.train._internal.session import TrainingResult from ray.train._internal.utils import construct_path from ray.train.constants import ( + CHECKPOINT_RANK_KEY, TIMESTAMP, TRAIN_CHECKPOINT_SUBDIR, TUNE_CHECKPOINT_ID, @@ -98,15 +99,12 @@ def _load_checkpoint( def _process_checkpoint( self, - checkpoint_results: List[TrainingResult], + checkpoint_result: TrainingResult, decode_checkpoint_fn: Callable, - ) -> None: - """Ray Train entrypoint. Perform all processing for a checkpoint.""" - # Get checkpoint from first worker. - checkpoint_result = checkpoint_results[0] - + ) -> _TrackedCheckpoint: checkpoint_data = checkpoint_result.data checkpoint_metadata = checkpoint_result.metadata or {} + checkpoint_rank = checkpoint_metadata.get(CHECKPOINT_RANK_KEY, 0) if isinstance(checkpoint_data, str): checkpoint_class: Type[Checkpoint] = checkpoint_metadata[ @@ -131,13 +129,31 @@ def _process_checkpoint( f"`session.report()`." 
) - tracked_checkpoint = _TrackedCheckpoint( + return _TrackedCheckpoint( dir_or_data=checkpoint_data, checkpoint_id=self._latest_checkpoint_id, storage_mode=CheckpointStorage.MEMORY, metrics={score_attr: checkpoint_metadata.get(score_attr, 0.0)}, + rank=checkpoint_rank, ) - self.register_checkpoint(checkpoint=tracked_checkpoint) + + def _process_checkpoints( + self, + checkpoint_results: List[TrainingResult], + decode_checkpoint_fn: Callable, + ) -> None: + """Ray Train entrypoint. Perform all processing for a checkpoint.""" + if self._checkpoint_strategy._checkpoint_keep_all_ranks: + tracked_checkpoints = [ + self._process_checkpoint(checkpoint_result, decode_checkpoint_fn) + for checkpoint_result in checkpoint_results + ] + else: + # Get checkpoint from first worker. + tracked_checkpoints = [ + self._process_checkpoint(checkpoint_results[0], decode_checkpoint_fn) + ] + self.register_checkpoints(checkpoints=tracked_checkpoints) def _get_next_checkpoint_path(self) -> Optional[Path]: """Path to the next checkpoint to persist.""" @@ -249,7 +265,12 @@ def add_tune_checkpoint_id(self, checkpoint: Checkpoint): def _process_persistent_checkpoint(self, checkpoint: _TrackedCheckpoint): self.add_tune_checkpoint_id(checkpoint.dir_or_data) - # If inside a Tune Trainable, then checkpoint with Tune. + + # Train may choose not to commit a checkpoint, but make sure the + # checkpoint is always committed for Tuning purpose. + # After this is committed, checkpoint.dir_or_path will become a string, + # which will prevent this checkpoint from being committed again in the + # subsequent super()._process_persistent_checkpoint() call. 
with tune.checkpoint_dir(step=self._latest_checkpoint_id) as checkpoint_dir: path = Path(checkpoint_dir) checkpoint.commit(path) diff --git a/python/ray/train/_internal/dataset_spec.py b/python/ray/train/_internal/dataset_spec.py index 55113e31c651..6ad4d2bf1239 100644 --- a/python/ray/train/_internal/dataset_spec.py +++ b/python/ray/train/_internal/dataset_spec.py @@ -17,9 +17,9 @@ @dataclass class RayDatasetSpec: - """Configuration for Ray Datasets to pass to the training workers. + """Configuration for Datasets to pass to the training workers. - dataset_or_dict: An optional Ray Dataset (or DatasetPipeline) or a dictionary of + dataset_or_dict: An optional Dataset (or DatasetPipeline) or a dictionary of datasets to be sharded across all the training workers, which can be accessed from the training function via ``session.get_dataset_shard()``. Multiple Datasets can be passed in as a dictionary that maps each name key to a @@ -32,7 +32,7 @@ class RayDatasetSpec: training workers (to use as locality hints). The Callable is expected to return a list of RayDatasets or a list of dictionaries of RayDatasets, with the length of the list equal to the length of the list of actor handles. - If None is provided, the provided Ray Dataset(s) will be equally split. + If None is provided, the provided Dataset(s) will be equally split. """ diff --git a/python/ray/train/_internal/results_preprocessors/__init__.py b/python/ray/train/_internal/results_preprocessors/__init__.py deleted file mode 100644 index aeaa22813837..000000000000 --- a/python/ray/train/_internal/results_preprocessors/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors` and the `ray.train.Trainer` API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. 
" -) diff --git a/python/ray/train/_internal/results_preprocessors/aggregate/__init__.py b/python/ray/train/_internal/results_preprocessors/aggregate/__init__.py deleted file mode 100644 index fae5be80db31..000000000000 --- a/python/ray/train/_internal/results_preprocessors/aggregate/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors.aggregate` and the `ray.train.Trainer` " - "API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. " -) diff --git a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_fn.py b/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_fn.py deleted file mode 100644 index fae5be80db31..000000000000 --- a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_fn.py +++ /dev/null @@ -1,9 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors.aggregate` and the `ray.train.Trainer` " - "API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. " -) diff --git a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_preprocessor.py b/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_preprocessor.py deleted file mode 100644 index fae5be80db31..000000000000 --- a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_preprocessor.py +++ /dev/null @@ -1,9 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors.aggregate` and the `ray.train.Trainer` " - "API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). 
Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. " -) diff --git a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_utils.py b/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_utils.py deleted file mode 100644 index fae5be80db31..000000000000 --- a/python/ray/train/_internal/results_preprocessors/aggregate/aggregate_utils.py +++ /dev/null @@ -1,9 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors.aggregate` and the `ray.train.Trainer` " - "API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. " -) diff --git a/python/ray/train/_internal/results_preprocessors/index.py b/python/ray/train/_internal/results_preprocessors/index.py deleted file mode 100644 index aeaa22813837..000000000000 --- a/python/ray/train/_internal/results_preprocessors/index.py +++ /dev/null @@ -1,8 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors` and the `ray.train.Trainer` API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. 
" -) diff --git a/python/ray/train/_internal/results_preprocessors/keys.py b/python/ray/train/_internal/results_preprocessors/keys.py deleted file mode 100644 index aeaa22813837..000000000000 --- a/python/ray/train/_internal/results_preprocessors/keys.py +++ /dev/null @@ -1,8 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors` and the `ray.train.Trainer` API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. " -) diff --git a/python/ray/train/_internal/results_preprocessors/preprocessor.py b/python/ray/train/_internal/results_preprocessors/preprocessor.py deleted file mode 100644 index aeaa22813837..000000000000 --- a/python/ray/train/_internal/results_preprocessors/preprocessor.py +++ /dev/null @@ -1,8 +0,0 @@ -raise DeprecationWarning( - "`ray.train.callbacks.results_preprocessors` and the `ray.train.Trainer` API are " - "deprecated in Ray " - "2.0, and are replaced by Ray AI Runtime (Ray AIR). Ray AIR " - "(https://docs.ray.io/en/latest/ray-air/getting-started.html) " - "provides greater functionality and a unified API " - "compared to the old Ray Train API. 
" -) diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 369261901f46..f13a5a4bb29a 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -9,6 +9,7 @@ from datetime import datetime from enum import Enum, auto from pathlib import Path +import shutil from typing import Callable, Dict, Optional, Type, Union import ray @@ -18,6 +19,9 @@ from ray.data import Dataset, DatasetPipeline from ray.train._internal.accelerator import Accelerator from ray.train.constants import ( + CHECKPOINT_DISTRIBUTED_KEY, + CHECKPOINT_METADATA_KEY, + CHECKPOINT_RANK_KEY, DETAILED_AUTOFILLED_KEYS, WORKER_HOSTNAME, WORKER_NODE_IP, @@ -25,11 +29,16 @@ TIME_THIS_ITER_S, TIME_TOTAL_S, TIMESTAMP, - CHECKPOINT_METADATA_KEY, LAZY_CHECKPOINT_MARKER_FILE, ) from ray.train.error import SessionMisuseError from ray.train.session import _TrainSessionImpl +from ray.util.annotations import DeveloperAPI +from ray.util.debug import log_once + + +_INDEX_FILE_EXTENSION = ".files" +_INDEX_FILE = ".RANK_{0}" + _INDEX_FILE_EXTENSION class TrainingResultType(Enum): @@ -60,6 +69,7 @@ class TrainingResult: # TODO(xwjiang): This needs a better name. +@DeveloperAPI class _TrainSession: """Holds information for training on each worker.""" @@ -83,6 +93,8 @@ def __init__( # will send over checkpoint path and metadata instead of # the whole checkpoint to avoid unnecessary serialization. enable_lazy_checkpointing: bool = True, + checkpoint_keep_all_ranks: bool = False, + checkpoint_upload_from_workers: bool = False, ): self.dataset_shard = dataset_shard @@ -96,6 +108,10 @@ def __init__( # TODO(xwjiang): Legacy Ray Train trainer clean up! self.loaded_checkpoint = checkpoint self.enable_lazy_checkpointing = enable_lazy_checkpointing + self.checkpoint_keep_all_ranks = checkpoint_keep_all_ranks + self.checkpoint_upload_from_workers = checkpoint_upload_from_workers + # Only used if checkpoint_upload_from_workers is True. 
+ self.checkpoint_uri = None # Function to encode checkpoint dict before sending to the driver. if not encode_data_fn: @@ -281,17 +297,85 @@ def _report_thread_runner_error(self, block=False): except queue.Empty: pass + def _create_checkpoint_file_list(self, checkpoint: Checkpoint): + """Create an index of the folder contents + + So we know which files belong to which rank. + """ + root = checkpoint._local_path + ckpt_files = [] + for dir, _, files in os.walk(root): + # Strip the root path from the path though, since + # we are only interested in the part relative to + # the root of this checkpoint. + dir = dir[len(root) :] + for fn in files: + ckpt_files.append(os.path.join(dir, fn)) + # Write these files into the index file. + with open(os.path.join(root, _INDEX_FILE.format(self.world_rank)), "w") as f: + for fn in ckpt_files: + f.write(f"{fn}\n") + + def _remove_uploaded_checkpoint_files(self, checkpoint: Checkpoint): + """Get rid of already uploaded large checkpoint files. + + This is so they don't get shipped to the driver node. + """ + root = checkpoint._local_path + for f in os.listdir(root): + if f.endswith(_INDEX_FILE_EXTENSION): + # We will leave the index file in there so local + # checkpoint has knowledge about the cloud files. + continue + fp = os.path.join(root, f) + if os.path.isfile(fp): + os.unlink(fp) + elif os.path.isdir(fp): + shutil.rmtree(fp) + def checkpoint(self, checkpoint: Checkpoint): """Adds kwargs to the queue to be consumed by main thread. Also stores the checkpoint in ``self.loaded_checkpoint``. """ + checkpoint_type, _ = checkpoint.get_internal_representation() + + if checkpoint_type == "data_dict" and self.checkpoint_keep_all_ranks: + if log_once("keep_all_ranks_dict_checkpoint"): + logger.warning( + "Saving checkpoints from all ranks does not work with " + "dictionary checkpoints. 
Set `ray.air.CheckpointConfig" + "(_checkpoint_keep_all_ranks=False)`, or write checkpoints " + "to a directory and report directory checkpoints that " + "contain unique files per worker rank. For example, " + "use filenames that contain the unique rank. You can " + "retrieve the rank with `session.get_world_rank()` within " + "your training loop per worker." + ) + + upload_from_workers = ( + checkpoint_type == "local_path" + and self.checkpoint_upload_from_workers + and self.checkpoint_uri + ) + if upload_from_workers: + self._create_checkpoint_file_list(checkpoint) + logger.info( + f"Uploading checkpoint files from worker rank {self.world_rank} " + f"to cloud URI {self.checkpoint_uri}." + ) + # We want to upload the files directly to cloud storage, + # so that they won't need to be shipped to the driver node + # via object store. + checkpoint.to_uri(self.checkpoint_uri) + logger.info("Done uploading checkpoint files.") + self._remove_uploaded_checkpoint_files(checkpoint) # Update session checkpoint to latest checkpoint. self.loaded_checkpoint = checkpoint # Only store checkpoints on worker with rank 0. - if self.world_rank != 0: + if self.world_rank != 0 and not self.checkpoint_keep_all_ranks: checkpoint = None elif checkpoint: checkpoint = self._encode_data_fn(checkpoint) @@ -307,11 +391,20 @@ def checkpoint(self, checkpoint: Checkpoint): metadata.update({CHECKPOINT_METADATA_KEY: checkpoint._metadata}) checkpoint = str(checkpoint._local_path) + # Save the rank of the worker that created this checkpoint. + metadata.update( + { + CHECKPOINT_RANK_KEY: self.world_rank, + CHECKPOINT_DISTRIBUTED_KEY: upload_from_workers, + } + ) + result = TrainingResult( type=TrainingResultType.CHECKPOINT, data=checkpoint, metadata=metadata, ) + # Add result to a thread-safe queue. self.result_queue.put(result, block=True) @@ -319,6 +412,14 @@ def checkpoint(self, checkpoint: Checkpoint): # checkpoint has been processed. 
self.continue_lock.acquire() + def _set_checkpoint_uri(self, uri: str): + """Tell session where to save the next directory checkpoint on the cloud. + + Args: + uri: URI to the location where next checkpoint should be saved. + """ + self.checkpoint_uri = uri + def report(self, metrics: Dict, checkpoint: Optional[Checkpoint] = None) -> None: # TODO(xwjiang): tons of optimizations. diff --git a/python/ray/train/_internal/worker_group.py b/python/ray/train/_internal/worker_group.py index ac0aab8ebf2e..36dd2326f7f4 100644 --- a/python/ray/train/_internal/worker_group.py +++ b/python/ray/train/_internal/worker_group.py @@ -1,4 +1,5 @@ import logging +import os import socket from dataclasses import dataclass from typing import Callable, List, TypeVar, Optional, Dict, Type, Tuple, Union @@ -42,13 +43,15 @@ class WorkerMetadata: node_id: ID of the node this worker is on. node_ip: IP address of the node this worker is on. hostname: Hostname that this worker is on. - gpu_ids (List[int]): List of CUDA IDs available to this worker. + gpu_ids: List of CUDA IDs available to this worker. + pid: Process ID of this worker. """ node_id: str node_ip: str hostname: str gpu_ids: Optional[List[str]] + pid: int @dataclass @@ -83,9 +86,14 @@ def construct_metadata() -> WorkerMetadata: node_ip = ray.util.get_node_ip_address() hostname = socket.gethostname() gpu_ids = [str(gpu_id) for gpu_id in ray.get_gpu_ids()] + pid = os.getpid() return WorkerMetadata( - node_id=node_id, node_ip=node_ip, hostname=hostname, gpu_ids=gpu_ids + node_id=node_id, + node_ip=node_ip, + hostname=hostname, + gpu_ids=gpu_ids, + pid=pid, ) diff --git a/python/ray/train/base_trainer.py b/python/ray/train/base_trainer.py index 595dd4646c6c..3f554ddb684a 100644 --- a/python/ray/train/base_trainer.py +++ b/python/ray/train/base_trainer.py @@ -157,7 +157,7 @@ def training_loop(self): Args: scaling_config: Configuration for how to scale training. run_config: Configuration for the execution of the training run. 
- datasets: Any Ray Datasets to use for training. Use the key "train" + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed @@ -425,7 +425,8 @@ def _validate_attributes(self): dataset ): raise ValueError( - f"The Dataset under '{key}' key is not a `ray.data.Dataset`. " + f"The Dataset under '{key}' key is not a " + "`ray.data.Dataset`. " f"Received {dataset} instead." ) @@ -461,7 +462,7 @@ def _maybe_sync_down_trainer_state(cls, restore_path: str) -> Path: """Sync down trainer state from remote storage. Returns: - local_dir of the synced trainer state + str: Local directory containing the trainer state """ if not is_non_local_path_uri(restore_path): return Path(os.path.expanduser(restore_path)) / _TRAINER_PKL @@ -577,7 +578,10 @@ def fit(self) -> Result: ) else: tuner = Tuner( - trainable=trainable, param_space=param_space, run_config=self.run_config + trainable=trainable, + param_space=param_space, + run_config=self.run_config, + _trainer_api=True, ) experiment_path = Path( @@ -622,7 +626,7 @@ def _save(self, experiment_path: Union[str, Path]): of parameters can be passed in again), that parameter will be loaded from the saved copy. - Ray Datasets should not be saved as part of the state. Instead, we save the + Datasets should not be saved as part of the state. Instead, we save the keys and replace the dataset values with dummy functions that will raise an error if invoked. The error only serves as a guardrail for misuse (e.g., manually unpickling and constructing the Trainer again) diff --git a/python/ray/train/batch_predictor.py b/python/ray/train/batch_predictor.py index 745572d909e6..576d14699e5f 100644 --- a/python/ray/train/batch_predictor.py +++ b/python/ray/train/batch_predictor.py @@ -21,7 +21,7 @@ class BatchPredictor: """Batch predictor class. 
Takes a predictor class and a checkpoint and provides an interface to run - batch scoring on Ray datasets. + batch scoring on Datasets. This batch predictor wraps around a predictor class and executes it in a distributed way when calling ``predict()``. @@ -127,10 +127,10 @@ def predict( """Run batch scoring on a Dataset. .. note:: - In Ray 2.4, `BatchPredictor` is lazy by default. Use one of the Datasets consumption APIs, such as iterating through the output, to trigger the execution of prediction. + In Ray 2.4, `BatchPredictor` is lazy by default. Use one of the Dataset consumption APIs, such as iterating through the output, to trigger the execution of prediction. Args: - data: Ray dataset or pipeline to run batch prediction on. + data: Dataset or pipeline to run batch prediction on. feature_columns: List of columns in the preprocessed dataset to use for prediction. Columns not specified will be dropped from `data` before being passed to the predictor. @@ -190,7 +190,7 @@ def calculate_accuracy(df): .. testoutput:: MapBatches(ScoringWrapper) - +- Datastream(num_blocks=1, num_rows=3, schema={feature_1: int64, label: int64}) + +- Dataset(num_blocks=1, num_rows=3, schema={feature_1: int64, label: int64}) Final accuracy: 1.0 """ # noqa: E501 if num_gpus_per_worker is None: @@ -297,7 +297,7 @@ def _keep_columns_from_input_batch( return prediction_output_batch def __call__(self, input_batch: DataBatchType) -> DataBatchType: - # TODO: Delegate separate_gpu_stage flag to Datasets. + # TODO: Delegate separate_gpu_stage flag to Dataset. if self.override_prep: # Apply preprocessing before selecting feature columns. input_batch = self.override_prep.transform_batch(input_batch) @@ -330,7 +330,7 @@ def __call__(self, input_batch: DataBatchType) -> DataBatchType: preprocessor = self.get_preprocessor() override_prep = None if preprocessor: - # TODO: Delegate separate_gpu_stage flag to Datasets. + # TODO: Delegate separate_gpu_stage flag to Dataset. 
if not separate_gpu_stage and num_gpus_per_worker > 0: override_prep = preprocessor else: @@ -387,7 +387,7 @@ def predict_pipelined( to passing it `BatchPredictor.predict()`. Args: - data: Ray dataset to run batch prediction on. + data: Dataset to run batch prediction on. blocks_per_window: The window size (parallelism) in blocks. Increasing window size increases pipeline throughput, but also increases the latency to initial output, since it decreases the diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 4b6b1ac61bce..1ddba37fd238 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -49,7 +49,11 @@ # is restarted, the checkpoint_id can continue to increment. TUNE_CHECKPOINT_ID = "_current_checkpoint_id" -# Env var name + +# ================================================== +# Environment Variables +# ================================================== + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV = ( "TRAIN_RESULT_ENABLE_DETAILED_AUTOFILLED_METRICS" ) @@ -67,8 +71,23 @@ TRAIN_ENABLE_WORKER_SPREAD_ENV = "TRAIN_ENABLE_WORKER_SPREAD" +# NOTE: When adding a new environment variable, please track it in this list. +TRAIN_ENV_VARS = { + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, + TRAIN_ENABLE_WORKER_SPREAD_ENV, +} + # Blacklist virtualized networking. DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth" # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata" + +# Key for AIR Checkpoint world rank in TrainingResult metadata +CHECKPOINT_RANK_KEY = "checkpoint_rank" + + +# Key for AIR Checkpoint that gets uploaded from distributed workers. 
+CHECKPOINT_DISTRIBUTED_KEY = "distributed" diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 4f0b95f17ae5..78d4b54d7a41 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -1,8 +1,9 @@ +import copy import inspect import logging from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type, Union -from tabulate import tabulate +from ray._private.thirdparty.tabulate.tabulate import tabulate import ray from ray import tune @@ -21,7 +22,7 @@ from ray.train.trainer import BaseTrainer, GenDataset from ray.util.annotations import DeveloperAPI from ray.widgets import Template -from ray.widgets.util import ensure_notebook_deps, fallback_if_colab +from ray.widgets.util import ensure_ipywidgets_dep, repr_fallback_if_colab if TYPE_CHECKING: from ray.data.preprocessor import Preprocessor @@ -39,7 +40,8 @@ def __init__( ): self.preprocessor = preprocessor super(_DataParallelCheckpointManager, self).__init__( - run_dir=run_dir, checkpoint_strategy=checkpoint_strategy + run_dir=run_dir, + checkpoint_strategy=checkpoint_strategy, ) def _process_persistent_checkpoint(self, checkpoint: _TrackedCheckpoint): @@ -99,7 +101,7 @@ def train_loop_per_worker(): # Returns dict of last saved checkpoint. session.get_checkpoint() - # Returns the Ray Dataset shard for the given key. + # Returns the Dataset shard for the given key. session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. @@ -210,7 +212,7 @@ def __init__(self, train_loop_per_worker, my_backend_config: dataset_config: Configuration for dataset ingest. This is merged with the default dataset config for the given trainer (`cls._dataset_config`). run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use + datasets: Any Datasets to use for training. 
Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed @@ -411,6 +413,7 @@ def training_loop(self) -> None: num_gpus_per_worker=scaling_config.num_gpus_per_worker, additional_resources_per_worker=additional_resources_per_worker, max_retries=0, + checkpoint_config=self.run_config.checkpoint_config, ) checkpoint_manager = self._checkpoint_manager_cls( @@ -420,6 +423,17 @@ def training_loop(self) -> None: # Start the remote actors. backend_executor.start(initialization_hook=None) + # Disable TrainingIterator's CheckpointManager from handling + # checkpoints itself by setting num_to_keep to None. + # This is important because otherwise Trainer's CheckpointManager + # may delete a checkpoint prematurely, before the next checkpoint + # has been fully handled by Tune. + # TODO(jungong, justinvyu) : Trainer should not own a + # CheckpointManager. 
+ checkpoint_strategy = copy.deepcopy(self.run_config.checkpoint_config) + checkpoint_strategy.num_to_keep = None + checkpoint_strategy.checkpoint_score_attribute = None + training_iterator = self._training_iterator_cls( backend_executor=backend_executor, backend_config=self._backend_config, @@ -427,7 +441,8 @@ def training_loop(self) -> None: dataset_spec=self._ingest_spec, checkpoint_manager=checkpoint_manager, checkpoint=self.resume_from_checkpoint, - checkpoint_strategy=None, + checkpoint_strategy=checkpoint_strategy, + storage_path=self.run_config.storage_path, ) self._report(training_iterator) @@ -443,40 +458,61 @@ def get_dataset_config(self) -> Dict[str, DatasetConfig]: """ return self._dataset_config.copy() - @ensure_notebook_deps( - ["tabulate", None], - ["ipywidgets", "8"], - ) - @fallback_if_colab - def _ipython_display_(self): + @ensure_ipywidgets_dep("8") + @repr_fallback_if_colab + def _repr_mimebundle_(self, **kwargs): + """Return a mimebundle with an ipywidget repr and a simple text repr. + + Depending on the frontend where the data is being displayed, + different mimetypes will be used from this bundle. + See https://ipython.readthedocs.io/en/stable/config/integrating.html + for information about this method, and + https://ipywidgets.readthedocs.io/en/latest/embedding.html + for more information about the jupyter widget mimetype. + + Returns: + A mimebundle containing an ipywidget repr and a simple text repr. + """ from ipywidgets import HTML, VBox, Tab, Layout - from IPython.display import display title = HTML(f"

    {self.__class__.__name__}

    ") - children = [ - self._datasets_repr_() if self.datasets else None, - HTML(self._dataset_config_repr_html_()) if self._dataset_config else None, - HTML(self._train_loop_config_repr_html_()) - if self._train_loop_config - else None, - HTML(self.scaling_config._repr_html_()) if self.scaling_config else None, - HTML(self.run_config._repr_html_()) if self.run_config else None, - HTML(self._backend_config._repr_html_()) if self._backend_config else None, - ] - - tab = Tab( - children, - titles=[ - "Datasets", - "Dataset Config", - "Train Loop Config", - "Scaling Config", - "Run Config", - "Backend Config", - ], + children = [] + titles = [] + + if self.datasets: + children.append(self._datasets_repr_()) + titles.append("Datasets") + + if self._dataset_config: + children.append(HTML(self._dataset_config_repr_html_())) + titles.append("Dataset Config") + + if self._train_loop_config: + children.append(HTML(self._train_loop_config_repr_html_())) + titles.append("Train Loop Config") + + if self.scaling_config: + children.append(HTML(self.scaling_config._repr_html_())) + titles.append("Scaling Config") + + if self.run_config: + children.append(HTML(self.run_config._repr_html_())) + titles.append("Run Config") + + if self._backend_config: + children.append(HTML(self._backend_config._repr_html_())) + titles.append("Backend Config") + + tab = Tab(children, titles=titles) + widget = VBox([title, tab], layout=Layout(width="100%")) + bundle = widget._repr_mimebundle_(**kwargs) + bundle.update( + { + "text/plain": repr(self), + } ) - display(VBox([title, tab], layout=Layout(width="100%"))) + return bundle def _train_loop_config_repr_html_(self) -> str: if self._train_loop_config: @@ -514,7 +550,6 @@ def _dataset_config_repr_html_(self) -> str: return Template("rendered_html_common.html.j2").render(content=content) - @ensure_notebook_deps(["ipywidgets", "8"]) def _datasets_repr_(self) -> str: from ipywidgets import HTML, VBox, Layout diff --git 
a/python/ray/train/examples/huggingface/huggingface_basic_language_modeling_example.py b/python/ray/train/examples/huggingface/huggingface_basic_language_modeling_example.py index fa0817e68abc..3ab907253faa 100644 --- a/python/ray/train/examples/huggingface/huggingface_basic_language_modeling_example.py +++ b/python/ray/train/examples/huggingface/huggingface_basic_language_modeling_example.py @@ -20,7 +20,10 @@ import ray import ray.data from ray.train.batch_predictor import BatchPredictor -from ray.train.huggingface import HuggingFacePredictor, HuggingFaceTrainer +from ray.train.hf_transformers import ( + TransformersPredictor, + TransformersTrainer, +) from ray.air.config import ScalingConfig @@ -113,7 +116,7 @@ def train_function(train_dataset, eval_dataset=None, **config): ray_train = ray_train.limit(16) ray_validation = ray_validation.limit(8) - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function, scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu), datasets={"train": ray_train, "evaluation": ray_validation}, @@ -125,7 +128,7 @@ def train_function(train_dataset, eval_dataset=None, **config): prompt = ["My text: Complete me..."] predictor = BatchPredictor.from_checkpoint( results.checkpoint, - HuggingFacePredictor, + TransformersPredictor, task="text-generation", tokenizer=tokenizer, ) @@ -138,7 +141,7 @@ def train_function(train_dataset, eval_dataset=None, **config): if __name__ == "__main__": # Training settings parser = argparse.ArgumentParser( - description="Language modelling from scratch with HuggingFaceTrainer Example", + description="Language modelling from scratch with TransformersTrainer Example", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( diff --git a/python/ray/train/examples/pytorch/torch_regression_example.py b/python/ray/train/examples/pytorch/torch_regression_example.py index f3bcedfc7b07..663f4af5a621 100644 --- 
a/python/ray/train/examples/pytorch/torch_regression_example.py +++ b/python/ray/train/examples/pytorch/torch_regression_example.py @@ -29,7 +29,7 @@ def combine_x(batch): } ) - dataset = dataset.map_batches(combine_x) + dataset = dataset.map_batches(combine_x, batch_format="pandas") train_dataset, validation_dataset = dataset.repartition( num_blocks=4 ).train_test_split(split, shuffle=True) diff --git a/python/ray/train/examples/transformers/cluster.yaml b/python/ray/train/examples/transformers/cluster.yaml index 4559964fb035..72e8676e0198 100644 --- a/python/ray/train/examples/transformers/cluster.yaml +++ b/python/ray/train/examples/transformers/cluster.yaml @@ -51,7 +51,7 @@ setup_commands: - pip install ray[tune] - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - # Install HuggingFace + # Install Transformers - git clone https://github.com/huggingface/transformers || true - cd transformers && pip install -U . && diff --git a/python/ray/train/examples/transformers/transformers_example.py b/python/ray/train/examples/transformers/transformers_example.py index f0e24b24f88a..ba5718bacaa2 100644 --- a/python/ray/train/examples/transformers/transformers_example.py +++ b/python/ray/train/examples/transformers/transformers_example.py @@ -43,7 +43,7 @@ from transformers.utils.versions import require_version import ray -from ray.train.huggingface.accelerate import AccelerateTrainer +from ray.train.hf_accelerate import AccelerateTrainer from ray.air.config import ScalingConfig logger = logging.getLogger(__name__) diff --git a/python/ray/train/gbdt_trainer.py b/python/ray/train/gbdt_trainer.py index 48a19866e011..997be4230008 100644 --- a/python/ray/train/gbdt_trainer.py +++ b/python/ray/train/gbdt_trainer.py @@ -1,4 +1,5 @@ import os +import logging import warnings from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type @@ -21,6 +22,9 @@ from 
ray.data.preprocessor import Preprocessor _WARN_REPARTITION_THRESHOLD = 10 * 1024**3 +_DEFAULT_NUM_ITERATIONS = 10 + +logger = logging.getLogger(__name__) def _convert_scaling_config_to_ray_params( @@ -108,7 +112,7 @@ class GBDTTrainer(BaseTrainer): Inherited by XGBoostTrainer and LightGBMTrainer. Args: - datasets: Ray Datasets to use for training and validation. Must include a + datasets: Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if @@ -119,6 +123,7 @@ class GBDTTrainer(BaseTrainer): params: Framework specific training parameters. dmatrix_params: Dict of ``dataset name:dict of kwargs`` passed to respective :class:`xgboost_ray.RayDMatrix` initializations. + num_boost_round: Target number of boosting iterations (trees in the model). scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. 
preprocessor: A ray.data.Preprocessor to preprocess the @@ -142,6 +147,8 @@ class GBDTTrainer(BaseTrainer): _tune_callback_checkpoint_cls: type _default_ray_params: Dict[str, Any] = {"checkpoint_frequency": 1} _init_model_arg_name: str + _num_iterations_argument: str = "num_boost_round" + _default_num_iterations: int = _DEFAULT_NUM_ITERATIONS def __init__( self, @@ -150,6 +157,7 @@ def __init__( label_column: str, params: Dict[str, Any], dmatrix_params: Optional[Dict[str, Dict[str, Any]]] = None, + num_boost_round: int = _DEFAULT_NUM_ITERATIONS, scaling_config: Optional[ScalingConfig] = None, run_config: Optional[RunConfig] = None, preprocessor: Optional["Preprocessor"] = None, @@ -159,6 +167,7 @@ def __init__( self.label_column = label_column self.params = params + self.num_boost_round = num_boost_round self.train_kwargs = train_kwargs self.dmatrix_params = dmatrix_params or {} @@ -170,7 +179,7 @@ def __init__( resume_from_checkpoint=resume_from_checkpoint, ) - # Ray Datasets should always use distributed loading. + # Datasets should always use distributed loading. for dataset_name in self.datasets.keys(): dataset_params = self.dmatrix_params.get(dataset_name, {}) dataset_params["distributed"] = True @@ -226,12 +235,12 @@ def _ray_params(self) -> "xgboost_ray.RayParams": scaling_config_dataclass, self._ray_params_cls, self._default_ray_params ) - def preprocess_datasets(self) -> None: - super().preprocess_datasets() - + def _repartition_datasets_to_match_num_actors(self): # XGBoost/LightGBM-Ray requires each dataset to have at least as many # blocks as there are workers. - # TODO: Move this logic to the respective libraries + # This is only applicable for xgboost-ray<0.1.16. The version check + # is done in subclasses to ensure that xgboost-ray doesn't need to be + # imported here. 
for dataset_key, dataset in self.datasets.items(): if dataset.num_blocks() < self._ray_params.num_actors: if dataset.size_bytes() > _WARN_REPARTITION_THRESHOLD: @@ -262,6 +271,7 @@ def _checkpoint_at_end(self, model, evals_result: dict) -> None: def training_loop(self) -> None: config = self.train_kwargs.copy() + config[self._num_iterations_argument] = self.num_boost_round dmatrices = self._get_dmatrices( dmatrix_params=self.dmatrix_params, @@ -297,6 +307,22 @@ def training_loop(self) -> None: config[self._init_model_arg_name] = init_model + if init_model: + # If restoring, make sure that we only create num_boosting_round trees, + # and not init_model_trees + num_boosting_round trees + last_iteration = self._model_iteration(init_model) + num_iterations = config.get( + self._num_iterations_argument, self._default_num_iterations + ) + new_iterations = num_iterations - last_iteration + config[self._num_iterations_argument] = new_iterations + logger.warning( + f"Model loaded from checkpoint will train for " + f"additional {new_iterations} iterations (trees) in order " + "to achieve the target number of iterations " + f"({self._num_iterations_argument}={num_iterations})." 
+ ) + model = self._train( params=self.params, dtrain=train_dmatrix, diff --git a/python/ray/train/hf_accelerate/__init__.py b/python/ray/train/hf_accelerate/__init__.py new file mode 100644 index 000000000000..b84e46c0910f --- /dev/null +++ b/python/ray/train/hf_accelerate/__init__.py @@ -0,0 +1,5 @@ +from ray.train.hf_accelerate.accelerate_trainer import AccelerateTrainer + +__all__ = [ + "AccelerateTrainer", +] diff --git a/python/ray/train/huggingface/accelerate/_accelerate_utils.py b/python/ray/train/hf_accelerate/_accelerate_utils.py similarity index 97% rename from python/ray/train/huggingface/accelerate/_accelerate_utils.py rename to python/ray/train/hf_accelerate/_accelerate_utils.py index 64a362bc3867..497878df2d78 100644 --- a/python/ray/train/huggingface/accelerate/_accelerate_utils.py +++ b/python/ray/train/hf_accelerate/_accelerate_utils.py @@ -20,16 +20,10 @@ from typing import Optional, Tuple, Union import tempfile from pathlib import Path - +from packaging.version import Version from contextlib import nullcontext - -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version - import accelerate if Version(accelerate.__version__) < Version("0.17.0.dev0"): diff --git a/python/ray/train/huggingface/accelerate/accelerate_trainer.py b/python/ray/train/hf_accelerate/accelerate_trainer.py similarity index 97% rename from python/ray/train/huggingface/accelerate/accelerate_trainer.py rename to python/ray/train/hf_accelerate/accelerate_trainer.py index f0d441d23171..489788fb9b84 100644 --- a/python/ray/train/huggingface/accelerate/accelerate_trainer.py +++ b/python/ray/train/hf_accelerate/accelerate_trainer.py @@ -4,10 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Tuple, Union -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version +from packaging.version import Version import 
accelerate @@ -25,7 +22,7 @@ from ray.train.torch.config import _set_torch_distributed_env_vars try: - from ray.train.huggingface.accelerate._accelerate_utils import ( + from ray.train.hf_accelerate._accelerate_utils import ( launch_command, AccelerateDefaultNamespace, AccelerateConfigWrapper, @@ -71,7 +68,7 @@ def train_loop_per_worker(): # Get dict of last saved checkpoint. session.get_checkpoint() - # Session returns the Ray Dataset shard for the given key. + # Session returns the Dataset shard for the given key. session.get_dataset_shard("my_dataset") # Get the total number of workers executing training. @@ -125,7 +122,7 @@ def train_loop_per_worker(): import ray from ray.air import session, Checkpoint - from ray.train.huggingface.accelerate import AccelerateTrainer + from ray.train.hf_accelerate import AccelerateTrainer from ray.air.config import ScalingConfig from ray.air.config import RunConfig from ray.air.config import CheckpointConfig @@ -252,7 +249,7 @@ def train_loop_per_worker(): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. 
All datasets will be transformed diff --git a/python/ray/train/hf_transformers/__init__.py b/python/ray/train/hf_transformers/__init__.py new file mode 100644 index 000000000000..9ecd347fe0ed --- /dev/null +++ b/python/ray/train/hf_transformers/__init__.py @@ -0,0 +1,15 @@ +from ray.train.hf_transformers.transformers_checkpoint import ( + TransformersCheckpoint, +) +from ray.train.hf_transformers.transformers_predictor import ( + TransformersPredictor, +) +from ray.train.hf_transformers.transformers_trainer import ( + TransformersTrainer, +) + +__all__ = [ + "TransformersCheckpoint", + "TransformersPredictor", + "TransformersTrainer", +] diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/hf_transformers/_transformers_utils.py similarity index 85% rename from python/ray/train/huggingface/_huggingface_utils.py rename to python/ray/train/hf_transformers/_transformers_utils.py index 7aaab6ad4e73..d33f69c1a20c 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/hf_transformers/_transformers_utils.py @@ -8,7 +8,9 @@ from ray.air import session from ray.data import DataIterator -from ray.train.huggingface.huggingface_checkpoint import HuggingFaceCheckpoint +from ray.train.hf_transformers.transformers_checkpoint import ( + TransformersCheckpoint, +) if TYPE_CHECKING: from torch.utils.data import IterableDataset @@ -44,7 +46,7 @@ def get_train_dataloader(self): data_loader = super().get_train_dataloader() if isinstance( data_loader.dataset, transformers.trainer.IterableDatasetShard - ): + ) and getattr(data_loader.dataset.dataset, "_do_not_split", False): # Default Trainer.get_train_dataloader will wrap the dataset in # IterableDatasetShard, which will perform additional sharding on top # of the already sharded dataset. By setting those two attributes, @@ -59,9 +61,9 @@ def get_train_dataloader(self): return trainer -# TODO(ml-team): Replace with a Ray Datasets-HuggingFace integration when available. 
+# TODO(ml-team): Replace with a Datasets-HuggingFace integration when available. class RayDatasetHFIterable(datasets.iterable_dataset.ExamplesIterable): - """HF ExamplesIterable backed by a Ray Dataset.""" + """HF ExamplesIterable backed by a Dataset.""" def __init__(self, dataset: DataIterator) -> None: self.dataset = dataset @@ -72,10 +74,12 @@ def __init__(self, dataset: DataIterator) -> None: def __iter__(self): for row in self.generate_examples_fn(**self.kwargs): - yield (0, {k: v for k, v in row.as_pydict().items()}) + yield (0, {k: v for k, v in row.items()}) -def process_dataset_for_hf(dataset: DataIterator) -> "IterableDataset": +def process_dataset_for_hf( + dataset: DataIterator, disable_transformers_splitting: bool = False +) -> "IterableDataset": """Converts a Ray Dataset into a HF IterableDataset.""" hf_iterable = RayDatasetHFIterable(dataset) @@ -84,12 +88,15 @@ def process_dataset_for_hf(dataset: DataIterator) -> "IterableDataset": ).with_format("torch") try: - dataset_length = dataset._base_datastream.count() + dataset_length = dataset._base_dataset.count() except (ValueError, AttributeError): # pipeline case dataset_length = None iterable_dataset = maybe_add_length(iterable_dataset, dataset_length) + # Trigger logic in `wrap_transformers_trainer` to disable built-in + # HuggingFace splitting, as we have already split the dataset ourselves. 
+ iterable_dataset._do_not_split = disable_transformers_splitting return iterable_dataset @@ -99,7 +106,9 @@ def process_datasets( ) -> Tuple["IterableDataset", "IterableDataset"]: """Convert Ray train and validation to HF IterableDatasets.""" if train_dataset: - train_torch_dataset = process_dataset_for_hf(train_dataset) + train_torch_dataset = process_dataset_for_hf( + train_dataset, disable_transformers_splitting=True + ) else: train_torch_dataset = None @@ -150,8 +159,8 @@ def on_save(self, args, state, control, **kwargs): transformers.trainer.get_last_checkpoint(args.output_dir) ).absolute() if checkpoint_path: - # Use HuggingFaceCheckpoint here to avoid a warning in _TrainSession - self.delayed_report["checkpoint"] = HuggingFaceCheckpoint.from_directory( + # Use TransformersCheckpoint here to avoid a warning in _TrainSession + self.delayed_report["checkpoint"] = TransformersCheckpoint.from_directory( str(checkpoint_path) ) diff --git a/python/ray/train/hf_transformers/transformers_checkpoint.py b/python/ray/train/hf_transformers/transformers_checkpoint.py new file mode 100644 index 000000000000..3128797c4652 --- /dev/null +++ b/python/ray/train/hf_transformers/transformers_checkpoint.py @@ -0,0 +1,104 @@ +import os +from typing import TYPE_CHECKING, Type, Optional, Union + +import torch +import transformers +import transformers.modeling_utils +import transformers.trainer +import transformers.training_args +from transformers.trainer import TRAINING_ARGS_NAME, WEIGHTS_NAME + +from ray.air._internal.checkpointing import save_preprocessor_to_dir +from ray.air._internal.torch_utils import load_torch_model +from ray.air.checkpoint import Checkpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="alpha") +class TransformersCheckpoint(Checkpoint): + """A :py:class:`~ray.air.checkpoint.Checkpoint` with HuggingFace-specific + functionality. 
+ + Use ``TransformersCheckpoint.from_model`` to create this type of checkpoint. + """ + + @classmethod + def from_model( + cls, + model: Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module], + tokenizer: Optional[transformers.PreTrainedTokenizer] = None, + *, + path: os.PathLike, + preprocessor: Optional["Preprocessor"] = None, + ) -> "TransformersCheckpoint": + """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores a + HuggingFace model. + + Args: + model: The pretrained transformer or Torch model to store in the + checkpoint. + tokenizer: The Tokenizer to use in the Transformers pipeline for inference. + path: The directory where the checkpoint will be stored. + preprocessor: A fitted preprocessor to be applied before inference. + + Returns: + A :py:class:`TransformersCheckpoint` containing the specified model. + """ + if not isinstance(model, transformers.modeling_utils.PreTrainedModel): + state_dict = model.state_dict() + torch.save(state_dict, os.path.join(path, WEIGHTS_NAME)) + else: + model.save_pretrained(path) + + if tokenizer: + tokenizer.save_pretrained(path) + + if preprocessor: + save_preprocessor_to_dir(preprocessor, path) + + checkpoint = cls.from_directory(path) + + return checkpoint + + def get_model( + self, + model: Union[ + Type[transformers.modeling_utils.PreTrainedModel], torch.nn.Module + ], + **pretrained_model_kwargs, + ) -> Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module]: + """Retrieve the model stored in this checkpoint.""" + with self.as_directory() as checkpoint_path: + if isinstance(model, torch.nn.Module): + state_dict = torch.load( + os.path.join(checkpoint_path, WEIGHTS_NAME), map_location="cpu" + ) + model = load_torch_model(saved_model=state_dict, model_definition=model) + else: + model = model.from_pretrained( + checkpoint_path, **pretrained_model_kwargs + ) + return model + + def get_tokenizer( + self, + tokenizer: Type[transformers.PreTrainedTokenizer], + **kwargs, + ) -> 
Optional[transformers.PreTrainedTokenizer]: + """Create a tokenizer using the data stored in this checkpoint.""" + with self.as_directory() as checkpoint_path: + return tokenizer.from_pretrained(checkpoint_path, **kwargs) + + def get_training_arguments(self) -> transformers.training_args.TrainingArguments: + """Retrieve the training arguments stored in this checkpoint.""" + with self.as_directory() as checkpoint_path: + training_args_path = os.path.join(checkpoint_path, TRAINING_ARGS_NAME) + if os.path.exists(training_args_path): + with open(training_args_path, "rb") as f: + training_args = torch.load(f, map_location="cpu") + else: + training_args = None + return training_args diff --git a/python/ray/train/hf_transformers/transformers_predictor.py b/python/ray/train/hf_transformers/transformers_predictor.py new file mode 100644 index 000000000000..29e73273abb5 --- /dev/null +++ b/python/ray/train/hf_transformers/transformers_predictor.py @@ -0,0 +1,243 @@ +import logging +from typing import TYPE_CHECKING, List, Optional, Type, Union + +import pandas as pd +from transformers.pipelines import Pipeline +from transformers.pipelines import pipeline as pipeline_factory +from transformers.pipelines.table_question_answering import ( + TableQuestionAnsweringPipeline, +) + +from ray.air.checkpoint import Checkpoint +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.data_batch_type import DataBatchType +from ray.train.predictor import Predictor +from ray.util import log_once +from ray.util.annotations import PublicAPI + +try: + import torch + + torch_get_gpus = torch.cuda.device_count +except ImportError: + + def torch_get_gpus(): + return 0 + + +try: + import tensorflow + + def tf_get_gpus(): + return len(tensorflow.config.list_physical_devices("GPU")) + +except ImportError: + + def tf_get_gpus(): + return 0 + + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + +logger = logging.getLogger(__name__) + + +@PublicAPI(stability="alpha") +class 
TransformersPredictor(Predictor): + """A predictor for HuggingFace Transformers PyTorch models. + + This predictor uses Transformers Pipelines for inference. + + Args: + pipeline: The Transformers pipeline to use for inference. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. + """ + + def __init__( + self, + pipeline: Optional[Pipeline] = None, + preprocessor: Optional["Preprocessor"] = None, + use_gpu: bool = False, + ): + self.pipeline = pipeline + self.use_gpu = use_gpu + + num_gpus = max(torch_get_gpus(), tf_get_gpus()) + if not use_gpu and num_gpus > 0 and log_once("hf_predictor_not_using_gpu"): + logger.warning( + "You have `use_gpu` as False but there are " + f"{num_gpus} GPUs detected on host where " + "prediction will only use CPU. Please consider explicitly " + "setting `TransformersPredictor(use_gpu=True)` or " + "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " + "enable GPU prediction. Ignore if you have set `device` or " + "`device_map` arguments in the `pipeline` manually." + ) + + super().__init__(preprocessor) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(pipeline={self.pipeline!r}, " + f"preprocessor={self._preprocessor!r})" + ) + + @classmethod + def from_checkpoint( + cls, + checkpoint: Checkpoint, + *, + pipeline_cls: Optional[Type[Pipeline]] = None, + use_gpu: bool = False, + **pipeline_kwargs, + ) -> "TransformersPredictor": + """Instantiate the predictor from a Checkpoint. + + The checkpoint is expected to be a result of ``TransformersTrainer``. + + Note that the Transformers ``pipeline`` used internally expects to + recieve raw text. If you have any Preprocessors in Checkpoint + that tokenize the data, remove them by calling + ``Checkpoint.set_preprocessor(None)`` beforehand. + + Args: + checkpoint: The checkpoint to load the model, tokenizer and + preprocessor from. 
It is expected to be from the result of a + ``TransformersTrainer`` run. + pipeline_cls: A ``transformers.pipelines.Pipeline`` class to use. + If not specified, will use the ``pipeline`` abstraction + wrapper. + use_gpu: If set, the model will be moved to GPU on instantiation and + prediction happens on GPU. + **pipeline_kwargs: Any kwargs to pass to the pipeline + initialization. If ``pipeline`` is None, this must contain + the 'task' argument. Cannot contain 'model'. Can be used + to override the tokenizer with 'tokenizer'. If ``use_gpu`` is + True, 'device' will be set to 0 by default, unless 'device_map' is + passed. + """ + if not pipeline_cls and "task" not in pipeline_kwargs: + raise ValueError( + "If `pipeline_cls` is not specified, 'task' must be passed as a kwarg." + ) + if use_gpu and "device_map" not in pipeline_kwargs: + # default to using the GPU with the first index + pipeline_kwargs.setdefault("device", 0) + pipeline_cls = pipeline_cls or pipeline_factory + preprocessor = checkpoint.get_preprocessor() + with checkpoint.as_directory() as checkpoint_path: + # Tokenizer will be loaded automatically (no need to specify + # `tokenizer=checkpoint_path`) + pipeline = pipeline_cls(model=checkpoint_path, **pipeline_kwargs) + return cls( + pipeline=pipeline, + preprocessor=preprocessor, + use_gpu=use_gpu, + ) + + def _predict( + self, data: Union[list, pd.DataFrame], **pipeline_call_kwargs + ) -> pd.DataFrame: + ret = self.pipeline(data, **pipeline_call_kwargs) + # Remove unnecessary lists + try: + new_ret = [x[0] if isinstance(x, list) and len(x) == 1 else x for x in ret] + df = pd.DataFrame(new_ret) + except Exception: + # if we fail for any reason, just give up + df = pd.DataFrame(ret) + df.columns = [str(col) for col in df.columns] + return df + + @staticmethod + def _convert_data_for_pipeline( + data: pd.DataFrame, pipeline: Pipeline + ) -> Union[list, pd.DataFrame]: + """Convert the data into a format accepted by the pipeline. 
+ + In most cases, this format is a list of strings.""" + # Special case where pd.DataFrame is allowed. + if isinstance(pipeline, TableQuestionAnsweringPipeline): + # TODO(team-ml): This may be a performance bottleneck. + return data + + # Otherwise, a list of columns as lists. + columns = [data[col].to_list() for col in data.columns] + # Flatten if it's only one column. + while isinstance(columns, list) and len(columns) == 1: + columns = columns[0] + return columns + + def predict( + self, + data: DataBatchType, + feature_columns: Optional[Union[List[str], List[int]]] = None, + **predict_kwargs, + ) -> DataBatchType: + """Run inference on data batch. + + The data is converted into a list (unless ``pipeline`` is a + ``TableQuestionAnsweringPipeline``) and passed to the ``pipeline`` + object. + + Args: + data: A batch of input data. Either a pandas DataFrame or numpy + array. + feature_columns: The names or indices of the columns in the + data to use as features to predict on. If None, use all + columns. + **pipeline_call_kwargs: additional kwargs to pass to the + ``pipeline`` object. + + Examples: + >>> import pandas as pd + >>> from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + >>> from transformers.pipelines import pipeline + >>> from ray.train.hf_transformers import TransformersPredictor + >>> + >>> model_checkpoint = "gpt2" + >>> tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer" + >>> tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) + >>> + >>> model_config = AutoConfig.from_pretrained(model_checkpoint) + >>> model = AutoModelForCausalLM.from_config(model_config) + >>> predictor = TransformersPredictor( + ... pipeline=pipeline( + ... task="text-generation", model=model, tokenizer=tokenizer + ... ) + ... ) + >>> + >>> prompts = pd.DataFrame( + ... ["Complete me", "And me", "Please complete"], columns=["sentences"] + ... ) + >>> predictions = predictor.predict(prompts) + + + Returns: + Prediction result. 
+ """ + return Predictor.predict( + self, data, feature_columns=feature_columns, **predict_kwargs + ) + + def _predict_pandas( + self, + data: "pd.DataFrame", + feature_columns: Optional[List[str]] = None, + **pipeline_call_kwargs, + ) -> "pd.DataFrame": + if TENSOR_COLUMN_NAME in data: + arr = data[TENSOR_COLUMN_NAME].to_numpy() + if feature_columns: + data = pd.DataFrame(arr[:, feature_columns]) + elif feature_columns: + data = data[feature_columns] + + data = data[feature_columns] if feature_columns else data + + data = self._convert_data_for_pipeline(data, self.pipeline) + return self._predict(data, **pipeline_call_kwargs) diff --git a/python/ray/train/hf_transformers/transformers_trainer.py b/python/ray/train/hf_transformers/transformers_trainer.py new file mode 100644 index 000000000000..2df919d625d5 --- /dev/null +++ b/python/ray/train/hf_transformers/transformers_trainer.py @@ -0,0 +1,469 @@ +import importlib.util +import inspect +import os +import sys +import warnings +from packaging.version import Version +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Type + +import transformers +import transformers.modeling_utils +import transformers.trainer +import transformers.training_args +from transformers.trainer_utils import IntervalStrategy +from transformers.utils import is_datasets_available +from torch.utils.data import Dataset as TorchDataset + +from ray.air import session +from ray.air.checkpoint import Checkpoint +from ray.air.config import DatasetConfig, RunConfig, ScalingConfig +from ray.train.constants import ( + EVALUATION_DATASET_KEY, + TRAIN_DATASET_KEY, +) +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.hf_transformers._transformers_utils import ( + TrainReportCallback, + process_datasets, + wrap_transformers_trainer, +) +from ray.train.torch import TorchConfig, TorchTrainer +from ray.train.trainer import GenDataset +from ray.util import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor 
import Preprocessor + +# Due to HF Dataset's dynamic module system, we need to dynamically import the +# datasets_modules module on every actor when training. +# We accomplish this by simply running the following bit of code directly +# in module you are currently viewing. This ensures that when we +# unpickle the TransformersTrainer, it will be ran before pickle tries to +# import datasets_modules and prevents an exception from being thrown. +# Same logic is present inside HF Transformers Ray integration: +# https://github.com/huggingface/transformers/blob/\ +# 7d5fde991d598370d961be8cb7add6541e2b59ce/src/transformers/integrations.py#L271 +# Also see https://github.com/ray-project/ray/issues/28084 +if "datasets_modules" not in sys.modules and is_datasets_available(): + import datasets.load + + dynamic_modules_path = os.path.join( + datasets.load.init_dynamic_modules(), "__init__.py" + ) + # load dynamic_modules from path + spec = importlib.util.spec_from_file_location( + "datasets_modules", dynamic_modules_path + ) + datasets_modules = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = datasets_modules + spec.loader.exec_module(datasets_modules) + + +TRAINER_INIT_FN_KEY = "_trainer_init_per_worker" + + +@PublicAPI(stability="alpha") +class TransformersTrainer(TorchTrainer): + """A Trainer for data parallel HuggingFace Transformers on PyTorch training. + + This Trainer runs the ``transformers.Trainer.train()`` method on multiple + Ray Actors. The training is carried out in a distributed fashion through PyTorch + DDP. These actors already have the necessary torch process group already + configured for distributed PyTorch training. If you have PyTorch >= 1.12.0 + installed, you can also run FSDP training by specifying the ``fsdp`` argument + in ``TrainingArguments``. DeepSpeed is + also supported - see :doc:`/ray-air/examples/gptj_deepspeed_fine_tuning`. + For more information on configuring FSDP or DeepSpeed, refer to `Hugging Face + documentation `__. 
+ + The training function ran on every Actor will first run the + specified ``trainer_init_per_worker`` function to obtain an instantiated + ``transformers.Trainer`` object. The ``trainer_init_per_worker`` function + will have access to preprocessed train and evaluation datasets. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards, with each Actor training on a single shard. + All the other datasets will not be split. + + Please note that if you use a custom ``transformers.Trainer`` subclass, + the ``get_train_dataloader`` method will be wrapped around to disable + sharding by ``transformers.IterableDatasetShard``, as the dataset will + already be sharded on the Ray AIR side. + + You can also provide ``datasets.Dataset`` object or other dataset objects + allowed by ``transformers.Trainer`` directly in the ``trainer_init_per_worker`` + function, without specifying the ``datasets`` dict. It is recommended to initialize + those objects inside the function, as otherwise they will be serialized and passed + to the function, which may lead to long runtime and memory issues with large + amounts of data. In this case, the training dataset will be split + automatically by Transformers. + + HuggingFace loggers will be automatically disabled, and the ``local_rank`` + argument in ``TrainingArguments`` will be automatically set. Please note + that if you want to use CPU training, you will need to set the ``no_cuda`` + argument in ``TrainingArguments`` manually - otherwise, an exception + (segfault) may be thrown. + + This Trainer requires ``transformers>=4.19.0`` package. + It is tested with ``transformers==4.19.1``. + + Example: + .. 
code-block:: python + + # Based on + # huggingface/notebooks/examples/language_modeling_from_scratch.ipynb + + # Hugging Face imports + from datasets import load_dataset + import transformers + from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + + import ray + from ray.train.hf_transformers import TransformersTrainer + from ray.air.config import ScalingConfig + + # If using GPUs, set this to True. + use_gpu = False + + model_checkpoint = "gpt2" + tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer" + block_size = 128 + + datasets = load_dataset("wikitext", "wikitext-2-raw-v1") + tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) + + def tokenize_function(examples): + return tokenizer(examples["text"]) + + tokenized_datasets = datasets.map( + tokenize_function, batched=True, num_proc=1, remove_columns=["text"] + ) + + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = { + k: sum(examples[k], []) for k in examples.keys() + } + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model + # supported it. + # instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [ + t[i : i + block_size] + for i in range(0, total_length, block_size) + ] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + batch_size=1000, + num_proc=1, + ) + ray_train_ds = ray.data.from_huggingface(lm_datasets["train"]) + ray_evaluation_ds = ray.data.from_huggingface( + lm_datasets["validation"] + ) + + def trainer_init_per_worker(train_dataset, eval_dataset, **config): + model_config = AutoConfig.from_pretrained(model_checkpoint) + model = AutoModelForCausalLM.from_config(model_config) + args = transformers.TrainingArguments( + output_dir=f"{model_checkpoint}-wikitext2", + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + learning_rate=2e-5, + weight_decay=0.01, + no_cuda=(not use_gpu), + ) + return transformers.Trainer( + model=model, + args=args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + + scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu) + trainer = TransformersTrainer( + trainer_init_per_worker=trainer_init_per_worker, + scaling_config=scaling_config, + datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds}, + ) + result = trainer.fit() + + Args: + trainer_init_per_worker: The function that returns an instantiated + ``transformers.Trainer`` object and takes in the following arguments: + train ``Torch.Dataset``, optional evaluation ``Torch.Dataset`` + and config as kwargs. The Torch Datasets are automatically + created by converting the Ray Datasets internally before + they are passed into the function. + trainer_init_config: Configurations to pass into + ``trainer_init_per_worker`` as kwargs. + torch_config: Configuration for setting up the PyTorch backend. If set to + None, use the default configuration. This replaces the ``backend_config`` + arg of ``DataParallelTrainer``. Same as in ``TorchTrainer``. 
+ scaling_config: Configuration for how to scale data parallel training. + dataset_config: Configuration for dataset ingest. + run_config: Configuration for the execution of the training run. + datasets: Any Ray Datasets to use for training. Use + the key "train" to denote which dataset is the training + dataset and key "evaluation" to denote the evaluation + dataset. Can only contain a training dataset + and up to one extra dataset to be used for evaluation. + If a ``preprocessor`` is provided and has not already been fit, + it will be fit on the training dataset. All datasets will be + transformed by the ``preprocessor`` if one is provided. + preprocessor: A ray.data.Preprocessor to preprocess the + provided datasets. + resume_from_checkpoint: A checkpoint to resume training from. + """ + + _dataset_config = { + # training dataset should be split by us + "train": DatasetConfig(fit=True, split=True), + # do not split eval dataset, as HF has a system to parallelize + # evaluation across workers, and it requires each worker + # to have the full eval dataset + "evaluation": DatasetConfig(split=False), + } + + def __init__( + self, + trainer_init_per_worker: Callable[ + [Optional[TorchDataset], Optional[TorchDataset], Any], + transformers.trainer.Trainer, + ], + *, + trainer_init_config: Optional[Dict] = None, + torch_config: Optional[TorchConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[Dict[str, DatasetConfig]] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + preprocessor: Optional["Preprocessor"] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + + # Functionality required for TransformersTrainer only added in this + # version + if Version(transformers.__version__) < Version("4.19.0"): + raise RuntimeError( + "TransformersTrainer requires transformers>=4.19.0, but you " + f"have {transformers.__version__} which is incompatible. 
" + "Update on all nodes with `pip install -U 'transformers>=4.19.0'`." + ) + + self._validate_trainer_init_per_worker( + trainer_init_per_worker, "trainer_init_per_worker" + ) + + super().__init__( + train_loop_per_worker=_huggingface_train_loop_per_worker, + train_loop_config=self._create_trainer_init_config( + trainer_init_per_worker, trainer_init_config + ), + torch_config=torch_config, + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + preprocessor=preprocessor, + resume_from_checkpoint=resume_from_checkpoint, + ) + + @classmethod + def _create_trainer_init_config( + cls, + trainer_init_per_worker: Callable[ + [TorchDataset, Optional[TorchDataset], Any], + transformers.trainer.Trainer, + ], + trainer_init_config: Optional[Dict[str, Any]], + ) -> Dict[str, Any]: + trainer_init_config = trainer_init_config.copy() if trainer_init_config else {} + if TRAINER_INIT_FN_KEY in trainer_init_config: + raise ValueError( + f"'{TRAINER_INIT_FN_KEY}' is a reserved key in `trainer_init_config`." + ) + if trainer_init_per_worker: + trainer_init_config[TRAINER_INIT_FN_KEY] = trainer_init_per_worker + return trainer_init_config + + @classmethod + def restore( + cls: Type["TransformersTrainer"], + path: str, + trainer_init_per_worker: Optional[ + Callable[ + [TorchDataset, Optional[TorchDataset], Any], + transformers.trainer.Trainer, + ] + ] = None, + trainer_init_config: Optional[Dict] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + preprocessor: Optional["Preprocessor"] = None, + scaling_config: Optional[ScalingConfig] = None, + ) -> "TransformersTrainer": + """Restores a TransformersTrainer from a previously interrupted/failed run. + + Args: + trainer_init_per_worker: Optionally re-specified trainer init function. + This should be used to re-specify a function that is not + restorable in a new Ray cluster (e.g., it holds onto outdated + object references). 
This should be the same trainer init + that was passed to the original trainer constructor. + trainer_init_config: Optionally re-specified trainer init config. + This should similarly be used if the original `train_loop_config` + contained outdated object references, and it should not be modified + from what was originally passed in. + + See :meth:`BaseTrainer.restore() ` + for descriptions of the other arguments. + + Returns: + TransformersTrainer: A restored instance of `TransformersTrainer` + """ + return super(DataParallelTrainer, cls).restore( + path=path, + trainer_init_per_worker=trainer_init_per_worker, + trainer_init_config=trainer_init_config, + datasets=datasets, + preprocessor=preprocessor, + scaling_config=scaling_config, + ) + + def _validate_trainer_init_per_worker( + self, trainer_init_per_worker: Callable, fn_name: str + ) -> None: + num_params = len(inspect.signature(trainer_init_per_worker).parameters) + if num_params < 3: + raise ValueError( + f"{fn_name} should take in at least 3 arguments, " + f"but it accepts {num_params} arguments instead." + ) + + def _validate_attributes(self): + for key, conf in self._dataset_config.items(): + if conf.use_stream_api: + raise ValueError( + "TransformersTrainer does not support `use_stream_api`." + ) + gpus_per_worker = self.scaling_config.num_gpus_per_worker + if gpus_per_worker > 1: + raise ValueError( + f"You have assigned {gpus_per_worker} GPUs per worker. " + "This is not supported by HuggingFace, which expects " + "one GPU per worker in DDP mode and will fail " + "if more are assigned." + ) + if gpus_per_worker != int(gpus_per_worker): + raise ValueError( + f"You have assigned {gpus_per_worker} GPUs per worker, " + "but fractional GPUs are not supported by HuggingFace." 
+ ) + + super()._validate_attributes() + + +def _huggingface_train_loop_per_worker(config): + """Per-worker training loop for HuggingFace Transformers.""" + trainer_init_per_worker = config.pop("_trainer_init_per_worker") + + train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY) + eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY) + + train_torch_dataset, eval_torch_dataset = process_datasets( + train_dataset, + eval_dataset, + ) + + trainer: transformers.trainer.Trainer = trainer_init_per_worker( + train_torch_dataset, eval_torch_dataset, **config + ) + + strategies = [ + strategy + for strategy in (trainer.args.evaluation_strategy, trainer.args.save_strategy) + if strategy not in ("no", IntervalStrategy.NO) + ] + strategies = [trainer.args.logging_strategy] + strategies + if not all(strategy == strategies[0] for strategy in strategies[1:]): + raise ValueError( + "When using Ray AIR,`logging_strategy`, `evaluation_strategy` " + "and `save_strategy` must all be set to the same value. " + "`evaluation_strategy` or `save_strategy` may also be set to 'no'.\n" + f"Got `logging_strategy`={trainer.args.logging_strategy}\n" + f"`evaluation_strategy`={trainer.args.evaluation_strategy}\n" + f"`save_strategy`={trainer.args.save_strategy}" + ) + + if trainer.args.save_strategy in ("steps", IntervalStrategy.STEPS): + if ( + trainer.args.save_steps < trainer.args.logging_steps + or trainer.args.save_steps % trainer.args.logging_steps != 0 + ): + raise ValueError( + "When using 'steps' `save_strategy`, `save_steps` must be " + "equal or bigger to `logging_steps`, and must be divisible " + "by `logging_steps` (so that saving occurs at the same time " + f"logging does). Got `save_steps`={trainer.args.save_steps}, " + f"`logging_steps`={trainer.args.logging_steps}." 
+ ) + + if trainer.args.evaluation_strategy in ("steps", IntervalStrategy.STEPS): + if trainer.args.logging_steps != trainer.args.eval_steps: + raise ValueError( + "`logging_steps` must be equal to `eval_steps`. " + f"Got `logging_steps`={trainer.args.logging_steps}, " + f"`eval_steps`={trainer.args.eval_steps}" + ) + + if trainer.args.load_best_model_at_end: + raise ValueError( + "As Ray AIR replaces Transformers checkpointing, " + "`load_best_model_at_end` must be set to False.\n" + "You can obtain the AIR Checkpoint with " + "`Result.checkpoint` returned by the `fit()` method " + "of this Trainer, and the model itself by calling " + "`Checkpoint.get_model()`.\n" + "You can configure the checkpointing by setting " + "`run_config.checkpoint_config`." + ) + + if trainer.args.push_to_hub and not trainer.args.hub_token: + warnings.warn( + "You have set `push_to_hub=True` but didn't specify `hub_token`. " + "Pushing to hub will most likely fail, as the credentials will not " + "be automatically propagated from the local environment to the Ray Actors. " + "If that happens, specify `hub_token` in `TrainingArguments`." 
+ ) + + trainer = wrap_transformers_trainer(trainer) + + # ensure no HF logging callbacks are added + # aside from doubling functionality with our callbacks, + # the Wandb callback causes training to freeze + integration_callbacks = transformers.trainer.get_reporting_integration_callbacks( + trainer.args.report_to + ) + for callback in integration_callbacks: + trainer.pop_callback(callback) + + trainer.add_callback(TrainReportCallback) + + checkpoint = session.get_checkpoint() + if checkpoint: + with checkpoint.as_directory() as checkpoint_path: + trainer.train(resume_from_checkpoint=checkpoint_path) + else: + trainer.train() diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index a4209cec65a2..e6601c5fd8a3 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -57,7 +57,7 @@ def train_loop_per_worker(): # Returns dict of last saved checkpoint. session.get_checkpoint() - # Returns the Ray Dataset shard for the given key. + # Returns the Dataset shard for the given key. session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. @@ -162,7 +162,7 @@ def train_loop_per_worker(): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. 
All datasets will be transformed diff --git a/python/ray/train/huggingface/__init__.py b/python/ray/train/huggingface/__init__.py index 3c6d4932e793..5f04ff6cb8b1 100644 --- a/python/ray/train/huggingface/__init__.py +++ b/python/ray/train/huggingface/__init__.py @@ -1,5 +1,9 @@ -from ray.train.huggingface.huggingface_checkpoint import HuggingFaceCheckpoint -from ray.train.huggingface.huggingface_predictor import HuggingFacePredictor +from ray.train.huggingface.huggingface_checkpoint import ( + HuggingFaceCheckpoint, +) +from ray.train.huggingface.huggingface_predictor import ( + HuggingFacePredictor, +) from ray.train.huggingface.huggingface_trainer import ( HuggingFaceTrainer, ) diff --git a/python/ray/train/huggingface/_deprecation_msg.py b/python/ray/train/huggingface/_deprecation_msg.py new file mode 100644 index 000000000000..85622587afaf --- /dev/null +++ b/python/ray/train/huggingface/_deprecation_msg.py @@ -0,0 +1,8 @@ +deprecation_msg = ( + "`ray.train.huggingface` has been split into " + "`ray.train.hf_transformers` and `ray.train.hf_accelerate`," + " with `HuggingFaceTrainer`, `HuggingFacePredictor` and `HuggingFaceCheckpoint` " + "renamed to `TransformersTrainer`, `TransformersPredictor` and " + "`TransformersCheckpoint` respectively. Update your code to use the new import " + "paths. This will raise an exception in the future." +) diff --git a/python/ray/train/huggingface/accelerate.py b/python/ray/train/huggingface/accelerate.py new file mode 100644 index 000000000000..60a5d8c33d6d --- /dev/null +++ b/python/ray/train/huggingface/accelerate.py @@ -0,0 +1,10 @@ +import warnings + +deprecation_msg = ( + "`ray.train.huggingface.accelerate` has been renamed to " + "`ray.train.hf_accelerate`. This import path is left as an alias " + "but will be removed in the future." 
+) +warnings.warn(deprecation_msg, DeprecationWarning) + +from ray.train.hf_accelerate import * # noqa diff --git a/python/ray/train/huggingface/accelerate/__init__.py b/python/ray/train/huggingface/accelerate/__init__.py deleted file mode 100644 index 152928f54a62..000000000000 --- a/python/ray/train/huggingface/accelerate/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from ray.train.huggingface.accelerate.accelerate_trainer import AccelerateTrainer - -__all__ = [ - "AccelerateTrainer", -] diff --git a/python/ray/train/huggingface/huggingface_checkpoint.py b/python/ray/train/huggingface/huggingface_checkpoint.py index 01e64c2f742f..35539c151323 100644 --- a/python/ray/train/huggingface/huggingface_checkpoint.py +++ b/python/ray/train/huggingface/huggingface_checkpoint.py @@ -1,104 +1,22 @@ -import os -from typing import TYPE_CHECKING, Type, Optional, Union +import warnings +from ray.util.annotations import Deprecated -import torch -import transformers -import transformers.modeling_utils -import transformers.trainer -import transformers.training_args -from transformers.trainer import TRAINING_ARGS_NAME, WEIGHTS_NAME +from ray.train.hf_transformers.transformers_checkpoint import ( + TransformersCheckpoint, +) -from ray.air._internal.checkpointing import save_preprocessor_to_dir -from ray.air._internal.torch_utils import load_torch_model -from ray.air.checkpoint import Checkpoint -from ray.util.annotations import PublicAPI +from ._deprecation_msg import deprecation_msg -if TYPE_CHECKING: - from ray.data.preprocessor import Preprocessor +@Deprecated(message=deprecation_msg) +class HuggingFaceCheckpoint(TransformersCheckpoint): + # Use __new__ as it is much less likely to be overriden + # than __init__ + def __new__(cls: type, *args, **kwargs): + warnings.warn(deprecation_msg, DeprecationWarning) + return super(HuggingFaceCheckpoint, cls).__new__(cls) -@PublicAPI(stability="alpha") -class HuggingFaceCheckpoint(Checkpoint): - """A :py:class:`~ray.air.checkpoint.Checkpoint` 
with HuggingFace-specific - functionality. - Use ``HuggingFaceCheckpoint.from_model`` to create this type of checkpoint. - """ - - @classmethod - def from_model( - cls, - model: Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module], - tokenizer: Optional[transformers.PreTrainedTokenizer] = None, - *, - path: os.PathLike, - preprocessor: Optional["Preprocessor"] = None, - ) -> "HuggingFaceCheckpoint": - """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores a - HuggingFace model. - - Args: - model: The pretrained transformer or Torch model to store in the - checkpoint. - tokenizer: The Tokenizer to use in the Transformers pipeline for inference. - path: The directory where the checkpoint will be stored. - preprocessor: A fitted preprocessor to be applied before inference. - - Returns: - A :py:class:`HuggingFaceCheckpoint` containing the specified model. - """ - if not isinstance(model, transformers.modeling_utils.PreTrainedModel): - state_dict = model.state_dict() - torch.save(state_dict, os.path.join(path, WEIGHTS_NAME)) - else: - model.save_pretrained(path) - - if tokenizer: - tokenizer.save_pretrained(path) - - if preprocessor: - save_preprocessor_to_dir(preprocessor, path) - - checkpoint = cls.from_directory(path) - - return checkpoint - - def get_model( - self, - model: Union[ - Type[transformers.modeling_utils.PreTrainedModel], torch.nn.Module - ], - **pretrained_model_kwargs, - ) -> Union[transformers.modeling_utils.PreTrainedModel, torch.nn.Module]: - """Retrieve the model stored in this checkpoint.""" - with self.as_directory() as checkpoint_path: - if isinstance(model, torch.nn.Module): - state_dict = torch.load( - os.path.join(checkpoint_path, WEIGHTS_NAME), map_location="cpu" - ) - model = load_torch_model(saved_model=state_dict, model_definition=model) - else: - model = model.from_pretrained( - checkpoint_path, **pretrained_model_kwargs - ) - return model - - def get_tokenizer( - self, - tokenizer: 
Type[transformers.PreTrainedTokenizer], - **kwargs, - ) -> Optional[transformers.PreTrainedTokenizer]: - """Create a tokenizer using the data stored in this checkpoint.""" - with self.as_directory() as checkpoint_path: - return tokenizer.from_pretrained(checkpoint_path, **kwargs) - - def get_training_arguments(self) -> transformers.training_args.TrainingArguments: - """Retrieve the training arguments stored in this checkpoint.""" - with self.as_directory() as checkpoint_path: - training_args_path = os.path.join(checkpoint_path, TRAINING_ARGS_NAME) - if os.path.exists(training_args_path): - with open(training_args_path, "rb") as f: - training_args = torch.load(f, map_location="cpu") - else: - training_args = None - return training_args +__all__ = [ + "HuggingFaceCheckpoint", +] diff --git a/python/ray/train/huggingface/huggingface_predictor.py b/python/ray/train/huggingface/huggingface_predictor.py index aef519970df5..fd90557e80f5 100644 --- a/python/ray/train/huggingface/huggingface_predictor.py +++ b/python/ray/train/huggingface/huggingface_predictor.py @@ -1,243 +1,22 @@ -import logging -from typing import TYPE_CHECKING, List, Optional, Type, Union +import warnings +from ray.util.annotations import Deprecated -import pandas as pd -from transformers.pipelines import Pipeline -from transformers.pipelines import pipeline as pipeline_factory -from transformers.pipelines.table_question_answering import ( - TableQuestionAnsweringPipeline, +from ray.train.hf_transformers.transformers_predictor import ( + TransformersPredictor, ) -from ray.air.checkpoint import Checkpoint -from ray.air.constants import TENSOR_COLUMN_NAME -from ray.air.data_batch_type import DataBatchType -from ray.train.predictor import Predictor -from ray.util import log_once -from ray.util.annotations import PublicAPI +from ._deprecation_msg import deprecation_msg -try: - import torch - torch_get_gpus = torch.cuda.device_count -except ImportError: +@Deprecated(message=deprecation_msg) +class 
HuggingFacePredictor(TransformersPredictor): + # Use __new__ as it is much less likely to be overriden + # than __init__ + def __new__(cls: type, *args, **kwargs): + warnings.warn(deprecation_msg, DeprecationWarning) + return super(HuggingFacePredictor, cls).__new__(cls) - def torch_get_gpus(): - return 0 - -try: - import tensorflow - - def tf_get_gpus(): - return len(tensorflow.config.list_physical_devices("GPU")) - -except ImportError: - - def tf_get_gpus(): - return 0 - - -if TYPE_CHECKING: - from ray.data.preprocessor import Preprocessor - -logger = logging.getLogger(__name__) - - -@PublicAPI(stability="alpha") -class HuggingFacePredictor(Predictor): - """A predictor for HuggingFace Transformers PyTorch models. - - This predictor uses Transformers Pipelines for inference. - - Args: - pipeline: The Transformers pipeline to use for inference. - preprocessor: A preprocessor used to transform data batches prior - to prediction. - use_gpu: If set, the model will be moved to GPU on instantiation and - prediction happens on GPU. - """ - - def __init__( - self, - pipeline: Optional[Pipeline] = None, - preprocessor: Optional["Preprocessor"] = None, - use_gpu: bool = False, - ): - self.pipeline = pipeline - self.use_gpu = use_gpu - - num_gpus = max(torch_get_gpus(), tf_get_gpus()) - if not use_gpu and num_gpus > 0 and log_once("hf_predictor_not_using_gpu"): - logger.warning( - "You have `use_gpu` as False but there are " - f"{num_gpus} GPUs detected on host where " - "prediction will only use CPU. Please consider explicitly " - "setting `HuggingFacePredictor(use_gpu=True)` or " - "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " - "enable GPU prediction. Ignore if you have set `device` or " - "`device_map` arguments in the `pipeline` manually." 
- ) - - super().__init__(preprocessor) - - def __repr__(self): - return ( - f"{self.__class__.__name__}(pipeline={self.pipeline!r}, " - f"preprocessor={self._preprocessor!r})" - ) - - @classmethod - def from_checkpoint( - cls, - checkpoint: Checkpoint, - *, - pipeline_cls: Optional[Type[Pipeline]] = None, - use_gpu: bool = False, - **pipeline_kwargs, - ) -> "HuggingFacePredictor": - """Instantiate the predictor from a Checkpoint. - - The checkpoint is expected to be a result of ``HuggingFaceTrainer``. - - Note that the Transformers ``pipeline`` used internally expects to - recieve raw text. If you have any Preprocessors in Checkpoint - that tokenize the data, remove them by calling - ``Checkpoint.set_preprocessor(None)`` beforehand. - - Args: - checkpoint: The checkpoint to load the model, tokenizer and - preprocessor from. It is expected to be from the result of a - ``HuggingFaceTrainer`` run. - pipeline_cls: A ``transformers.pipelines.Pipeline`` class to use. - If not specified, will use the ``pipeline`` abstraction - wrapper. - use_gpu: If set, the model will be moved to GPU on instantiation and - prediction happens on GPU. - **pipeline_kwargs: Any kwargs to pass to the pipeline - initialization. If ``pipeline`` is None, this must contain - the 'task' argument. Cannot contain 'model'. Can be used - to override the tokenizer with 'tokenizer'. If ``use_gpu`` is - True, 'device' will be set to 0 by default, unless 'device_map' is - passed. - """ - if not pipeline_cls and "task" not in pipeline_kwargs: - raise ValueError( - "If `pipeline_cls` is not specified, 'task' must be passed as a kwarg." 
- ) - if use_gpu and "device_map" not in pipeline_kwargs: - # default to using the GPU with the first index - pipeline_kwargs.setdefault("device", 0) - pipeline_cls = pipeline_cls or pipeline_factory - preprocessor = checkpoint.get_preprocessor() - with checkpoint.as_directory() as checkpoint_path: - # Tokenizer will be loaded automatically (no need to specify - # `tokenizer=checkpoint_path`) - pipeline = pipeline_cls(model=checkpoint_path, **pipeline_kwargs) - return cls( - pipeline=pipeline, - preprocessor=preprocessor, - use_gpu=use_gpu, - ) - - def _predict( - self, data: Union[list, pd.DataFrame], **pipeline_call_kwargs - ) -> pd.DataFrame: - ret = self.pipeline(data, **pipeline_call_kwargs) - # Remove unnecessary lists - try: - new_ret = [x[0] if isinstance(x, list) and len(x) == 1 else x for x in ret] - df = pd.DataFrame(new_ret) - except Exception: - # if we fail for any reason, just give up - df = pd.DataFrame(ret) - df.columns = [str(col) for col in df.columns] - return df - - @staticmethod - def _convert_data_for_pipeline( - data: pd.DataFrame, pipeline: Pipeline - ) -> Union[list, pd.DataFrame]: - """Convert the data into a format accepted by the pipeline. - - In most cases, this format is a list of strings.""" - # Special case where pd.DataFrame is allowed. - if isinstance(pipeline, TableQuestionAnsweringPipeline): - # TODO(team-ml): This may be a performance bottleneck. - return data - - # Otherwise, a list of columns as lists. - columns = [data[col].to_list() for col in data.columns] - # Flatten if it's only one column. - while isinstance(columns, list) and len(columns) == 1: - columns = columns[0] - return columns - - def predict( - self, - data: DataBatchType, - feature_columns: Optional[Union[List[str], List[int]]] = None, - **predict_kwargs, - ) -> DataBatchType: - """Run inference on data batch. - - The data is converted into a list (unless ``pipeline`` is a - ``TableQuestionAnsweringPipeline``) and passed to the ``pipeline`` - object. 
- - Args: - data: A batch of input data. Either a pandas DataFrame or numpy - array. - feature_columns: The names or indices of the columns in the - data to use as features to predict on. If None, use all - columns. - **pipeline_call_kwargs: additional kwargs to pass to the - ``pipeline`` object. - - Examples: - >>> import pandas as pd - >>> from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - >>> from transformers.pipelines import pipeline - >>> from ray.train.huggingface import HuggingFacePredictor - >>> - >>> model_checkpoint = "gpt2" - >>> tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer" - >>> tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) - >>> - >>> model_config = AutoConfig.from_pretrained(model_checkpoint) - >>> model = AutoModelForCausalLM.from_config(model_config) - >>> predictor = HuggingFacePredictor( - ... pipeline=pipeline( - ... task="text-generation", model=model, tokenizer=tokenizer - ... ) - ... ) - >>> - >>> prompts = pd.DataFrame( - ... ["Complete me", "And me", "Please complete"], columns=["sentences"] - ... ) - >>> predictions = predictor.predict(prompts) - - - Returns: - Prediction result. 
- """ - return Predictor.predict( - self, data, feature_columns=feature_columns, **predict_kwargs - ) - - def _predict_pandas( - self, - data: "pd.DataFrame", - feature_columns: Optional[List[str]] = None, - **pipeline_call_kwargs, - ) -> "pd.DataFrame": - if TENSOR_COLUMN_NAME in data: - arr = data[TENSOR_COLUMN_NAME].to_numpy() - if feature_columns: - data = pd.DataFrame(arr[:, feature_columns]) - elif feature_columns: - data = data[feature_columns] - - data = data[feature_columns] if feature_columns else data - - data = self._convert_data_for_pipeline(data, self.pipeline) - return self._predict(data, **pipeline_call_kwargs) +__all__ = [ + "HuggingFacePredictor", +] diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py index a29414786f32..7255a13d262c 100644 --- a/python/ray/train/huggingface/huggingface_trainer.py +++ b/python/ray/train/huggingface/huggingface_trainer.py @@ -1,473 +1,22 @@ -import importlib.util -import inspect -import os -import sys import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Type +from ray.util.annotations import Deprecated -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version - - -import transformers -import transformers.modeling_utils -import transformers.trainer -import transformers.training_args -from transformers.trainer_utils import IntervalStrategy -from transformers.utils import is_datasets_available -from torch.utils.data import Dataset as TorchDataset - -from ray.air import session -from ray.air.checkpoint import Checkpoint -from ray.air.config import DatasetConfig, RunConfig, ScalingConfig -from ray.train.constants import ( - EVALUATION_DATASET_KEY, - TRAIN_DATASET_KEY, -) -from ray.train.data_parallel_trainer import DataParallelTrainer -from ray.train.huggingface._huggingface_utils import ( - TrainReportCallback, - process_datasets, - wrap_transformers_trainer, 
+from ray.train.hf_transformers.transformers_trainer import ( + TransformersTrainer, ) -from ray.train.torch import TorchConfig, TorchTrainer -from ray.train.trainer import GenDataset -from ray.util import PublicAPI - -if TYPE_CHECKING: - from ray.data.preprocessor import Preprocessor - -# Due to HF Dataset's dynamic module system, we need to dynamically import the -# datasets_modules module on every actor when training. -# We accomplish this by simply running the following bit of code directly -# in module you are currently viewing. This ensures that when we -# unpickle the HuggingFaceTrainer, it will be ran before pickle tries to -# import datasets_modules and prevents an exception from being thrown. -# Same logic is present inside HF Transformers Ray integration: -# https://github.com/huggingface/transformers/blob/\ -# 7d5fde991d598370d961be8cb7add6541e2b59ce/src/transformers/integrations.py#L271 -# Also see https://github.com/ray-project/ray/issues/28084 -if "datasets_modules" not in sys.modules and is_datasets_available(): - import datasets.load - - dynamic_modules_path = os.path.join( - datasets.load.init_dynamic_modules(), "__init__.py" - ) - # load dynamic_modules from path - spec = importlib.util.spec_from_file_location( - "datasets_modules", dynamic_modules_path - ) - datasets_modules = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = datasets_modules - spec.loader.exec_module(datasets_modules) - - -TRAINER_INIT_FN_KEY = "_trainer_init_per_worker" - - -@PublicAPI(stability="alpha") -class HuggingFaceTrainer(TorchTrainer): - """A Trainer for data parallel HuggingFace Transformers on PyTorch training. - - This Trainer runs the ``transformers.Trainer.train()`` method on multiple - Ray Actors. The training is carried out in a distributed fashion through PyTorch - DDP. These actors already have the necessary torch process group already - configured for distributed PyTorch training. 
If you have PyTorch >= 1.12.0 - installed, you can also run FSDP training by specifying the ``fsdp`` argument - in ``TrainingArguments``. DeepSpeed is - also supported - see :doc:`/ray-air/examples/gptj_deepspeed_fine_tuning`. - For more information on configuring FSDP or DeepSpeed, refer to `Hugging Face - documentation `__. - - The training function ran on every Actor will first run the - specified ``trainer_init_per_worker`` function to obtain an instantiated - ``transformers.Trainer`` object. The ``trainer_init_per_worker`` function - will have access to preprocessed train and evaluation datasets. - - If the ``datasets`` dict contains a training dataset (denoted by - the "train" key), then it will be split into multiple dataset - shards, with each Actor training on a single shard. - All the other datasets will not be split. - - You can also provide ``datasets.Dataset`` object or other dataset objects - allowed by ``transformers.Trainer`` directly in the ``trainer_init_per_worker`` - function, without specifying the ``datasets`` dict. It is recommended to initialize - those objects inside the function, as otherwise they will be serialized and passed - to the function, which may lead to long runtime and memory issues with large - amounts of data. - - Please note that if you use a custom ``transformers.Trainer`` subclass, - the ``get_train_dataloader`` method will be wrapped around to disable - sharding by ``transformers.IterableDatasetShard``, as the dataset will - already be sharded on the Ray AIR side. - - HuggingFace loggers will be automatically disabled, and the ``local_rank`` - argument in ``TrainingArguments`` will be automatically set. Please note - that if you want to use CPU training, you will need to set the ``no_cuda`` - argument in ``TrainingArguments`` manually - otherwise, an exception - (segfault) may be thrown. - - This Trainer requires ``transformers>=4.19.0`` package. - It is tested with ``transformers==4.19.1``. - - Example: - .. 
code-block:: python - - # Based on - # huggingface/notebooks/examples/language_modeling_from_scratch.ipynb - - # Hugging Face imports - from datasets import load_dataset - import transformers - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - - import ray - from ray.train.huggingface import HuggingFaceTrainer - from ray.air.config import ScalingConfig - - # If using GPUs, set this to True. - use_gpu = False - - model_checkpoint = "gpt2" - tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer" - block_size = 128 - - datasets = load_dataset("wikitext", "wikitext-2-raw-v1") - tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) - - def tokenize_function(examples): - return tokenizer(examples["text"]) - - tokenized_datasets = datasets.map( - tokenize_function, batched=True, num_proc=1, remove_columns=["text"] - ) - - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = { - k: sum(examples[k], []) for k in examples.keys() - } - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model - # supported it. - # instead of this drop, you can customize this part to your needs. - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
- result = { - k: [ - t[i : i + block_size] - for i in range(0, total_length, block_size) - ] - for k, t in concatenated_examples.items() - } - result["labels"] = result["input_ids"].copy() - return result - - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - batch_size=1000, - num_proc=1, - ) - ray_train_ds = ray.data.from_huggingface(lm_datasets["train"]) - ray_evaluation_ds = ray.data.from_huggingface( - lm_datasets["validation"] - ) - - def trainer_init_per_worker(train_dataset, eval_dataset, **config): - model_config = AutoConfig.from_pretrained(model_checkpoint) - model = AutoModelForCausalLM.from_config(model_config) - args = transformers.TrainingArguments( - output_dir=f"{model_checkpoint}-wikitext2", - evaluation_strategy="epoch", - save_strategy="epoch", - logging_strategy="epoch", - learning_rate=2e-5, - weight_decay=0.01, - no_cuda=(not use_gpu), - ) - return transformers.Trainer( - model=model, - args=args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - ) - - scaling_config = ScalingConfig(num_workers=3, use_gpu=use_gpu) - trainer = HuggingFaceTrainer( - trainer_init_per_worker=trainer_init_per_worker, - scaling_config=scaling_config, - datasets={"train": ray_train_ds, "evaluation": ray_evaluation_ds}, - ) - result = trainer.fit() - - Args: - trainer_init_per_worker: The function that returns an instantiated - ``transformers.Trainer`` object and takes in the following arguments: - train ``Torch.Dataset``, optional evaluation ``Torch.Dataset`` - and config as kwargs. The Torch Datasets are automatically - created by converting the Ray Datasets internally before - they are passed into the function. - trainer_init_config: Configurations to pass into - ``trainer_init_per_worker`` as kwargs. - torch_config: Configuration for setting up the PyTorch backend. If set to - None, use the default configuration. This replaces the ``backend_config`` - arg of ``DataParallelTrainer``. Same as in ``TorchTrainer``. 
- scaling_config: Configuration for how to scale data parallel training. - dataset_config: Configuration for dataset ingest. - run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use - the key "train" to denote which dataset is the training - dataset and key "evaluation" to denote the evaluation - dataset. Can only contain a training dataset - and up to one extra dataset to be used for evaluation. - If a ``preprocessor`` is provided and has not already been fit, - it will be fit on the training dataset. All datasets will be - transformed by the ``preprocessor`` if one is provided. - preprocessor: A ray.data.Preprocessor to preprocess the - provided datasets. - resume_from_checkpoint: A checkpoint to resume training from. - """ - - _dataset_config = { - # training dataset should be split by us - "train": DatasetConfig(fit=True, split=True), - # do not split eval dataset, as HF has a system to parallelize - # evaluation across workers, and it requires each worker - # to have the full eval dataset - "evaluation": DatasetConfig(split=False), - } - - def __init__( - self, - trainer_init_per_worker: Callable[ - [Optional[TorchDataset], Optional[TorchDataset], Any], - transformers.trainer.Trainer, - ], - *, - trainer_init_config: Optional[Dict] = None, - torch_config: Optional[TorchConfig] = None, - scaling_config: Optional[ScalingConfig] = None, - dataset_config: Optional[Dict[str, DatasetConfig]] = None, - run_config: Optional[RunConfig] = None, - datasets: Optional[Dict[str, GenDataset]] = None, - preprocessor: Optional["Preprocessor"] = None, - resume_from_checkpoint: Optional[Checkpoint] = None, - ): - - # Functionality required for HuggingFaceTrainer only added in this - # version - if Version(transformers.__version__) < Version("4.19.0"): - raise RuntimeError( - "HuggingFaceTrainer requires transformers>=4.19.0, but you " - f"have {transformers.__version__} which is incompatible. 
" - "Update on all nodes with `pip install -U 'transformers>=4.19.0'`." - ) - - self._validate_trainer_init_per_worker( - trainer_init_per_worker, "trainer_init_per_worker" - ) - - super().__init__( - train_loop_per_worker=_huggingface_train_loop_per_worker, - train_loop_config=self._create_trainer_init_config( - trainer_init_per_worker, trainer_init_config - ), - torch_config=torch_config, - scaling_config=scaling_config, - dataset_config=dataset_config, - run_config=run_config, - datasets=datasets, - preprocessor=preprocessor, - resume_from_checkpoint=resume_from_checkpoint, - ) - - @classmethod - def _create_trainer_init_config( - cls, - trainer_init_per_worker: Callable[ - [TorchDataset, Optional[TorchDataset], Any], - transformers.trainer.Trainer, - ], - trainer_init_config: Optional[Dict[str, Any]], - ) -> Dict[str, Any]: - trainer_init_config = trainer_init_config.copy() if trainer_init_config else {} - if TRAINER_INIT_FN_KEY in trainer_init_config: - raise ValueError( - f"'{TRAINER_INIT_FN_KEY}' is a reserved key in `trainer_init_config`." - ) - if trainer_init_per_worker: - trainer_init_config[TRAINER_INIT_FN_KEY] = trainer_init_per_worker - return trainer_init_config - - @classmethod - def restore( - cls: Type["HuggingFaceTrainer"], - path: str, - trainer_init_per_worker: Optional[ - Callable[ - [TorchDataset, Optional[TorchDataset], Any], - transformers.trainer.Trainer, - ] - ] = None, - trainer_init_config: Optional[Dict] = None, - datasets: Optional[Dict[str, GenDataset]] = None, - preprocessor: Optional["Preprocessor"] = None, - scaling_config: Optional[ScalingConfig] = None, - ) -> "HuggingFaceTrainer": - """Restores a HuggingFaceTrainer from a previously interrupted/failed run. - - Args: - trainer_init_per_worker: Optionally re-specified trainer init function. - This should be used to re-specify a function that is not - restorable in a new Ray cluster (e.g., it holds onto outdated - object references). 
This should be the same trainer init - that was passed to the original trainer constructor. - trainer_init_config: Optionally re-specified trainer init config. - This should similarly be used if the original `train_loop_config` - contained outdated object references, and it should not be modified - from what was originally passed in. - - See :meth:`BaseTrainer.restore() ` - for descriptions of the other arguments. - - Returns: - HuggingFaceTrainer: A restored instance of `HuggingFaceTrainer` - """ - return super(DataParallelTrainer, cls).restore( - path=path, - trainer_init_per_worker=trainer_init_per_worker, - trainer_init_config=trainer_init_config, - datasets=datasets, - preprocessor=preprocessor, - scaling_config=scaling_config, - ) - - def _validate_trainer_init_per_worker( - self, trainer_init_per_worker: Callable, fn_name: str - ) -> None: - num_params = len(inspect.signature(trainer_init_per_worker).parameters) - if num_params < 3: - raise ValueError( - f"{fn_name} should take in at least 3 arguments, " - f"but it accepts {num_params} arguments instead." - ) - - def _validate_attributes(self): - for key, conf in self._dataset_config.items(): - if conf.use_stream_api: - raise ValueError( - "HuggingFaceTrainer does not support `use_stream_api`." - ) - gpus_per_worker = self.scaling_config.num_gpus_per_worker - if gpus_per_worker > 1: - raise ValueError( - f"You have assigned {gpus_per_worker} GPUs per worker. " - "This is not supported by HuggingFace, which expects " - "one GPU per worker in DDP mode and will fail " - "if more are assigned." - ) - if gpus_per_worker != int(gpus_per_worker): - raise ValueError( - f"You have assigned {gpus_per_worker} GPUs per worker, " - "but fractional GPUs are not supported by HuggingFace." 
- ) - - super()._validate_attributes() - - -def _huggingface_train_loop_per_worker(config): - """Per-worker training loop for HuggingFace Transformers.""" - trainer_init_per_worker = config.pop("_trainer_init_per_worker") - - train_dataset = session.get_dataset_shard(TRAIN_DATASET_KEY) - eval_dataset = session.get_dataset_shard(EVALUATION_DATASET_KEY) - - train_torch_dataset, eval_torch_dataset = process_datasets( - train_dataset, - eval_dataset, - ) - - trainer: transformers.trainer.Trainer = trainer_init_per_worker( - train_torch_dataset, eval_torch_dataset, **config - ) - - strategies = [ - strategy - for strategy in (trainer.args.evaluation_strategy, trainer.args.save_strategy) - if strategy not in ("no", IntervalStrategy.NO) - ] - strategies = [trainer.args.logging_strategy] + strategies - if not all(strategy == strategies[0] for strategy in strategies[1:]): - raise ValueError( - "When using Ray AIR,`logging_strategy`, `evaluation_strategy` " - "and `save_strategy` must all be set to the same value. " - "`evaluation_strategy` or `save_strategy` may also be set to 'no'.\n" - f"Got `logging_strategy`={trainer.args.logging_strategy}\n" - f"`evaluation_strategy`={trainer.args.evaluation_strategy}\n" - f"`save_strategy`={trainer.args.save_strategy}" - ) - - if trainer.args.save_strategy in ("steps", IntervalStrategy.STEPS): - if ( - trainer.args.save_steps < trainer.args.logging_steps - or trainer.args.save_steps % trainer.args.logging_steps != 0 - ): - raise ValueError( - "When using 'steps' `save_strategy`, `save_steps` must be " - "equal or bigger to `logging_steps`, and must be divisible " - "by `logging_steps` (so that saving occurs at the same time " - f"logging does). Got `save_steps`={trainer.args.save_steps}, " - f"`logging_steps`={trainer.args.logging_steps}." 
- ) - - if trainer.args.evaluation_strategy in ("steps", IntervalStrategy.STEPS): - if trainer.args.logging_steps != trainer.args.eval_steps: - raise ValueError( - "`logging_steps` must be equal to `eval_steps`. " - f"Got `logging_steps`={trainer.args.logging_steps}, " - f"`eval_steps`={trainer.args.eval_steps}" - ) - - if trainer.args.load_best_model_at_end: - raise ValueError( - "As Ray AIR replaces Hugging Face checkpointing, " - "`load_best_model_at_end` must be set to False.\n" - "You can obtain the AIR Checkpoint with " - "`Result.checkpoint` returned by the `fit()` method " - "of this Trainer, and the model itself by calling " - "`Checkpoint.get_model()`.\n" - "You can configure the checkpointing by setting " - "`run_config.checkpoint_config`." - ) - if trainer.args.push_to_hub and not trainer.args.hub_token: - warnings.warn( - "You have set `push_to_hub=True` but didn't specify `hub_token`. " - "Pushing to hub will most likely fail, as the credentials will not " - "be automatically propagated from the local enviroment to the Ray Actors. " - "If that happens, specify `hub_token` in `TrainingArguments`." 
- ) +from ._deprecation_msg import deprecation_msg - trainer = wrap_transformers_trainer(trainer) - # ensure no HF logging callbacks are added - # aside from doubling functionality with our callbacks, - # the Wandb callbacks causes training to freeze - integration_callbacks = transformers.trainer.get_reporting_integration_callbacks( - trainer.args.report_to - ) - for callback in integration_callbacks: - trainer.pop_callback(callback) +@Deprecated(message=deprecation_msg) +class HuggingFaceTrainer(TransformersTrainer): + # Use __new__ as it is much less likely to be overriden + # than __init__ + def __new__(cls: type, *args, **kwargs): + warnings.warn(deprecation_msg, DeprecationWarning) + return super(HuggingFaceTrainer, cls).__new__(cls, *args, **kwargs) - trainer.add_callback(TrainReportCallback) - checkpoint = session.get_checkpoint() - if checkpoint: - with checkpoint.as_directory() as checkpoint_path: - trainer.train(resume_from_checkpoint=checkpoint_path) - else: - trainer.train() +__all__ = [ + "HuggingFaceTrainer", +] diff --git a/python/ray/train/lightgbm/lightgbm_trainer.py b/python/ray/train/lightgbm/lightgbm_trainer.py index 6ae4fe6f0f01..38ebe60f9cd8 100644 --- a/python/ray/train/lightgbm/lightgbm_trainer.py +++ b/python/ray/train/lightgbm/lightgbm_trainer.py @@ -1,4 +1,9 @@ -from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING +from typing import Dict, Any, Optional, Tuple, Union, TYPE_CHECKING + +try: + from packaging.version import Version +except ImportError: + from distutils.version import LooseVersion as Version from ray.air.checkpoint import Checkpoint from ray.train.gbdt_trainer import GBDTTrainer @@ -7,6 +12,7 @@ import lightgbm import lightgbm_ray +import xgboost_ray from lightgbm_ray.tune import TuneReportCheckpointCallback, TuneReportCallback if TYPE_CHECKING: @@ -50,7 +56,7 @@ class LightGBMTrainer(GBDTTrainer): result = trainer.fit() Args: - datasets: Ray Datasets to use for training and validation. 
Must include a + datasets: Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if @@ -65,6 +71,11 @@ class LightGBMTrainer(GBDTTrainer): :class:`xgboost_ray.RayDMatrix` initializations, which in turn are passed to ``lightgbm.Dataset`` objects created on each worker. For example, this can be used to add sample weights with the ``weights`` parameter. + num_boost_round: Target number of boosting iterations (trees in the model). + Note that unlike in ``lightgbm.train``, this is the target number + of trees, meaning that if you set ``num_boost_round=10`` and pass a model + that has already been trained for 5 iterations, it will be trained for 5 + iterations more, instead of 10 more. scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. preprocessor: A ray.data.Preprocessor to preprocess the @@ -100,5 +111,18 @@ def _load_checkpoint( def _save_model(self, model: lightgbm.LGBMModel, path: str): model.booster_.save_model(path) - def _model_iteration(self, model: lightgbm.LGBMModel) -> int: + def _model_iteration( + self, model: Union[lightgbm.LGBMModel, lightgbm.Booster] + ) -> int: + if isinstance(model, lightgbm.Booster): + return model.current_iteration() return model.booster_.current_iteration() + + def preprocess_datasets(self) -> None: + super().preprocess_datasets() + + # XGBoost/LightGBM-Ray requires each dataset to have at least as many + # blocks as there are workers. 
+ # This is only applicable for xgboost-ray<0.1.16 + if Version(xgboost_ray.__version__) < Version("0.1.16"): + self._repartition_datasets_to_match_num_actors() diff --git a/python/ray/train/lightning/_lightning_utils.py b/python/ray/train/lightning/_lightning_utils.py index 4d2c90987b3d..ddd8629bc28e 100644 --- a/python/ray/train/lightning/_lightning_utils.py +++ b/python/ray/train/lightning/_lightning_utils.py @@ -1,26 +1,38 @@ +import ray +from ray.air import session +from ray.air.constants import MODEL_KEY +from ray.data.dataset import DataIterator +from ray.train.lightning.lightning_checkpoint import LightningCheckpoint + import logging import shutil import torch import tempfile from packaging.version import Version from typing import Any, Dict, Optional +from torch.utils.data import IterableDataset, DataLoader import pytorch_lightning as pl from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.plugins.environments import LightningEnvironment from pytorch_lightning.strategies import DDPStrategy -if Version(pl.__version__) >= Version("2.0.0"): +_LIGHTNING_GREATER_EQUAL_2_0 = Version(pl.__version__) >= Version("2.0.0") +_TORCH_GREATER_EQUAL_1_12 = Version(torch.__version__) >= Version("1.12.0") +_TORCH_FSDP_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.distributed.is_available() + +if _LIGHTNING_GREATER_EQUAL_2_0: from pytorch_lightning.strategies import FSDPStrategy else: from pytorch_lightning.strategies import DDPFullyShardedStrategy as FSDPStrategy -import ray -from ray.air import session -from ray.air.constants import MODEL_KEY -from ray.train.lightning.lightning_checkpoint import LightningCheckpoint -from torch.utils.data import IterableDataset, DataLoader -from ray.data.datastream import DataIterator +if _TORCH_FSDP_AVAILABLE: + from torch.distributed.fsdp import ( + FullStateDictConfig, + FullyShardedDataParallel, + StateDictType, + ) + logger = logging.getLogger(__name__) @@ -65,6 +77,25 @@ def distributed_sampler_kwargs(self) 
-> Dict[str, Any]: rank=self.global_rank, ) + def lightning_module_state_dict(self) -> Dict[str, Any]: + """Gathers the full state dict to rank 0 on CPU.""" + assert self.model is not None, "Failed to get the state dict for a None model!" + + if _LIGHTNING_GREATER_EQUAL_2_0 and _TORCH_FSDP_AVAILABLE: + with FullyShardedDataParallel.state_dict_type( + module=self.model, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=FullStateDictConfig( + offload_to_cpu=True, rank0_only=True + ), + ): + state_dict = self.model.state_dict() + prefix_len = len("_forward_module.") + return {k[prefix_len:]: v for k, v in state_dict.items()} + else: + # Otherwise Lightning uses Fairscale FSDP, no need to unshard by ourself. + return super().lightning_module_state_dict() + class RayEnvironment(LightningEnvironment): """Setup Lightning DDP training environment for Ray cluster.""" diff --git a/python/ray/train/lightning/lightning_trainer.py b/python/ray/train/lightning/lightning_trainer.py index 3ec7b14db04c..dd6c4a9969d9 100644 --- a/python/ray/train/lightning/lightning_trainer.py +++ b/python/ray/train/lightning/lightning_trainer.py @@ -1,16 +1,10 @@ import os +import pytorch_lightning as pl + from inspect import isclass from typing import Any, Dict, Optional, Type -import pytorch_lightning as pl from pytorch_lightning.plugins.environments import ClusterEnvironment -from packaging.version import Version - -if Version(pl.__version__) >= Version("2.0.0"): - from pytorch_lightning.callbacks.progress import ProgressBar as ProgressBarBase -else: - from pytorch_lightning.callbacks.progress.base import ProgressBarBase - from ray.air import session from ray.air.config import CheckpointConfig, DatasetConfig, RunConfig, ScalingConfig from ray.air.constants import MODEL_KEY @@ -226,7 +220,7 @@ class LightningTrainer(TorchTrainer): ``pytorch_lightning.LightningModule`` using the arguments provided in ``LightningConfigBuilder.module()``. 
- For data ingestion, the LightningTrainer will then either convert the Ray Dataset + For data ingestion, the LightningTrainer will then either convert the Dataset shards to a ``pytorch_lightning.LightningDataModule``, or directly use the datamodule or dataloaders if provided by users. @@ -348,18 +342,19 @@ def configure_optimizers(self): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - datasets: A dictionary of Ray Datasets to use for training. + datasets: A dictionary of Datasets to use for training. Use the key "train" to denote which dataset is the training dataset and (optionally) key "val" to denote the validation dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if one is provided. - datasets_iter_config: Configurations for iterating over input Ray datasets. + datasets_iter_config: Configurations for iterating over input Datasets. This configuration is only valid when `datasets` argument is provided to the LightningTrainer. Otherwise, LightningTrainer will use datamodule or dataloaders specified in ``LightningConfig.trainer_init_config``. For valid arguments to pass, please refer to: - :py:meth:`Dataset.iter_torch_batches ` + :py:meth:`Dataset.iter_torch_batches + ` preprocessor: A ray.data.Preprocessor to preprocess the provided datasets. resume_from_checkpoint: A checkpoint to resume training from. @@ -488,13 +483,13 @@ def _lightning_train_loop_per_worker(config): if not (train_dataloaders or datamodule or train_ray_dataset): raise RuntimeError( "Please provide at least one of the following data inputs: " - "train_dataloaders, datamodule, or Ray Datasets with key 'train'." + "train_dataloaders, datamodule, or Datasets with key 'train'." 
) if train_ray_dataset: if datamodule: logger.warning( - "Using Ray datasets as primary input. The 'datamodule' defined in " + "Using Datasets as primary input. The 'datamodule' defined in " "'LightningConfig.trainer_fit_params' is ignored!" ) @@ -508,13 +503,6 @@ def _lightning_train_loop_per_worker(config): lightning_module = module_class(**module_init_config) # Prepare Lightning Trainer - # Disable the Lightning progress bar to avoid corrupted AIR outputs, - # unless users provide a customized progress bar callback. - trainer_config["enable_progress_bar"] = any( - isinstance(callback, ProgressBarBase) - for callback in trainer_config.get("callbacks", []) - ) - # Setup trainer's parallel devices if trainer_config.get("accelerator", None) == "gpu": current_device = get_worker_root_device() diff --git a/python/ray/train/mosaic/mosaic_trainer.py b/python/ray/train/mosaic/mosaic_trainer.py index cef1789616b0..d9c0ce5a6eb2 100644 --- a/python/ray/train/mosaic/mosaic_trainer.py +++ b/python/ray/train/mosaic/mosaic_trainer.py @@ -108,7 +108,7 @@ class MosaicTrainer(TorchTrainer): ``composer.Trainer`` object and takes in configuration dictionary (``config``) as an argument. This dictionary is based on ``trainer_init_config`` and is modified for Ray - Composer integration. - datasets: Any Ray Datasets to use for training. At the moment, we do not support + datasets: Any Datasets to use for training. At the moment, we do not support passing datasets to the trainer and using the dataset shards in the trainer loop. Instead, configure and load the datasets inside ``trainer_init_per_worker`` function diff --git a/python/ray/train/rl/rl_trainer.py b/python/ray/train/rl/rl_trainer.py index c6feb79fedc4..71b41779d5af 100644 --- a/python/ray/train/rl/rl_trainer.py +++ b/python/ray/train/rl/rl_trainer.py @@ -37,7 +37,7 @@ class RLTrainer(BaseTrainer): (e.g. ``"PPO"``) or a RLlib trainer class. scaling_config: Configuration for how to scale training. 
run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use the key "train" + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed diff --git a/python/ray/train/session.py b/python/ray/train/session.py index ca46d3e2f79d..7b53116e6254 100644 --- a/python/ray/train/session.py +++ b/python/ray/train/session.py @@ -81,7 +81,7 @@ def get_dataset_shard( if shard is None: warnings.warn( "No dataset passed in. Returning None. Make sure to " - "pass in a Ray Dataset to Trainer.run to use this " + "pass in a Dataset to Trainer.run to use this " "function." ) elif isinstance(shard, dict): diff --git a/python/ray/train/sklearn/sklearn_trainer.py b/python/ray/train/sklearn/sklearn_trainer.py index e31205783a4a..60dbb9c85ac1 100644 --- a/python/ray/train/sklearn/sklearn_trainer.py +++ b/python/ray/train/sklearn/sklearn_trainer.py @@ -84,7 +84,7 @@ class SklearnTrainer(BaseTrainer): Args: estimator: A scikit-learn compatible estimator to use. - datasets: Ray Datasets to use for training and validation. Must include a + datasets: Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index 6b2ef8609df1..28ea7f8a2c74 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -65,7 +65,7 @@ def train_loop_per_worker(): # Returns dict of last saved checkpoint. session.get_checkpoint() - # Returns the Ray Dataset shard for the given key. 
+ # Returns the Dataset shard for the given key. session.get_dataset_shard("my_dataset") # Returns the total number of workers executing training. @@ -154,7 +154,7 @@ def train_loop_per_worker(config): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed diff --git a/python/ray/train/tests/lightning_test_utils.py b/python/ray/train/tests/lightning_test_utils.py index fcf37af1becc..36288308cf59 100644 --- a/python/ray/train/tests/lightning_test_utils.py +++ b/python/ray/train/tests/lightning_test_utils.py @@ -7,11 +7,16 @@ class LinearModule(pl.LightningModule): - def __init__(self, input_dim, output_dim) -> None: + def __init__(self, input_dim, output_dim, strategy="ddp") -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim) + self.loss = [] + self.strategy = strategy def forward(self, input): + # Backwards compat for Ray data strict mode. 
+ if isinstance(input, dict) and len(input) == 1: + input = list(input.values())[0] return self.linear(input) def training_step(self, batch): @@ -22,17 +27,23 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + def on_validation_epoch_end(self) -> None: + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return self.forward(batch) def configure_optimizers(self): - return torch.optim.SGD(self.parameters(), lr=0.1) + if self.strategy == "fsdp": + # Feed FSDP wrapped model parameters to optimizer + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + else: + return torch.optim.SGD(self.parameters(), lr=0.1) class DoubleLinearModule(pl.LightningModule): @@ -40,6 +51,7 @@ def __init__(self, input_dim_1, input_dim_2, output_dim) -> None: super().__init__() self.linear_1 = nn.Linear(input_dim_1, output_dim) self.linear_2 = nn.Linear(input_dim_2, output_dim) + self.loss = [] def forward(self, batch): input_1 = batch["input_1"] @@ -54,12 +66,14 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: + def on_validation_epoch_end(self) -> None: print("Validation Epoch:", self.current_epoch) - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return self.forward(batch) @@ -91,7 +105,9 @@ def __init__(self, lr: float, layer_1: int, layer_2: int): self.layer_1 = torch.nn.Linear(28 * 28, layer_1) self.layer_2 = torch.nn.Linear(layer_1, layer_2) 
self.layer_3 = torch.nn.Linear(layer_2, 10) - self.accuracy = Accuracy() + self.accuracy = Accuracy(task="multiclass", num_classes=10) + self.val_acc_list = [] + self.val_loss_list = [] def forward(self, x): batch_size, channels, width, height = x.size() @@ -121,13 +137,17 @@ def validation_step(self, val_batch, batch_idx): logits = self.forward(x) loss = F.nll_loss(logits, y) acc = self.accuracy(logits, y) + self.val_acc_list.append(acc) + self.val_loss_list.append(loss) return {"val_loss": loss, "val_accuracy": acc} - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() - avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() + def on_validation_epoch_end(self): + avg_loss = torch.stack(self.val_loss_list).mean() + avg_acc = torch.stack(self.val_acc_list).mean() self.log("ptl/val_loss", avg_loss) self.log("ptl/val_accuracy", avg_acc) + self.val_acc_list.clear() + self.val_loss_list.clear() def predict_step(self, batch, batch_idx, dataloader_idx=None): x = batch diff --git a/python/ray/train/tests/test_accelerate_trainer_gpu.py b/python/ray/train/tests/test_accelerate_trainer_gpu.py index f71f3a90a76e..a57290aea070 100644 --- a/python/ray/train/tests/test_accelerate_trainer_gpu.py +++ b/python/ray/train/tests/test_accelerate_trainer_gpu.py @@ -11,7 +11,7 @@ from ray.air import session from ray.train.tests.dummy_preprocessor import DummyPreprocessor from ray.train.torch.torch_checkpoint import TorchCheckpoint -from ray.train.huggingface.accelerate import AccelerateTrainer +from ray.train.hf_accelerate import AccelerateTrainer from accelerate import Accelerator ACCELERATE_CONFIG_CPU = """compute_environment: LOCAL_MACHINE diff --git a/python/ray/train/tests/test_backend.py b/python/ray/train/tests/test_backend.py index 1667dc8678a5..7508a8d51eb5 100644 --- a/python/ray/train/tests/test_backend.py +++ b/python/ray/train/tests/test_backend.py @@ -71,7 +71,11 @@ def mock_add_workers(self, num_workers): 
original_add_workers(self, num_workers) for i, worker in enumerate(self.workers): metadata = WorkerMetadata( - node_id=0, node_ip=str(i % 2), hostname=0, gpu_ids=[0] + node_id=0, + node_ip=str(i % 2), + hostname=0, + gpu_ids=[0], + pid=0, ) worker.metadata = metadata diff --git a/python/ray/train/tests/test_base_trainer.py b/python/ray/train/tests/test_base_trainer.py index ad9d7a4ab9a7..505b3bef039a 100644 --- a/python/ray/train/tests/test_base_trainer.py +++ b/python/ray/train/tests/test_base_trainer.py @@ -60,7 +60,7 @@ def fit(self, ds): self.fit_counter += 1 def transform(self, ds): - return ds.map(lambda x: x + 1) + return ds.map(lambda x: {"item": x["item"] + 1}) class DummyTrainer(BaseTrainer): @@ -102,7 +102,7 @@ def test_preprocess_datasets(ray_start_4_cpus): ctx.execution_options.preserve_order = True def training_loop(self): - assert self.datasets["my_dataset"].take() == [2, 3, 4] + assert self.datasets["my_dataset"].take_batch()["item"].tolist() == [2, 3, 4] datasets = {"my_dataset": ray.data.from_items([1, 2, 3])} trainer = DummyTrainer( @@ -144,8 +144,8 @@ def training_loop(self): # Fit was only called once. assert self.preprocessor.fit_counter == 1 # Datasets should all be transformed. - assert self.datasets["train"].take() == [2, 3, 4] - assert self.datasets["my_dataset"].take() == [2, 3, 4] + assert self.datasets["train"].take_batch()["item"].tolist() == [2, 3, 4] + assert self.datasets["my_dataset"].take_batch()["item"].tolist() == [2, 3, 4] if gen_dataset: datasets = { @@ -168,8 +168,8 @@ def training_loop(self): # Make sure fit is not called if preprocessor is already fit. assert self.preprocessor.fit_counter == 1 # Datasets should all be transformed. 
- assert self.datasets["train"].take() == [2, 3, 4] - assert self.datasets["my_dataset"].take() == [2, 3, 4] + assert self.datasets["train"].take_batch()["item"].tolist() == [2, 3, 4] + assert self.datasets["my_dataset"].take_batch()["item"].tolist() == [2, 3, 4] datasets = { "train": ray.data.from_items([1, 2, 3]), diff --git a/python/ray/train/tests/test_batch_predictor.py b/python/ray/train/tests/test_batch_predictor.py index 7b8175040108..a404ea5f0f5e 100644 --- a/python/ray/train/tests/test_batch_predictor.py +++ b/python/ray/train/tests/test_batch_predictor.py @@ -110,7 +110,7 @@ def test_separate_gpu_stage(shutdown_only): DummyPredictor, ) ds = batch_predictor.predict( - ray.data.range_table(10), + ray.data.range(10), num_gpus_per_worker=1, separate_gpu_stage=True, allow_gpu=True, @@ -118,10 +118,10 @@ def test_separate_gpu_stage(shutdown_only): stats = ds.stats() assert "Stage 1 ReadRange->DummyPreprocessor:" in stats, stats assert "Stage 2 MapBatches(ScoringWrapper):" in stats, stats - assert ds.max("value") == 36.0, ds + assert ds.max("id") == 36.0, ds ds = batch_predictor.predict( - ray.data.range_table(10), + ray.data.range(10), num_gpus_per_worker=1, separate_gpu_stage=False, allow_gpu=True, @@ -129,7 +129,7 @@ def test_separate_gpu_stage(shutdown_only): stats = ds.stats() assert "Stage 1 ReadRange:" in stats, stats assert "Stage 2 MapBatches(ScoringWrapper):" in stats, stats - assert ds.max("value") == 36.0, ds + assert ds.max("id") == 36.0, ds def test_automatic_enable_gpu_from_num_gpus_per_worker(shutdown_only): @@ -143,7 +143,7 @@ def test_automatic_enable_gpu_from_num_gpus_per_worker(shutdown_only): Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: DummyPreprocessor()}), DummyPredictor, ) - test_dataset = ray.data.range_table(4) + test_dataset = ray.data.range(4) with pytest.raises( ValueError, match="DummyPredictor does not support GPU prediction" @@ -157,7 +157,7 @@ def test_batch_prediction(): DummyPredictor, ) - test_dataset = 
ray.data.range_table(4) + test_dataset = ray.data.range(4) ds = batch_predictor.predict(test_dataset).materialize() # Check fusion occurred. assert "ReadRange->DummyPreprocessor" in ds.stats(), ds.stats() @@ -168,7 +168,7 @@ def test_batch_prediction(): 12.0, ] - test_dataset = ray.data.range_table(4) + test_dataset = ray.data.range(4) assert next( batch_predictor.predict_pipelined( test_dataset, blocks_per_window=2 @@ -406,12 +406,12 @@ def test_batch_predictor_transform_config(): def check_batch(batch): assert isinstance(batch, dict) - assert isinstance(batch["value"], np.ndarray) - assert len(batch["value"]) == batch_size + assert isinstance(batch["id"], np.ndarray) + assert len(batch["id"]) == batch_size return batch prep = BatchMapper(check_batch, batch_format="numpy", batch_size=2) - ds = ray.data.range_table(6, parallelism=1) + ds = ray.data.range(6, parallelism=1) batch_predictor = BatchPredictor.from_checkpoint( Checkpoint.from_dict({"factor": 2.0, PREPROCESSOR_KEY: prep}), @@ -421,7 +421,7 @@ def check_batch(batch): batch_predictor.predict(ds) # Pipelined case. 
- ds = ray.data.range_table(6, parallelism=1) + ds = ray.data.range(6, parallelism=1) batch_predictor.predict_pipelined(ds, blocks_per_window=1) @@ -497,7 +497,7 @@ def test_get_and_set_preprocessor(): ) assert batch_predictor.get_preprocessor() == preprocessor - test_dataset = ray.data.range_table(4) + test_dataset = ray.data.range(4) output_ds = batch_predictor.predict(test_dataset) assert output_ds.to_pandas().to_numpy().squeeze().tolist() == [ 0.0, @@ -561,26 +561,26 @@ def test_separate_gpu_stage_pipelined(shutdown_only): DummyPredictor, ) ds = batch_predictor.predict_pipelined( - ray.data.range_table(5), + ray.data.range(5), blocks_per_window=1, num_gpus_per_worker=1, separate_gpu_stage=True, allow_gpu=True, ) - out = [x["value"] for x in ds.iter_rows()] + out = [x["id"] for x in ds.iter_rows()] stats = ds.stats() assert "Stage 1 ReadRange->DummyPreprocessor:" in stats, stats assert "Stage 2 MapBatches(ScoringWrapper):" in stats, stats assert max(out) == 16.0, out ds = batch_predictor.predict_pipelined( - ray.data.range_table(5), + ray.data.range(5), blocks_per_window=1, num_gpus_per_worker=1, separate_gpu_stage=False, allow_gpu=True, ) - out = [x["value"] for x in ds.iter_rows()] + out = [x["id"] for x in ds.iter_rows()] stats = ds.stats() assert "Stage 1 ReadRange:" in stats, stats assert "Stage 2 MapBatches(ScoringWrapper):" in stats, stats diff --git a/python/ray/train/tests/test_checkpoints.py b/python/ray/train/tests/test_checkpoints.py index b99f42501e96..c5447c2e7a2c 100644 --- a/python/ray/train/tests/test_checkpoints.py +++ b/python/ray/train/tests/test_checkpoints.py @@ -3,7 +3,7 @@ import pytest from ray.air.constants import MAX_REPR_LENGTH -from ray.train.huggingface import HuggingFaceCheckpoint +from ray.train.hf_transformers import TransformersCheckpoint from ray.train.lightgbm import LightGBMCheckpoint from ray.train.rl import RLCheckpoint from ray.train.sklearn import SklearnCheckpoint @@ -15,7 +15,7 @@ @pytest.mark.parametrize( "checkpoint", 
[ - HuggingFaceCheckpoint(data_dict={"foo": "bar"}), + TransformersCheckpoint(data_dict={"foo": "bar"}), LightGBMCheckpoint(data_dict={"foo": "bar"}), RLCheckpoint(data_dict={"foo": "bar"}), SklearnCheckpoint(data_dict={"foo": "bar"}), diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py index 1b0577a227af..6d46b0e976ab 100644 --- a/python/ray/train/tests/test_gpu.py +++ b/python/ray/train/tests/test_gpu.py @@ -371,16 +371,16 @@ def test_torch_iter_torch_batches_auto_device(ray_start_4_cpus_2_gpus, use_gpu): def train_fn(): dataset = session.get_dataset_shard("train") for batch in dataset.iter_torch_batches(dtypes=torch.float, device="cpu"): - assert str(batch.device) == "cpu" + assert str(batch["data"].device) == "cpu" # Autodetect for batch in dataset.iter_torch_batches(dtypes=torch.float): - assert str(batch.device) == str(train.torch.get_device()) + assert str(batch["data"].device) == str(train.torch.get_device()) dataset = ray.data.from_numpy(np.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).T) # Test that this works outside a Train function for batch in dataset.iter_torch_batches(dtypes=torch.float, device="cpu"): - assert str(batch.device) == "cpu" + assert str(batch["data"].device) == "cpu" trainer = TorchTrainer( train_fn, diff --git a/python/ray/train/tests/test_lightgbm_predictor.py b/python/ray/train/tests/test_lightgbm_predictor.py index 5a6d18f7fdce..61bf416ec4fa 100644 --- a/python/ray/train/tests/test_lightgbm_predictor.py +++ b/python/ray/train/tests/test_lightgbm_predictor.py @@ -79,7 +79,10 @@ def test_predict_batch(ray_start_4_cpus, batch_type): data_batch = _convert_pandas_to_batch_type(raw_batch, type=TYPE_TO_ENUM[batch_type]) if batch_type == np.ndarray: + # TODO(ekl) how do we fix this to work with "data" column? 
dataset = ray.data.from_numpy(dummy_data) + dataset = dataset.add_column("__value__", lambda b: b["data"]) + dataset = dataset.drop_columns(["data"]) elif batch_type == pd.DataFrame: dataset = ray.data.from_pandas(data_batch) elif batch_type == pa.Table: diff --git a/python/ray/train/tests/test_lightgbm_trainer.py b/python/ray/train/tests/test_lightgbm_trainer.py index 430a74ecc48e..37e9ce84b9c1 100644 --- a/python/ray/train/tests/test_lightgbm_trainer.py +++ b/python/ray/train/tests/test_lightgbm_trainer.py @@ -100,7 +100,7 @@ def test_resume_from_checkpoint(ray_start_6_cpus, tmpdir): scaling_config=scale_config, label_column="target", params=params, - num_boost_round=5, + num_boost_round=10, datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset}, resume_from_checkpoint=resume_from, ) diff --git a/python/ray/train/tests/test_lightning_checkpoint.py b/python/ray/train/tests/test_lightning_checkpoint.py index e253bb2a8b85..5109fb0a051b 100644 --- a/python/ray/train/tests/test_lightning_checkpoint.py +++ b/python/ray/train/tests/test_lightning_checkpoint.py @@ -4,9 +4,15 @@ import torch.nn as nn import tempfile -from ray.train.lightning import LightningCheckpoint +import ray from ray.air.constants import MODEL_KEY from torch.utils.data import DataLoader +from ray.train.tests.lightning_test_utils import LinearModule, DummyDataModule +from ray.train.lightning import ( + LightningCheckpoint, + LightningConfigBuilder, + LightningTrainer, +) class Net(pl.LightningModule): @@ -38,7 +44,10 @@ def test_load_from_path(): # Train one epoch and save a checkpoint trainer = pl.Trainer( - max_epochs=1, enable_progress_bar=False, enable_checkpointing=False + max_epochs=1, + accelerator="cpu", + enable_progress_bar=False, + enable_checkpointing=False, ) trainer.fit(model=model, train_dataloaders=dataloader) ckpt_path = f"{tmpdir}/random_checkpoint_name.ckpt" @@ -75,7 +84,10 @@ def test_from_directory(): # Train one epoch and save a checkpoint trainer = pl.Trainer( - 
max_epochs=1, enable_progress_bar=False, enable_checkpointing=False + max_epochs=1, + accelerator="cpu", + enable_progress_bar=False, + enable_checkpointing=False, ) trainer.fit(model=model, train_dataloaders=dataloader) trainer.save_checkpoint(f"{tmpdir}/{MODEL_KEY}") @@ -94,6 +106,42 @@ def test_from_directory(): assert torch.equal(output, checkpoint_output) +def test_fsdp_checkpoint(): + num_epochs = 1 + batch_size = 8 + input_dim = 32 + output_dim = 4 + dataset_size = 256 + + datamodule = DummyDataModule(batch_size, dataset_size) + + config_builder = ( + LightningConfigBuilder() + .module( + LinearModule, input_dim=input_dim, output_dim=output_dim, strategy="fsdp" + ) + .trainer(max_epochs=num_epochs, accelerator="gpu") + .strategy("fsdp") + .checkpointing(save_last=True) + .fit_params(datamodule=datamodule) + ) + + scaling_config = ray.air.ScalingConfig(num_workers=2, use_gpu=True) + + trainer = LightningTrainer( + lightning_config=config_builder.build(), scaling_config=scaling_config + ) + + results = trainer.fit() + + with results.checkpoint.as_directory() as checkpoint_dir: + checkpoint = torch.load(f"{checkpoint_dir}/{MODEL_KEY}") + model = LinearModule(input_dim=input_dim, output_dim=output_dim) + + for key in model.state_dict().keys(): + assert key in checkpoint["state_dict"] + + if __name__ == "__main__": import sys diff --git a/python/ray/train/tests/test_lightning_predictor.py b/python/ray/train/tests/test_lightning_predictor.py index 49ee42073b16..2c34b5dcc984 100644 --- a/python/ray/train/tests/test_lightning_predictor.py +++ b/python/ray/train/tests/test_lightning_predictor.py @@ -28,7 +28,7 @@ def test_repr(): def save_checkpoint(model: pl.LightningModule, ckpt_path: str): - trainer = pl.Trainer(max_epochs=0) + trainer = pl.Trainer(max_epochs=0, accelerator="cpu") trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1))) trainer.save_checkpoint(ckpt_path) diff --git a/python/ray/train/tests/test_lightning_trainer.py 
b/python/ray/train/tests/test_lightning_trainer.py index a35f37ac54e9..aab21fb4a6d1 100644 --- a/python/ray/train/tests/test_lightning_trainer.py +++ b/python/ray/train/tests/test_lightning_trainer.py @@ -74,7 +74,7 @@ def test_trainer_with_native_dataloader( config_builder = ( LightningConfigBuilder() - .module(LinearModule, input_dim=32, output_dim=4) + .module(LinearModule, input_dim=32, output_dim=4, strategy=strategy) .trainer(max_epochs=num_epochs, accelerator=accelerator) .strategy(strategy) ) @@ -124,7 +124,7 @@ def test_trainer_with_ray_data(ray_start_6_cpus_2_gpus, strategy, accelerator): lightning_config = ( LightningConfigBuilder() - .module(cls=LinearModule, input_dim=32, output_dim=4) + .module(cls=LinearModule, input_dim=32, output_dim=4, strategy=strategy) .trainer(max_epochs=num_epochs, accelerator=accelerator) .strategy(strategy) .build() diff --git a/python/ray/train/tests/test_sklearn_predictor.py b/python/ray/train/tests/test_sklearn_predictor.py index 063b2ea79b89..c39949ab410c 100644 --- a/python/ray/train/tests/test_sklearn_predictor.py +++ b/python/ray/train/tests/test_sklearn_predictor.py @@ -87,7 +87,10 @@ def test_predict_batch(ray_start_4_cpus, batch_type): data_batch = _convert_pandas_to_batch_type(raw_batch, type=TYPE_TO_ENUM[batch_type]) if batch_type == np.ndarray: + # TODO(ekl) how do we fix this to work with "data" column? 
dataset = ray.data.from_numpy(dummy_data) + dataset = dataset.add_column("__value__", lambda b: b["data"]) + dataset = dataset.drop_columns(["data"]) elif batch_type == pd.DataFrame: dataset = ray.data.from_pandas(data_batch) elif batch_type == pa.Table: diff --git a/python/ray/train/tests/test_torch_trainer.py b/python/ray/train/tests/test_torch_trainer.py index d7a7f8ef5d81..a2e2dc306f1a 100644 --- a/python/ray/train/tests/test_torch_trainer.py +++ b/python/ray/train/tests/test_torch_trainer.py @@ -1,4 +1,6 @@ import contextlib +import uuid + import pytest import time import torch @@ -11,7 +13,7 @@ from ray.train.batch_predictor import BatchPredictor from ray.train.constants import DISABLE_LAZY_CHECKPOINTING_ENV from ray.train.torch import TorchPredictor, TorchTrainer -from ray.air.config import ScalingConfig +from ray.air.config import RunConfig, ScalingConfig from ray.train.torch import TorchConfig from ray.train.trainer import TrainingFailedError import ray.train as train @@ -158,7 +160,10 @@ def __call__(self, x): predict_dataset = ray.data.range(9) predictions = predict_dataset.map_batches( - TorchScorer, batch_size=3, batch_format="pandas", compute="actors" + TorchScorer, + batch_size=3, + batch_format="pandas", + compute=ray.data.ActorPoolStrategy(), ) assert predictions.count() == 3 @@ -255,7 +260,6 @@ def test_tune_torch_get_device_gpu(num_gpus_per_worker): (for example when used with Tune). """ from ray.air.config import ScalingConfig - import time num_samples = 2 num_workers = 2 @@ -266,6 +270,7 @@ def test_tune_torch_get_device_gpu(num_gpus_per_worker): # Divide by two because of a 2 node cluster. gpus_per_node = total_gpus_required // 2 + exception = None # Use the same number of cpus per node as gpus per node. 
with ray_start_2_node_cluster( num_cpus_per_node=gpus_per_node, num_gpus_per_node=gpus_per_node @@ -287,12 +292,14 @@ def train_fn(): @ray.remote(num_cpus=0) class TrialActor: def __init__(self, warmup_steps): - # adding warmup_steps to the config - # to avoid the error of checkpoint name conflict - time.sleep(2 * warmup_steps) self.trainer = TorchTrainer( train_fn, torch_config=TorchConfig(backend="gloo"), + run_config=RunConfig( + # Use a unique name to avoid using the same + # experiment directory + name=f"test_tune_torch_get_device_gpu_{uuid.uuid4()}" + ), scaling_config=ScalingConfig( num_workers=num_workers, use_gpu=True, @@ -310,8 +317,15 @@ def __init__(self, warmup_steps): def run(self): return self.trainer.fit() - actors = [TrialActor.remote(1) for _ in range(num_samples)] - ray.get([actor.run.remote() for actor in actors]) + try: + actors = [TrialActor.remote(1) for _ in range(num_samples)] + ray.get([actor.run.remote() for actor in actors]) + except Exception as exc: + exception = exc + + # Raise exception after Ray cluster has been shutdown to avoid corrupted state + if exception: + raise exception def test_torch_auto_unwrap(ray_start_4_cpus): diff --git a/python/ray/train/tests/test_trainer_restore.py b/python/ray/train/tests/test_trainer_restore.py index 94da6f7eae62..f90f6a60b2ea 100644 --- a/python/ray/train/tests/test_trainer_restore.py +++ b/python/ray/train/tests/test_trainer_restore.py @@ -11,7 +11,7 @@ from ray.train.torch import TorchTrainer from ray.train.xgboost import XGBoostTrainer from ray.train.lightgbm import LightGBMTrainer -from ray.train.huggingface import HuggingFaceTrainer +from ray.train.hf_transformers import TransformersTrainer from ray.train.rl import RLTrainer from ray.tune import Callback from ray.data.preprocessors.batch_mapper import BatchMapper @@ -153,11 +153,10 @@ def test_gbdt_trainer_restore(ray_start_6_cpus, tmpdir, trainer_cls): run_config=RunConfig( local_dir=str(tmpdir), name=exp_name, - 
checkpoint_config=CheckpointConfig(num_to_keep=1, checkpoint_frequency=1), + checkpoint_config=CheckpointConfig( + num_to_keep=1, checkpoint_frequency=1, checkpoint_at_end=False + ), callbacks=[FailureInjectionCallback(num_iters=2)], - # We also use a stopper, since the restored run will go for - # another 5 boosting rounds otherwise. - stop={"training_iteration": 5}, ), num_boost_round=5, ) @@ -172,14 +171,14 @@ def test_gbdt_trainer_restore(ray_start_6_cpus, tmpdir, trainer_cls): assert tmpdir / exp_name in result.log_dir.parents -@pytest.mark.parametrize("trainer_cls", [HuggingFaceTrainer]) +@pytest.mark.parametrize("trainer_cls", [TransformersTrainer]) def test_trainer_with_init_fn_restore(ray_start_4_cpus, tmpdir, trainer_cls): """Tests restore for data parallel trainers that take in a `train_init` function and config. Success criteria: same as for data parallel trainers.""" exp_name = f"{trainer_cls.__name__}_restore_test" - if trainer_cls == HuggingFaceTrainer: - from ray.train.tests.test_huggingface_trainer import ( + if trainer_cls == TransformersTrainer: + from ray.train.tests.test_transformers_trainer import ( train_function as hf_init, train_df, ) @@ -398,7 +397,7 @@ def check_for_raise(): trainer_cls.restore(str(tmpdir)) if should_warn: - with pytest.warns() as warn_record: + with pytest.warns(Warning) as warn_record: check_for_raise() assert any( "Invalid trainer type" in str(record.message) diff --git a/python/ray/train/tests/test_huggingface_checkpoint.py b/python/ray/train/tests/test_transformers_checkpoint.py similarity index 85% rename from python/ray/train/tests/test_huggingface_checkpoint.py rename to python/ray/train/tests/test_transformers_checkpoint.py index b272b2568425..557b237d7fa7 100644 --- a/python/ray/train/tests/test_huggingface_checkpoint.py +++ b/python/ray/train/tests/test_transformers_checkpoint.py @@ -3,11 +3,14 @@ from transformers.pipelines import pipeline import ray -from ray.train.huggingface import HuggingFaceCheckpoint, 
HuggingFacePredictor +from ray.train.hf_transformers import ( + TransformersCheckpoint, + TransformersPredictor, +) from ray.train.tests.dummy_preprocessor import DummyPreprocessor -from test_huggingface_predictor import ( +from test_transformers_predictor import ( model_checkpoint, tokenizer_checkpoint, test_strings, @@ -15,13 +18,13 @@ ) -def test_huggingface_checkpoint(tmpdir, ray_start_runtime_env): +def test_transformers_checkpoint(tmpdir, ray_start_runtime_env): model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) preprocessor = DummyPreprocessor() - checkpoint = HuggingFaceCheckpoint.from_model( + checkpoint = TransformersCheckpoint.from_model( model, tokenizer, path=tmpdir, preprocessor=preprocessor ) checkpoint_model = checkpoint.get_model(AutoModelForCausalLM) @@ -31,7 +34,7 @@ def test_huggingface_checkpoint(tmpdir, ray_start_runtime_env): @ray.remote def test(model, tokenizer, preprocessor): os.chdir(tmpdir) - predictor = HuggingFacePredictor( + predictor = TransformersPredictor( pipeline=pipeline( task="text-generation", model=model, diff --git a/python/ray/train/tests/test_huggingface_gpu.py b/python/ray/train/tests/test_transformers_gpu.py similarity index 83% rename from python/ray/train/tests/test_huggingface_gpu.py rename to python/ray/train/tests/test_transformers_gpu.py index 24aff99667dd..9eb8ea1cb2b3 100644 --- a/python/ray/train/tests/test_huggingface_gpu.py +++ b/python/ray/train/tests/test_transformers_gpu.py @@ -9,7 +9,10 @@ import ray -from ray.train.huggingface import HuggingFaceCheckpoint, HuggingFacePredictor +from ray.train.hf_transformers import ( + TransformersCheckpoint, + TransformersPredictor, +) test_strings = ["Complete me", "And me", "Please complete"] prompts = pd.DataFrame(test_strings, columns=["sentences"]) @@ -25,12 +28,12 @@ def create_checkpoint(): model_config = 
AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) - checkpoint = HuggingFaceCheckpoint.from_model(model, tokenizer, path=tmpdir) + checkpoint = TransformersCheckpoint.from_model(model, tokenizer, path=tmpdir) # Serialize to dict so we can remove the temporary directory - return HuggingFaceCheckpoint.from_dict(checkpoint.to_dict()) + return TransformersCheckpoint.from_dict(checkpoint.to_dict()) -class AssertingHuggingFacePredictor(HuggingFacePredictor): +class AssertingTransformersPredictor(TransformersPredictor): def __init__(self, pipeline=None, preprocessor=None, use_gpu: bool = False): super().__init__(pipeline, preprocessor, use_gpu) assert use_gpu @@ -48,7 +51,7 @@ def test_predict_batch(ray_start_4_cpus, caplog, batch_type, device): kwargs["device"] = device predictor = BatchPredictor.from_checkpoint( - checkpoint, AssertingHuggingFacePredictor, task="text-generation", **kwargs + checkpoint, AssertingTransformersPredictor, task="text-generation", **kwargs ) # Todo: Ray data does not support numpy string arrays well diff --git a/python/ray/train/tests/test_huggingface_predictor.py b/python/ray/train/tests/test_transformers_predictor.py similarity index 85% rename from python/ray/train/tests/test_huggingface_predictor.py rename to python/ray/train/tests/test_transformers_predictor.py index 4c3f7eb30825..3896c262ba01 100644 --- a/python/ray/train/tests/test_huggingface_predictor.py +++ b/python/ray/train/tests/test_transformers_predictor.py @@ -15,7 +15,10 @@ import ray -from ray.train.huggingface import HuggingFaceCheckpoint, HuggingFacePredictor +from ray.train.hf_transformers import ( + TransformersCheckpoint, + TransformersPredictor, +) from ray.train.tests.dummy_preprocessor import DummyPreprocessor @@ -29,12 +32,12 @@ def test_repr(tmpdir): - predictor = HuggingFacePredictor() + predictor = TransformersPredictor() representation = 
repr(predictor) assert len(representation) < MAX_REPR_LENGTH - pattern = re.compile("^HuggingFacePredictor\\((.*)\\)$") + pattern = re.compile("^TransformersPredictor\\((.*)\\)$") assert pattern.match(representation) @@ -53,7 +56,7 @@ def test(use_preprocessor): preprocessor = None model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) - predictor = HuggingFacePredictor( + predictor = TransformersPredictor( pipeline=pipeline( task="text-generation", model=model, @@ -79,8 +82,10 @@ def test(): model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) - checkpoint = HuggingFaceCheckpoint.from_model(model, tokenizer, path=tmpdir) - predictor = HuggingFacePredictor.from_checkpoint( + checkpoint = TransformersCheckpoint.from_model( + model, tokenizer, path=tmpdir + ) + predictor = TransformersPredictor.from_checkpoint( checkpoint, task="text-generation", ) @@ -97,9 +102,9 @@ def create_checkpoint(): model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint) - checkpoint = HuggingFaceCheckpoint.from_model(model, tokenizer, path=tmpdir) + checkpoint = TransformersCheckpoint.from_model(model, tokenizer, path=tmpdir) # Serialize to dict so we can remove the temporary directory - return HuggingFaceCheckpoint.from_dict(checkpoint.to_dict()) + return TransformersCheckpoint.from_dict(checkpoint.to_dict()) # TODO(ml-team): Add np.ndarray to batch_type @@ -107,7 +112,7 @@ def create_checkpoint(): def test_predict_batch(ray_start_4_cpus, batch_type): checkpoint = create_checkpoint() predictor = BatchPredictor.from_checkpoint( - checkpoint, HuggingFacePredictor, task="text-generation" + checkpoint, TransformersPredictor, task="text-generation" ) # Todo: Ray data does not support numpy 
string arrays well diff --git a/python/ray/train/tests/test_huggingface_trainer.py b/python/ray/train/tests/test_transformers_trainer.py similarity index 85% rename from python/ray/train/tests/test_huggingface_trainer.py rename to python/ray/train/tests/test_transformers_trainer.py index e66f68442ac0..1098dad5e0cf 100644 --- a/python/ray/train/tests/test_huggingface_trainer.py +++ b/python/ray/train/tests/test_transformers_trainer.py @@ -11,10 +11,10 @@ import ray.data from ray.train.batch_predictor import BatchPredictor -from ray.train.huggingface import ( - HuggingFacePredictor, - HuggingFaceTrainer, - HuggingFaceCheckpoint, +from ray.train.hf_transformers import ( + TransformersPredictor, + TransformersTrainer, + TransformersCheckpoint, ) from ray.train.trainer import TrainingFailedError from ray.air.config import ScalingConfig @@ -92,12 +92,39 @@ def train_function_local_dataset(train_dataset, eval_dataset=None, **config): return train_function(train_dataset, eval_dataset, **config) +def test_deprecations(ray_start_4_cpus): + """Tests that soft deprecations warn but still can be used""" + from ray.train.huggingface import ( + HuggingFaceCheckpoint, + HuggingFacePredictor, + HuggingFaceTrainer, + ) + + ray_train = ray.data.from_pandas(train_df) + ray_validation = ray.data.from_pandas(validation_df) + + with pytest.warns(DeprecationWarning): + obj = HuggingFaceCheckpoint.from_dict({"foo": "bar"}) + assert isinstance(obj, TransformersCheckpoint) + + with pytest.warns(DeprecationWarning): + obj = HuggingFacePredictor() + assert isinstance(obj, TransformersPredictor) + + with pytest.warns(DeprecationWarning): + obj = HuggingFaceTrainer( + train_function, + datasets={"train": ray_train, "evaluation": ray_validation}, + ) + assert isinstance(obj, TransformersTrainer) + + @pytest.mark.parametrize("save_strategy", ["no", "epoch"]) def test_e2e(ray_start_4_cpus, save_strategy): ray_train = ray.data.from_pandas(train_df) ray_validation = 
ray.data.from_pandas(validation_df) scaling_config = ScalingConfig(num_workers=2, use_gpu=False) - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function, trainer_init_config={"epochs": 4, "save_strategy": save_strategy}, scaling_config=scaling_config, @@ -108,10 +135,10 @@ def test_e2e(ray_start_4_cpus, save_strategy): assert result.metrics["epoch"] == 4 assert result.metrics["training_iteration"] == 4 assert result.checkpoint - assert isinstance(result.checkpoint, HuggingFaceCheckpoint) + assert isinstance(result.checkpoint, TransformersCheckpoint) assert "eval_loss" in result.metrics - trainer2 = HuggingFaceTrainer( + trainer2 = TransformersTrainer( trainer_init_per_worker=train_function, trainer_init_config={ "epochs": 5, @@ -126,12 +153,12 @@ def test_e2e(ray_start_4_cpus, save_strategy): assert result2.metrics["epoch"] == 5 assert result2.metrics["training_iteration"] == 1 assert result2.checkpoint - assert isinstance(result2.checkpoint, HuggingFaceCheckpoint) + assert isinstance(result2.checkpoint, TransformersCheckpoint) assert "eval_loss" in result2.metrics predictor = BatchPredictor.from_checkpoint( result2.checkpoint, - HuggingFacePredictor, + TransformersPredictor, task="text-generation", tokenizer=AutoTokenizer.from_pretrained(tokenizer_checkpoint), ) @@ -142,7 +169,7 @@ def test_e2e(ray_start_4_cpus, save_strategy): def test_training_local_dataset(ray_start_4_cpus): scaling_config = ScalingConfig(num_workers=2, use_gpu=False) - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function_local_dataset, trainer_init_config={"epochs": 1, "save_strategy": "no"}, scaling_config=scaling_config, @@ -152,7 +179,7 @@ def test_training_local_dataset(ray_start_4_cpus): assert result.metrics["epoch"] == 1 assert result.metrics["training_iteration"] == 1 assert result.checkpoint - assert isinstance(result.checkpoint, HuggingFaceCheckpoint) + assert 
isinstance(result.checkpoint, TransformersCheckpoint) assert "eval_loss" in result.metrics @@ -172,7 +199,7 @@ def fit_and_check_for_error(trainer, error_type=ValueError): ) # load_best_model_at_end set to True should raise an exception - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_config={ "epochs": 1, "load_best_model_at_end": True, @@ -183,7 +210,7 @@ def fit_and_check_for_error(trainer, error_type=ValueError): fit_and_check_for_error(trainer) # logging strategy set to no should raise an exception - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_config={ "epochs": 1, "logging_strategy": "no", @@ -193,7 +220,7 @@ def fit_and_check_for_error(trainer, error_type=ValueError): fit_and_check_for_error(trainer) # logging steps != eval steps should raise an exception - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_config={ "epochs": 1, "logging_strategy": "steps", @@ -212,7 +239,7 @@ def fit_and_check_for_error(trainer, error_type=ValueError): ("epoch", "steps", "epoch"), ("steps", "epoch", "steps"), ): - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_config={ "epochs": 1, "load_best_model_at_end": True, @@ -232,7 +259,7 @@ def test_tune(ray_start_8_cpus): scaling_config = ScalingConfig( num_workers=2, use_gpu=False, trainer_resources={"CPU": 0} ) - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function, scaling_config=scaling_config, datasets={"train": ray_train, "evaluation": ray_validation}, @@ -280,7 +307,7 @@ def train_function_with_metric(train_dataset, eval_dataset=None, **config): print(metric) return train_function(train_dataset, eval_dataset=eval_dataset, **config) - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function_with_metric, trainer_init_config={"epochs": 1}, scaling_config=scaling_config, diff --git 
a/python/ray/train/tests/test_huggingface_trainer_steps.py b/python/ray/train/tests/test_transformers_trainer_steps.py similarity index 95% rename from python/ray/train/tests/test_huggingface_trainer_steps.py rename to python/ray/train/tests/test_transformers_trainer_steps.py index d7c3838846f9..edae952bbbcb 100644 --- a/python/ray/train/tests/test_huggingface_trainer_steps.py +++ b/python/ray/train/tests/test_transformers_trainer_steps.py @@ -12,7 +12,10 @@ import ray.data from ray.train.batch_predictor import BatchPredictor -from ray.train.huggingface import HuggingFacePredictor, HuggingFaceTrainer +from ray.train.hf_transformers import ( + TransformersPredictor, + TransformersTrainer, +) from ray.train.trainer import TrainingFailedError from ray.air.config import ScalingConfig from ray.train.tests._huggingface_data import train_data, validation_data @@ -73,7 +76,7 @@ def test_e2e_steps(ray_start_4_cpus, save_steps, logging_steps): scaling_config = ScalingConfig(num_workers=2, use_gpu=False) epochs = 4 - trainer = HuggingFaceTrainer( + trainer = TransformersTrainer( trainer_init_per_worker=train_function, trainer_init_config={ "epochs": epochs, @@ -99,7 +102,7 @@ def test_e2e_steps(ray_start_4_cpus, save_steps, logging_steps): assert result.checkpoint assert "eval_loss" in result.metrics - trainer2 = HuggingFaceTrainer( + trainer2 = TransformersTrainer( trainer_init_per_worker=train_function, trainer_init_config={ "epochs": epochs + 1, @@ -122,7 +125,7 @@ def test_e2e_steps(ray_start_4_cpus, save_steps, logging_steps): predictor = BatchPredictor.from_checkpoint( result2.checkpoint, - HuggingFacePredictor, + TransformersPredictor, task="text-generation", tokenizer=AutoTokenizer.from_pretrained(tokenizer_checkpoint), ) diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 3745313dd98d..900d036d19c6 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -185,7 +185,7 @@ def 
train_func(config): tuner = Tuner( trainer, param_space={"train_loop_config": {"max_iter": 10}}, - ).restore(trial.local_dir) + ).restore(trial.local_dir, trainable=trainer) analysis = tuner.fit()._experiment_analysis trial_dfs = list(analysis.trial_dataframes.values()) assert len(trial_dfs[0]["training_iteration"]) == 5 @@ -261,14 +261,12 @@ def train_func(config): ) caplog.clear() with caplog.at_level(logging.WARNING, logger="ray.tune.impl.tuner_internal"): - with pytest.warns() as warn_record: - tuner = Tuner.restore( - str(tmpdir / "restore_new_trainer"), - trainable=trainer, - resume_errored=True, - ) - # Should warn about the RunConfig being ignored - assert any("RunConfig" in str(record.message) for record in warn_record) + tuner = Tuner.restore( + str(tmpdir / "restore_new_trainer"), + trainable=trainer, + resume_errored=True, + ) + assert "they will be ignored in the resumed run" in caplog.text results = tuner.fit() assert not results.errors diff --git a/python/ray/train/tests/test_worker_group.py b/python/ray/train/tests/test_worker_group.py index e40be2ff16a0..06770c6e6af0 100644 --- a/python/ray/train/tests/test_worker_group.py +++ b/python/ray/train/tests/test_worker_group.py @@ -89,7 +89,11 @@ def test_move_workers_with_ip_to_front(ray_start_2_cpus): Worker( actor=None, metadata=WorkerMetadata( - node_id="dummy", node_ip=f"10.1.10.{i}", hostname="dummy", gpu_ids=None + node_id="dummy", + node_ip=f"10.1.10.{i}", + hostname="dummy", + gpu_ids=None, + pid=0, ), ) for i in range(1, 17) diff --git a/python/ray/train/tests/test_xgboost_predictor.py b/python/ray/train/tests/test_xgboost_predictor.py index 2ddb72c5a568..82609e637b74 100644 --- a/python/ray/train/tests/test_xgboost_predictor.py +++ b/python/ray/train/tests/test_xgboost_predictor.py @@ -68,7 +68,10 @@ def test_predict_batch(ray_start_4_cpus, batch_type): data_batch = _convert_pandas_to_batch_type(raw_batch, type=TYPE_TO_ENUM[batch_type]) if batch_type == np.ndarray: + # TODO(ekl) how do we fix 
this to work with "data" column? dataset = ray.data.from_numpy(dummy_data) + dataset = dataset.add_column("__value__", lambda b: b["data"]) + dataset = dataset.drop_columns(["data"]) elif batch_type == pd.DataFrame: dataset = ray.data.from_pandas(data_batch) elif batch_type == pa.Table: diff --git a/python/ray/train/tests/test_xgboost_trainer.py b/python/ray/train/tests/test_xgboost_trainer.py index b61af80619fe..8ec1c0a56d9d 100644 --- a/python/ray/train/tests/test_xgboost_trainer.py +++ b/python/ray/train/tests/test_xgboost_trainer.py @@ -115,7 +115,7 @@ def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir): scaling_config=scale_config, label_column="target", params=params, - num_boost_round=5, + num_boost_round=10, datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset}, resume_from_checkpoint=resume_from, ) @@ -246,7 +246,7 @@ def test_validation(ray_start_4_cpus): def test_distributed_data_loading(ray_start_4_cpus): - """Checks that XGBoostTrainer does distributed data loading for Ray Datasets.""" + """Checks that XGBoostTrainer does distributed data loading for Datasets.""" class DummyXGBoostTrainer(XGBoostTrainer): def _train(self, params, dtrain, **kwargs): diff --git a/python/ray/train/torch/torch_detection_predictor.py b/python/ray/train/torch/torch_detection_predictor.py index 7de5eee0934a..4e6ff2dd9ca9 100644 --- a/python/ray/train/torch/torch_detection_predictor.py +++ b/python/ray/train/torch/torch_detection_predictor.py @@ -4,7 +4,6 @@ import numpy as np import torch -from ray.air.util.tensor_extensions.utils import create_ragged_ndarray from ray.train._internal.dl_predictor import TensorDtype from ray.train.torch.torch_predictor import TorchPredictor from ray.util.annotations import PublicAPI @@ -134,7 +133,5 @@ def _convert_outputs_to_ndarray_batch( batch = collections.defaultdict(list) for output in outputs: for key, value in output.items(): - batch[key].append(value.cpu().detach().numpy()) - for key, value in batch.items(): - 
batch[key] = create_ragged_ndarray(value) + batch[key].append(value.cpu().detach()) return batch diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index 6c43776bbe9a..e40eef526bba 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -56,7 +56,7 @@ def train_loop_per_worker(): # Get dict of last saved checkpoint. session.get_checkpoint() - # Session returns the Ray Dataset shard for the given key. + # Session returns the Dataset shard for the given key. session.get_dataset_shard("my_dataset") # Get the total number of workers executing training. @@ -247,7 +247,7 @@ def train_loop_per_worker(): scaling_config: Configuration for how to scale data parallel training. dataset_config: Configuration for dataset ingest. run_config: Configuration for the execution of the training run. - datasets: Any Ray Datasets to use for training. Use + datasets: Any Datasets to use for training. Use the key "train" to denote which dataset is the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. 
All datasets will be transformed diff --git a/python/ray/train/torch/train_loop_utils.py b/python/ray/train/torch/train_loop_utils.py index 952e97fb3944..c41177f73a80 100644 --- a/python/ray/train/torch/train_loop_utils.py +++ b/python/ray/train/torch/train_loop_utils.py @@ -3,7 +3,7 @@ import random import types import collections -from distutils.version import LooseVersion +from packaging.version import Version from typing import Any, Dict, List, Optional, Callable, Union @@ -19,7 +19,7 @@ from torch.cuda.amp import autocast, GradScaler from torch.nn.parallel import DistributedDataParallel -if LooseVersion(torch.__version__) < LooseVersion("1.11.0"): +if Version(torch.__version__) < Version("1.11.0"): FullyShardedDataParallel = None else: from torch.distributed.fsdp import FullyShardedDataParallel diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py index 34dac3276111..52ea9d292873 100644 --- a/python/ray/train/trainer.py +++ b/python/ray/train/trainer.py @@ -4,6 +4,8 @@ from ray.air.checkpoint import Checkpoint from ray.air.config import CheckpointConfig +from ray.air import session +from ray.air._internal.uri_utils import URI from ray.air._internal.util import StartTraceback from ray.train._internal.backend_executor import ( BackendExecutor, @@ -25,6 +27,7 @@ GenDataset, TrainingFailedError, ) +from ray.tune.trainable.util import TrainableUtil from ray.util.annotations import DeveloperAPI T = TypeVar("T") @@ -47,6 +50,7 @@ def __init__( checkpoint: Optional[Union[Dict, str, Path, Checkpoint]], checkpoint_strategy: Optional[CheckpointConfig], run_dir: Optional[Path] = None, + storage_path: Optional[str] = None, ): self._backend_executor = backend_executor self._backend = backend_config.backend_cls() @@ -55,12 +59,12 @@ def __init__( self._run_dir = run_dir self._checkpoint_manager = checkpoint_manager self._checkpoint_strategy = checkpoint_strategy + self._storage_path = storage_path self._start_training( train_func=train_func, run_dir=run_dir, 
dataset_spec=self._dataset_spec, checkpoint=checkpoint, - checkpoint_strategy=checkpoint_strategy, ) self._final_results = None @@ -75,11 +79,10 @@ def _start_training( run_dir, dataset_spec, checkpoint, - checkpoint_strategy, latest_checkpoint_id=None, ): self._checkpoint_manager.on_start_training( - checkpoint_strategy=checkpoint_strategy, + checkpoint_strategy=self._checkpoint_strategy, run_dir=run_dir, latest_checkpoint_id=latest_checkpoint_id, ) @@ -92,6 +95,12 @@ def _start_training( ) ) + # Session has started. Set current cloud checkpoint dir if necessary. + if self._checkpoint_strategy._checkpoint_upload_from_workers: + self._backend_executor._set_checkpoint_uri( + self.__get_cloud_checkpoint_dir() + ) + def _run_with_error_handling(self, func: Callable): try: return func() @@ -109,8 +118,7 @@ def _run_with_error_handling(self, func: Callable): self._run_dir, self._dataset_spec, self._checkpoint_manager.latest_checkpoint, - self._checkpoint_strategy, - latest_checkpoint_id=self._checkpoint_manager.latest_checkpoint_id, + self._checkpoint_manager.latest_checkpoint_id, ) return self._run_with_error_handling(func) except InactiveWorkerGroupError: @@ -174,9 +182,22 @@ def _fetch_next_result(self) -> Optional[List[Dict]]: result_data = [r.data for r in results] return result_data elif result_type is TrainingResultType.CHECKPOINT: - self._checkpoint_manager._process_checkpoint( + self._checkpoint_manager._process_checkpoints( results, decode_checkpoint_fn=self._backend._decode_data ) + + # Note(jungong) : This is kinda funky. We update the cloud + # checkpoint dir on every distributed worker right after + # an existing checkpoint is processed. We must do this because + # Trainers do not have the concept of iterations or steps, + # which must be synced between Trainable driver and the trainers. + # TODO(jungong) : It would be nicer if we find a cleaner way + # to sync the current cloud checkpointing directory between + # Tuner, Trainable, and Trainers. 
+ if self._checkpoint_strategy._checkpoint_upload_from_workers: + self._backend_executor._set_checkpoint_uri( + self.__get_cloud_checkpoint_dir() + ) # Iterate until next REPORT call or training has finished. else: raise TrainBackendError( @@ -194,9 +215,13 @@ def _finish_checkpointing(self): result_type = results[0].type # Process checkpoints and ignore other result types. if result_type is TrainingResultType.CHECKPOINT: - self._checkpoint_manager._process_checkpoint( + self._checkpoint_manager._process_checkpoints( results, decode_checkpoint_fn=self._backend._decode_data ) + if self._checkpoint_strategy._checkpoint_upload_from_workers: + self._backend_executor._set_checkpoint_uri( + self.__get_cloud_checkpoint_dir() + ) def _finish_training(self): """Finish training and return final results. Propagate any exceptions. @@ -248,3 +273,24 @@ def get_final_results(self, force: bool = False) -> List[T]: ) return self._final_results + + # This is extremely hacky and fragile. + # TODO(jungong) : We should refactor things so Tuner, Trinable, and + # Trainers have a consistent view of the current cloud checkpointing + # directory. + # We should probably also refactor things so Syncer and SyncConfig + # are available everywhere session is available. + def __get_cloud_checkpoint_dir(self): + if not self._storage_path: + # Can't run cloud upload if storage path is not set. 
+ return None + + base_dir = URI(self._storage_path) + path = Path(session.get_trial_dir()) + trial_dir_name = path.name + exp_dir_name = path.parent.name + checkpoint_dir_name = TrainableUtil._make_checkpoint_dir_name( + self._checkpoint_manager._latest_checkpoint_id + ) + + return str(base_dir / exp_dir_name / trial_dir_name / checkpoint_dir_name) diff --git a/python/ray/train/xgboost/xgboost_trainer.py b/python/ray/train/xgboost/xgboost_trainer.py index c0a19d9b096c..154a001129b5 100644 --- a/python/ray/train/xgboost/xgboost_trainer.py +++ b/python/ray/train/xgboost/xgboost_trainer.py @@ -1,5 +1,10 @@ from typing import Any, Dict, Optional, Tuple, TYPE_CHECKING +try: + from packaging.version import Version +except ImportError: + from distutils.version import LooseVersion as Version + from ray.air.checkpoint import Checkpoint from ray.train.gbdt_trainer import GBDTTrainer from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint @@ -46,7 +51,7 @@ class XGBoostTrainer(GBDTTrainer): result = trainer.fit() Args: - datasets: Ray Datasets to use for training and validation. Must include a + datasets: Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if @@ -61,6 +66,11 @@ class XGBoostTrainer(GBDTTrainer): :class:`xgboost_ray.RayDMatrix` initializations, which in turn are passed to ``xgboost.DMatrix`` objects created on each worker. For example, this can be used to add sample weights with the ``weights`` parameter. + num_boost_round: Target number of boosting iterations (trees in the model). + Note that unlike in ``xgboost.train``, this is the target number + of trees, meaning that if you set ``num_boost_round=10`` and pass a model + that has already been trained for 5 iterations, it will be trained for 5 + iterations more, instead of 10 more. 
scaling_config: Configuration for how to scale data parallel training. run_config: Configuration for the execution of the training run. preprocessor: A ray.data.Preprocessor to preprocess the @@ -97,3 +107,12 @@ def _model_iteration(self, model: xgboost.Booster) -> int: # Compatibility with XGBoost < 1.4 return len(model.get_dump()) return model.num_boosted_rounds() + + def preprocess_datasets(self) -> None: + super().preprocess_datasets() + + # XGBoost/LightGBM-Ray requires each dataset to have at least as many + # blocks as there are workers. + # This is only applicable for xgboost-ray<0.1.16 + if Version(xgboost_ray.__version__) < Version("0.1.16"): + self._repartition_datasets_to_match_num_actors() diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index a24da02b83f1..78ecaabdeed4 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -121,7 +121,7 @@ py_test( name = "test_experiment_analysis", size = "medium", srcs = ["tests/test_experiment_analysis.py"], - deps = [":tune_lib"], + deps = [":tune_lib", ":conftest"], tags = ["team:ml", "exclusive"], ) @@ -236,7 +236,7 @@ py_test( name = "test_result_grid", size = "medium", srcs = ["tests/test_result_grid.py"], - deps = [":tune_lib"], + deps = [":tune_lib", ":conftest"], tags = ["team:ml", "exclusive"], ) @@ -308,7 +308,7 @@ py_test( name = "test_syncer", size = "medium", srcs = ["tests/test_syncer.py"], - deps = [":tune_lib"], + deps = [":tune_lib", ":conftest"], tags = ["team:ml", "exclusive"], ) @@ -518,6 +518,54 @@ py_test( tags = ["team:ml", "exclusive"] ) +py_test( + name = "test_controller_callback_integration", + size = "large", + srcs = ["tests/execution/test_controller_callback_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + +py_test( + name = "test_controller_checkpointing_integration", + size = "large", + srcs = ["tests/execution/test_controller_checkpointing_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + +py_test( + 
name = "test_controller_control_integration", + size = "large", + srcs = ["tests/execution/test_controller_control_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + +py_test( + name = "test_controller_errors_integration", + size = "large", + srcs = ["tests/execution/test_controller_errors_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + +py_test( + name = "test_controller_resources_integration", + size = "large", + srcs = ["tests/execution/test_controller_resources_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + +py_test( + name = "test_controller_search_alg_integration", + size = "large", + srcs = ["tests/execution/test_controller_search_alg_integration.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"] +) + # -------------------------------------------------------------------- # Examples from the python/ray/tune/examples directory. # Please keep these sorted alphabetically. diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index cb432d134e58..f3a680ed8374 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -1,11 +1,19 @@ import json import logging import os +import tempfile import traceback +from typing import Any, Dict, List, Optional, Tuple, Union from numbers import Number from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from ray.air._internal.remote_storage import ( + download_from_uri, + is_directory, + is_local_path, + list_at_uri, +) +from ray.air._internal.uri_utils import _join_path_or_uri, URI from ray.air.checkpoint import Checkpoint from ray.tune.syncer import SyncConfig from ray.tune.utils import flatten_dict @@ -80,12 +88,29 @@ def __init__( # Deprecate: Raise in 2.6, remove in 2.7 sync_config: Optional[SyncConfig] = None, ): + self._local_experiment_path: str = None + 
self._remote_experiment_path: Optional[str] = None + + # If the user passes in a remote checkpoint path, + # Set the remote experiment path to this path, and set + # the local experiment path to a temp directory. + if not is_local_path(experiment_checkpoint_path): + self._remote_experiment_path = experiment_checkpoint_path + + # Create a temp directory to store downloaded checkpoint files if + # they are pulled from a remote `experiment_checkpoint_path`. + self._local_experiment_path = tempfile.TemporaryDirectory( + prefix="experiment_analysis_" + ).name + os.makedirs(self._local_experiment_path, exist_ok=True) + # Load the experiment checkpoints and their parent paths. # This is important for when experiment folders have been # relocated (e.g. from a ray cluster to local disk or GCS/S3)- self._experiment_states = [] self._checkpoints_and_paths: List[Tuple[dict, os.PathLike]] = [] self._load_checkpoints(experiment_checkpoint_path) + assert self._checkpoints_and_paths self.trials = trials @@ -102,7 +127,18 @@ def __init__( # If only a mode was passed, use anonymous metric self.default_metric = DEFAULT_METRIC - self._local_experiment_path = self._checkpoints_and_paths[0][1] + # TODO(ml-team): Remove in 2.7 along with sync_config parameter + if sync_config and sync_config.upload_dir: + remote_storage_path = sync_config.upload_dir + + if not self._local_experiment_path: + self._local_experiment_path = str(self._checkpoints_and_paths[0][1]) + + if not self._remote_experiment_path and remote_storage_path: + self._remote_experiment_path = str( + URI(remote_storage_path) / Path(self._local_experiment_path).name + ) + if not pd: logger.warning( "pandas not installed. 
Run `pip install pandas` for " @@ -111,18 +147,13 @@ def __init__( else: self.fetch_trial_dataframes() - if sync_config and sync_config.upload_dir: - remote_storage_path = sync_config.upload_dir - - self._remote_storage_path = remote_storage_path - @property def _local_path(self) -> str: - return str(self._local_experiment_path) + return self._local_experiment_path @property - def _remote_path(self) -> Optional[str]: - return self._parse_cloud_path(self._local_path) + def _remote_path(self) -> str: + return self._remote_experiment_path @property def experiment_path(self) -> str: @@ -136,24 +167,36 @@ def experiment_path(self) -> str: """ return self._remote_path or self._local_path - def _parse_cloud_path(self, local_path: str): - """Convert local path into cloud storage path""" - if not self._remote_storage_path: + def _convert_local_to_cloud_path(self, local_path: str): + """Convert local path into cloud storage path. + + Example: + local_path = "/a/b/c.json" + self._remote_experiment_path = "s3://bucket?param=abcd" + self._local_experiment_path = "/a/b" + + -> "s3://bucket/c?param=abcd" + """ + if not self._remote_experiment_path: return None - return local_path.replace( - str(Path(self._local_experiment_path).parent), self._remote_storage_path - ) + rel_path = str(Path(local_path).relative_to(self._local_experiment_path)) + return str(URI(self._remote_experiment_path) / rel_path) def _load_checkpoints(self, experiment_checkpoint_path: str) -> List[str]: - experiment_checkpoint_path = Path(experiment_checkpoint_path).expanduser() # Get the latest checkpoints from the checkpoint_path. - latest_checkpoint = self._get_latest_checkpoint(experiment_checkpoint_path) + latest_checkpoints = self._get_latest_checkpoint(experiment_checkpoint_path) + if not latest_checkpoints: + raise ValueError( + f"`{experiment_checkpoint_path}` must either be a path to an " + "experiment checkpoint file, or a directory containing an experiment " + "checkpoint file." 
+ ) # Collect all checkpoints and their directory paths. # These are used to infer the `local_dir` from the checkpoints # in case the experiment folder had been moved from its original # location (e.g. from a ray cluster to a GCS/S3 bucket or to local disk). - self._load_checkpoints_from_latest(latest_checkpoint) + self._load_checkpoints_from_latest(latest_checkpoints) def _load_checkpoints_from_latest(self, latest_checkpoint: List[str]) -> None: # Collect all checkpoints and their directory paths. @@ -169,46 +212,118 @@ def _load_checkpoints_from_latest(self, latest_checkpoint: List[str]) -> None: (cp, Path(path).parent) for cp in experiment_state["checkpoints"] ] - def _get_latest_checkpoint(self, experiment_checkpoint_path: Path) -> List[str]: - # Case 1: Dir specified, find latest checkpoint. - if experiment_checkpoint_path.is_dir(): - latest_checkpoint = _find_newest_experiment_checkpoint( - str(experiment_checkpoint_path) - ) + def _maybe_download_experiment_checkpoint( + self, experiment_checkpoint_path: str + ) -> Optional[str]: + """Downloads the experiment checkpoint from a remote path if needed. + + Args: + experiment_checkpoint_path: The local or remote path to the experiment + checkpoint file. + + Returns: + str: The local copy of the experiment checkpoint. + If a local path is passed in, this method will return that immediately. + If a remote path is passed in, this will try to download that file. + Will return None if the download failed. 
+ """ + if is_local_path(experiment_checkpoint_path): + return os.path.expanduser(experiment_checkpoint_path) + + assert self._local_path and self._remote_path + + experiment_path = Path(URI(self._remote_path).path) + # s3://bucket/exp_dir/nested/experiment_state.json + # -> bucket/exp_dir/nested/experiment_state.json + checkpoint_path = Path(URI(experiment_checkpoint_path).path) + + assert experiment_path in checkpoint_path.parents + # -> nested/experiment_state.json + relative_path = checkpoint_path.relative_to(experiment_path) + + # Download to: + # -> {self._local_path}/nested/experiment_state.json + local_path = os.path.join(self._local_path, relative_path) + try: + download_from_uri(experiment_checkpoint_path, local_path) + except FileNotFoundError: + return None + + return local_path + + def _get_latest_checkpoint_from_dir( + self, experiment_checkpoint_path: str, top_level: bool = True + ) -> List[str]: + """Gets the latest experiment checkpoints from a given directory. + + Args: + experiment_checkpoint_path: A local or remote path to a directory + containing at least one experiment checkpoint file. + top_level: True if this is the first directory level. False if + we are searching in a subdirectory. (Max recursion depth of 1.) + + Returns: + list: A list of local paths pointing to the latest experiment checkpoint + file for each experiment found within the given directory. + """ + latest_checkpoint = _find_newest_experiment_checkpoint( + experiment_checkpoint_path + ) + + latest_checkpoints = [] + if latest_checkpoint: + assert not is_directory( + latest_checkpoint + ), "This should point to an actual experiment checkpoint file." + latest_checkpoints.extend(self._get_latest_checkpoint(latest_checkpoint)) + + if not latest_checkpoint and top_level: # If no checkpoint in this folder the sub-directory is searched. # In this case also multiple experiment folders could exist in # the same root. 
In this case the length of `latest_checkpoint` # will be greater than 1. - if not latest_checkpoint: - latest_checkpoint = [] - for fname in experiment_checkpoint_path.iterdir(): - fname = experiment_checkpoint_path.joinpath(fname) - latest_checkpoint_subdir = _find_newest_experiment_checkpoint( - str(fname) + for subdir in list_at_uri(experiment_checkpoint_path): + full_path = _join_path_or_uri(experiment_checkpoint_path, subdir) + if is_directory(full_path): + latest_checkpoints.extend( + self._get_latest_checkpoint_from_dir(full_path, top_level=False) ) - if latest_checkpoint_subdir: - latest_checkpoint.append(latest_checkpoint_subdir) - if not latest_checkpoint: - # This avoid nested experiment directories of the form - # `experiment_name1/experiment_name2/experiment_state.json`. - experiment_checkpoint_path = str(experiment_checkpoint_path) - raise ValueError( - f"The directory `{experiment_checkpoint_path}` does not " - "contain a Ray Tune experiment checkpoint." - ) - elif not experiment_checkpoint_path.is_file(): - # Case 2: File specified, but does not exist. - experiment_checkpoint_path = str(experiment_checkpoint_path) + + return latest_checkpoints + + def _get_latest_checkpoint(self, experiment_checkpoint_path: str) -> List[str]: + """Gets the latest experiment checkpoints corresponding to a given path. + + Acceptable path inputs (either local or remote): + - A path to an experiment checkpoint file. + - A path to an experiment directory, which contains an experiment checkpoint + file at the directory's top-level. + - A path to a directory that contains multiple experiment directories, + where each subdirectory contains an experiment checkpoint file. + + Returns: + list: A list of local paths pointing to the latest experiment checkpoint + file for each experiment corresponding to the given path. 
+ """ + if is_directory(experiment_checkpoint_path): + return self._get_latest_checkpoint_from_dir(experiment_checkpoint_path) + + local_experiment_checkpoint_path = self._maybe_download_experiment_checkpoint( + experiment_checkpoint_path + ) + + if ( + not local_experiment_checkpoint_path + or not Path(local_experiment_checkpoint_path).exists() + ): raise ValueError( f"The file `{experiment_checkpoint_path}` does not " f"exist and cannot be loaded for experiment analysis." ) - else: - # Case 3: File specified, use as latest checkpoint. - latest_checkpoint = str(experiment_checkpoint_path) - if not isinstance(latest_checkpoint, list): - latest_checkpoint = [latest_checkpoint] - return latest_checkpoint + + assert Path(local_experiment_checkpoint_path).is_file() + + return [local_experiment_checkpoint_path] @property def best_trial(self) -> Trial: @@ -508,7 +623,7 @@ def get_best_checkpoint( best_path_metrics = sorted(checkpoint_paths, key=lambda x: a * x[1]) best_path, best_metric = best_path_metrics[0] - cloud_path = self._parse_cloud_path(best_path) + cloud_path = self._convert_local_to_cloud_path(best_path) if cloud_path: # Prefer cloud path over local path for downsteam processing @@ -545,8 +660,15 @@ def get_all_configs(self, prefix: bool = False) -> Dict[str, Dict]: their trial dir. """ fail_count = 0 + failed_paths = [] for path in self._get_trial_paths(): try: + param_file = os.path.join(path, EXPR_PARAM_FILE) + if not os.path.exists(param_file) and self._remote_path: + download_from_uri( + self._convert_local_to_cloud_path(param_file), param_file + ) + with open(os.path.join(path, EXPR_PARAM_FILE)) as f: config = json.load(f) if prefix: @@ -554,10 +676,20 @@ def get_all_configs(self, prefix: bool = False) -> Dict[str, Dict]: else: self._configs[path] = config except Exception: + logger.debug( + f"Exception occurred when loading trial configs. 
" + f"See traceback:\n{traceback.format_exc()}" + ) fail_count += 1 + failed_paths.append(path) if fail_count: - logger.warning("Couldn't read config from {} paths".format(fail_count)) + failed_paths_str = "\n".join([f"- {path}" for path in failed_paths]) + logger.warning( + f"Failed to read the config for {fail_count} trials:\n" + f"{failed_paths_str}" + ) + return self._configs def get_best_trial( @@ -742,23 +874,44 @@ def fetch_trial_dataframes(self) -> Dict[str, DataFrame]: A dictionary containing "trial dir" to Dataframe. """ fail_count = 0 + failed_paths = [] force_dtype = {"trial_id": str} # Never convert trial_id to float. for path in self._get_trial_paths(): try: if self._file_type == "json": - with open(os.path.join(path, EXPR_RESULT_FILE), "r") as f: + json_file = os.path.join(path, EXPR_RESULT_FILE) + if not os.path.exists(json_file) and self._remote_path: + download_from_uri( + self._convert_local_to_cloud_path(json_file), json_file + ) + + with open(json_file, "r") as f: json_list = [json.loads(line) for line in f if line] df = pd.json_normalize(json_list, sep="/") elif self._file_type == "csv": - df = pd.read_csv( - os.path.join(path, EXPR_PROGRESS_FILE), dtype=force_dtype - ) + csv_file = os.path.join(path, EXPR_PROGRESS_FILE) + if not os.path.exists(csv_file) and self._remote_path: + download_from_uri( + self._convert_local_to_cloud_path(csv_file), csv_file + ) + + df = pd.read_csv(csv_file, dtype=force_dtype) self.trial_dataframes[path] = df except Exception: + logger.debug( + f"Exception occurred when loading trial results. 
See traceback:\n" + f"{traceback.format_exc()}" + ) fail_count += 1 + failed_paths.append(path) if fail_count: - logger.debug("Couldn't read results from {} paths".format(fail_count)) + failed_paths_str = "\n".join([f"- {path}" for path in failed_paths]) + logger.warning( + f"Failed to read the results for {fail_count} trials:\n" + f"{failed_paths_str}" + ) + return self.trial_dataframes def stats(self) -> Dict: @@ -811,9 +964,11 @@ def _get_trial_paths(self) -> List[str]: _trial_paths = [str(t.local_path) for t in self.trials] else: logger.info( - "No `self.trials`. Drawing logdirs from checkpoint " - "file. This may result in some information that is " - "out of sync, as checkpointing is periodic." + "No trial data passed in during `ExperimentAnalysis` initialization -- " + "you are most likely loading the experiment after it has completed.\n" + "Loading trial data from the experiment checkpoint file. " + "This may result in loading some stale information, " + "since checkpointing is periodic." ) self.trials = [] for trial_json_state, path in self._checkpoints_and_paths: diff --git a/python/ray/tune/callback.py b/python/ray/tune/callback.py index 20d57bac93a0..ef057bb30c4f 100644 --- a/python/ray/tune/callback.py +++ b/python/ray/tune/callback.py @@ -292,18 +292,49 @@ def get_state(self) -> Optional[Dict]: This method should be implemented by subclasses to return a dictionary representation of the object's current state. + This is called automatically by Tune to periodically checkpoint callback state. + Upon :ref:`Tune experiment restoration `, + callback state will be restored via :meth:`~ray.tune.Callback.set_state`. + + .. 
code-block:: python + + from typing import Dict, List, Optional + + from ray.tune import Callback + from ray.tune.experiment import Trial + + class MyCallback(Callback): + def __init__(self): + self._trial_ids = set() + + def on_trial_start( + self, iteration: int, trials: List["Trial"], trial: "Trial", **info + ): + self._trial_ids.add(trial.trial_id) + + def get_state(self) -> Optional[Dict]: + return {"trial_ids": self._trial_ids.copy()} + + def set_state(self, state: Dict) -> Optional[Dict]: + self._trial_ids = state["trial_ids"] + Returns: - state: State of the callback. Should be `None` if the callback does not - have any state to save (this is the default). + dict: State of the callback. Should be `None` if the callback does not + have any state to save (this is the default). """ return None def set_state(self, state: Dict): - """Get the state of the callback. + """Set the state of the callback. This method should be implemented by subclasses to restore the callback's state based on the given dict state. + This is used automatically by Tune to restore checkpoint callback state + on :ref:`Tune experiment restoration `. + + See :meth:`~ray.tune.Callback.get_state` for an example implementation. + Args: state: State of the callback. 
""" diff --git a/python/ray/tune/cli/commands.py b/python/ray/tune/cli/commands.py index 5ce5fefe8d4b..f1a821c2571e 100644 --- a/python/ray/tune/cli/commands.py +++ b/python/ray/tune/cli/commands.py @@ -17,11 +17,7 @@ ) from ray.tune.analysis import ExperimentAnalysis from ray.tune import TuneError - -try: - from tabulate import tabulate -except ImportError: - tabulate = None +from ray._private.thirdparty.tabulate.tabulate import tabulate logger = logging.getLogger(__name__) diff --git a/python/ray/tune/constants.py b/python/ray/tune/constants.py new file mode 100644 index 000000000000..0b6698aecb26 --- /dev/null +++ b/python/ray/tune/constants.py @@ -0,0 +1,40 @@ +# ================================================== +# Environment Variables +# ================================================== + +# NOTE: When adding a new environment variable, please track it in this list. +TUNE_ENV_VARS = { + "RAY_AIR_LOCAL_CACHE_DIR", + "TUNE_DISABLE_AUTO_CALLBACK_LOGGERS", + "TUNE_DISABLE_AUTO_CALLBACK_SYNCER", + "TUNE_DISABLE_AUTO_INIT", + "TUNE_DISABLE_DATED_SUBDIR", + "TUNE_NEW_EXECUTION", + "TUNE_DISABLE_STRICT_METRIC_CHECKING", + "TUNE_DISABLE_SIGINT_HANDLER", + "TUNE_FALLBACK_TO_LATEST_CHECKPOINT", + "TUNE_FORCE_TRIAL_CLEANUP_S", + "TUNE_GET_EXECUTOR_EVENT_WAIT_S", + "TUNE_FUNCTION_THREAD_TIMEOUT_S", + "TUNE_GLOBAL_CHECKPOINT_S", + "TUNE_MAX_LEN_IDENTIFIER", + "TUNE_MAX_PENDING_TRIALS_PG", + "TUNE_NODE_SYNCING_MIN_ITER_THRESHOLD", + "TUNE_NODE_SYNCING_MIN_TIME_S_THRESHOLD", + "TUNE_PLACEMENT_GROUP_PREFIX", + "TUNE_PLACEMENT_GROUP_RECON_INTERVAL", + "TUNE_PRINT_ALL_TRIAL_ERRORS", + "TUNE_RESULT_DIR", + "TUNE_RESULT_BUFFER_LENGTH", + "TUNE_RESULT_DELIM", + "TUNE_RESULT_BUFFER_MAX_TIME_S", + "TUNE_RESULT_BUFFER_MIN_TIME_S", + "TUNE_WARN_THRESHOLD_S", + "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", + "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER", + "TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S", + "TUNE_STATE_REFRESH_PERIOD", + 
"TUNE_RESTORE_RETRY_NUM", + "TUNE_CHECKPOINT_CLOUD_RETRY_NUM", + "TUNE_CHECKPOINT_CLOUD_RETRY_WAIT_TIME_S", +} diff --git a/python/ray/tune/examples/tune-default.yaml b/python/ray/tune/examples/tune-default.yaml index 31df4bf29f5e..79d5f003d02d 100644 --- a/python/ray/tune/examples/tune-default.yaml +++ b/python/ray/tune/examples/tune-default.yaml @@ -11,4 +11,4 @@ available_node_types: node_config: {InstanceType: c5.xlarge, ImageId: ami-0b294f219d14e6a82} head_node_type: head_node setup_commands: # Set up each node. - - pip install ray torch torchvision tabulate tensorboard + - pip install ray torch torchvision tensorboard diff --git a/python/ray/tune/examples/tune-local-default.yaml b/python/ray/tune/examples/tune-local-default.yaml index 8cf5a3fe1a4b..58331670ea6a 100644 --- a/python/ray/tune/examples/tune-local-default.yaml +++ b/python/ray/tune/examples/tune-local-default.yaml @@ -8,4 +8,4 @@ auth: {ssh_user: YOUR_USERNAME, ssh_private_key: ~/.ssh/id_rsa} min_workers: 3 max_workers: 3 setup_commands: # Set up each node. 
- - pip install ray torch torchvision tabulate tensorboard + - pip install ray torch torchvision tensorboard diff --git a/python/ray/tune/execution/experiment_state.py b/python/ray/tune/execution/experiment_state.py index daf89075012f..f03297a32051 100644 --- a/python/ray/tune/execution/experiment_state.py +++ b/python/ray/tune/execution/experiment_state.py @@ -8,12 +8,13 @@ import time import warnings -from ray.tune.impl.out_of_band_serialize_dataset import out_of_band_serialize_dataset -from ray.tune import TuneError +from ray.air._internal.remote_storage import list_at_uri +from ray.air._internal.uri_utils import _join_path_or_uri -from ray.tune.syncer import SyncConfig, get_node_to_storage_syncer +from ray.tune import TuneError from ray.tune.experiment import Trial - +from ray.tune.impl.out_of_band_serialize_dataset import out_of_band_serialize_dataset +from ray.tune.syncer import SyncConfig, get_node_to_storage_syncer logger = logging.getLogger(__name__) @@ -69,14 +70,23 @@ def _experiment_checkpoint_exists(experiment_dir: str) -> bool: def _find_newest_experiment_checkpoint(experiment_dir: str) -> Optional[str]: - """Returns file name of most recently modified checkpoint.""" + """Returns file name of most recently created experiment checkpoint. + + Args: + experiment_dir: Local or remote path to the experiment directory + containing at least one experiment checkpoint file. + + Returns: + str: The local or remote path to the latest experiment checkpoint file + based on timestamp. None if no experiment checkpoints were found. 
+ """ def construct(file: str) -> str: - return os.path.join(experiment_dir, file) + return _join_path_or_uri(experiment_dir, file) candidate_paths = [ construct(file) - for file in os.listdir(experiment_dir) + for file in list_at_uri(experiment_dir) if file.startswith("experiment_state") and file.endswith(".json") ] if not candidate_paths: @@ -138,6 +148,13 @@ def __init__( # Upload triggered by trial checkpoints self._sync_every_n_trial_checkpoints = sync_every_n_trial_checkpoints self._trial_num_checkpoints_since_last_sync: Dict[Trial, int] = Counter() + + self._slow_sync_threshold = float( + os.environ.get( + "TUNE_WARN_SLOW_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S", "30" + ) + ) + self._excessive_sync_threshold = float( os.environ.get( "TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S", "30" @@ -206,7 +223,7 @@ def checkpoint( # Checkpoint checkpoint_time_start = time.monotonic() - # NOTE: This context manager is for Ray Datasets captured in a trial config. + # NOTE: This context manager is for Datasets captured in a trial config. # This is the case when *tuning over datasets*. # If the datasets have already been full executed, then serializing # block refs means that this checkpoint is not usable in a new Ray cluster. @@ -266,17 +283,41 @@ def sync_up(self, force: bool = False, wait: bool = False) -> bool: exclude=exclude, ) + start_time = time.monotonic() if wait: self._syncer.wait() + now = time.monotonic() + sync_time_taken = now - start_time + + if sync_time_taken > self._slow_sync_threshold: + try: + import fsspec + except Exception: + fsspec = None + + fsspec_msg = "" + if fsspec is None: + fsspec_msg = ( + "If your data is small, try installing fsspec " + "(`pip install fsspec`) for more efficient local file parsing. " + ) + + logger.warning( + "Syncing the experiment checkpoint to cloud took a long time with " + f"{sync_time_taken:.2f} seconds. 
This can be due to a large number " + f"of trials, large logfiles, or throttling from the " + f"remote storage provider for too frequent syncs. {fsspec_msg}" + f"If your `CheckpointConfig.num_to_keep` is a low number, this can " + f"trigger frequent syncing, in which case you should increase it. " + ) + if not synced: return False self._should_force_cloud_sync = False self._trial_num_checkpoints_since_last_sync.clear() - # syncing might have taken some time, so we grab the current timestamp again - now = time.time() if now - self._last_sync_time < self._excessive_sync_threshold: logger.warning( "Experiment checkpoint syncing has been triggered multiple " diff --git a/python/ray/tune/execution/insufficient_resources_manager.py b/python/ray/tune/execution/insufficient_resources_manager.py index cc255ed9b267..c0755914c8ac 100644 --- a/python/ray/tune/execution/insufficient_resources_manager.py +++ b/python/ray/tune/execution/insufficient_resources_manager.py @@ -3,7 +3,7 @@ import os import ray import time -from typing import Dict +from typing import Dict, Optional, Tuple from ray.tune.execution.cluster_info import _is_ray_cluster from ray.tune.experiment import Trial @@ -18,10 +18,10 @@ def _get_cluster_resources_no_autoscaler() -> Dict: return ray.cluster_resources() -def _get_trial_cpu_and_gpu(trial: Trial) -> Dict: +def _get_trial_cpu_and_gpu(trial: Trial) -> Tuple[int, int]: cpu = trial.placement_group_factory.required_resources.get("CPU", 0) gpu = trial.placement_group_factory.required_resources.get("GPU", 0) - return {"CPU": cpu, "GPU": gpu} + return cpu, gpu def _can_fulfill_no_autoscaler(trial: Trial) -> bool: @@ -30,11 +30,11 @@ def _can_fulfill_no_autoscaler(trial: Trial) -> bool: For no autoscaler case. 
""" assert trial.status == Trial.PENDING - trial_cpu_gpu = _get_trial_cpu_and_gpu(trial) + asked_cpus, asked_gpus = _get_trial_cpu_and_gpu(trial) - return trial_cpu_gpu["CPU"] <= _get_cluster_resources_no_autoscaler().get( + return asked_cpus <= _get_cluster_resources_no_autoscaler().get( "CPU", 0 - ) and trial_cpu_gpu["GPU"] <= _get_cluster_resources_no_autoscaler().get("GPU", 0) + ) and asked_gpus <= _get_cluster_resources_no_autoscaler().get("GPU", 0) @lru_cache() @@ -52,38 +52,68 @@ def _get_insufficient_resources_warning_threshold() -> float: return float(os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "60")) +MSG_TRAIN_START = ( + "Training has not started in the last {wait_time:.0f} seconds. " + "This could be due to the cluster not having enough resources available. " +) +MSG_TRAIN_INSUFFICIENT = ( + "You asked for {asked_cpus} CPUs and {asked_gpus} GPUs, but the cluster only " + "has {cluster_cpus} CPUs and {cluster_gpus} GPUs available. " +) +MSG_TRAIN_END = ( + "Stop the training and adjust the required resources (e.g. via the " + "`ScalingConfig` or `resources_per_trial`, or `num_workers` for rllib), " + "or add more resources to your cluster." +) + +MSG_TUNE_START = ( + "No trial is running and no new trial has been started within " + "the last {wait_time:.0f} seconds. " + "This could be due to the cluster not having enough resources available. " +) +MSG_TUNE_INSUFFICIENT = ( + "You asked for {asked_cpus} CPUs and {asked_gpus} GPUs per trial, " + "but the cluster only has {cluster_cpus} CPUs and {cluster_gpus} GPUs available. " +) +MSG_TUNE_END = ( + "Stop the tuning and adjust the required resources (e.g. via the " + "`ScalingConfig` or `resources_per_trial`, or `num_workers` for rllib), " + "or add more resources to your cluster." +) + + # TODO(xwjiang): Consider having a help page with more detailed instructions. 
@lru_cache() -def _get_insufficient_resources_warning_msg() -> str: - msg = ( - f"No trial is running and no new trial has been started within" - f" at least the last " - f"{_get_insufficient_resources_warning_threshold()} seconds. " - f"This could be due to the cluster not having enough " - f"resources available to start the next trial. " - f"Stop the tuning job and adjust the resources requested per trial " - f"(possibly via `resources_per_trial` or via `num_workers` for rllib) " - f"and/or add more resources to your Ray runtime." - ) - if _is_ray_cluster(): - return "Ignore this message if the cluster is autoscaling. " + msg +def _get_insufficient_resources_warning_msg( + for_train: bool = False, trial: Optional[Trial] = None +) -> str: + msg = "Ignore this message if the cluster is autoscaling. " + + if for_train: + start = MSG_TRAIN_START + insufficient = MSG_TRAIN_INSUFFICIENT + end = MSG_TRAIN_END else: - return msg + start = MSG_TUNE_START + insufficient = MSG_TUNE_INSUFFICIENT + end = MSG_TUNE_END + + msg += start.format(wait_time=_get_insufficient_resources_warning_threshold()) + + if trial: + asked_cpus, asked_gpus = _get_trial_cpu_and_gpu(trial) + cluster_resources = _get_cluster_resources_no_autoscaler() + + msg += insufficient.format( + asked_cpus=asked_cpus, + asked_gpus=asked_gpus, + cluster_cpus=cluster_resources.get("CPU", 0), + cluster_gpus=cluster_resources.get("GPU", 0), + ) + msg += end -# A beefed up version when Tune Error is raised. -def _get_insufficient_resources_error_msg(trial: Trial) -> str: - trial_cpu_gpu = _get_trial_cpu_and_gpu(trial) - return ( - f"Ignore this message if the cluster is autoscaling. " - f"You asked for {trial_cpu_gpu['CPU']} cpu and " - f"{trial_cpu_gpu['GPU']} gpu per trial, but the cluster only has " - f"{_get_cluster_resources_no_autoscaler().get('CPU', 0)} cpu and " - f"{_get_cluster_resources_no_autoscaler().get('GPU', 0)} gpu. 
" - f"Stop the tuning job and adjust the resources requested per trial " - f"(possibly via `resources_per_trial` or via `num_workers` for rllib) " - f"and/or add more resources to your Ray runtime." - ) + return msg class _InsufficientResourcesManager: @@ -94,10 +124,11 @@ class _InsufficientResourcesManager: act upon. """ - def __init__(self): + def __init__(self, for_train: bool = False): # The information tracked across the life time of Tune loop. self._no_running_trials_since = -1 self._last_trial_num = -1 + self._for_train = for_train def on_no_available_trials(self, all_trials): """Tracks information across the life of Tune loop and makes guesses @@ -115,22 +146,21 @@ def on_no_available_trials(self, all_trials): time.monotonic() - self._no_running_trials_since > _get_insufficient_resources_warning_threshold() ): - if not _is_ray_cluster(): # autoscaler not enabled - # If any of the pending trial cannot be fulfilled, - # that's a good enough hint of trial resources not enough. - for trial in all_trials: - if ( - trial.status is Trial.PENDING - and not _can_fulfill_no_autoscaler(trial) - ): - # TODO(xwjiang): - # Raise an Error once #18608 is resolved. - logger.warning(_get_insufficient_resources_error_msg(trial)) - break - else: - # TODO(xwjiang): #17799. - # Output a more helpful msg for autoscaler. 
- logger.warning(_get_insufficient_resources_warning_msg()) + can_fulfill_any = any( + trial.status == Trial.PENDING and _can_fulfill_no_autoscaler(trial) + for trial in all_trials + ) + + if can_fulfill_any: + # If one trial can be fulfilled, it will be fulfilled eventually + self._no_running_trials_since = -1 + return + + # Otherwise, can fulfill none + msg = _get_insufficient_resources_warning_msg( + for_train=self._for_train, trial=all_trials[0] + ) + logger.warning(msg) self._no_running_trials_since = time.monotonic() else: self._no_running_trials_since = -1 diff --git a/python/ray/tune/execution/ray_trial_executor.py b/python/ray/tune/execution/ray_trial_executor.py index 0a31e063f808..5869b0100053 100644 --- a/python/ray/tune/execution/ray_trial_executor.py +++ b/python/ray/tune/execution/ray_trial_executor.py @@ -7,7 +7,6 @@ import traceback from collections import deque from enum import Enum -from functools import partial from typing import Callable, Dict, Iterable, Optional, Set, Union import ray @@ -57,6 +56,10 @@ COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, "TUNE_CHECKPOINT_CLOUD_RETRY_NUM", "TUNE_CHECKPOINT_CLOUD_RETRY_WAIT_TIME_S", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SECURITY_TOKEN", + "AWS_SESSION_TOKEN", } @@ -946,13 +949,6 @@ def save( dir_or_data=value, storage_mode=storage, metrics=result, - local_to_remote_path_fn=partial( - TrainableUtil.get_remote_storage_path, - logdir=trial.local_path, - remote_checkpoint_dir=trial.remote_path, - ) - if trial.uses_cloud_checkpointing - else None, ) trial.saving_to = checkpoint self._futures[value] = (_ExecutorEventType.SAVING_RESULT, trial) diff --git a/python/ray/tune/execution/trial_runner.py b/python/ray/tune/execution/trial_runner.py index 185ec391b3c0..196291a21a53 100644 --- a/python/ray/tune/execution/trial_runner.py +++ b/python/ray/tune/execution/trial_runner.py @@ -1,3 +1,4 @@ +import uuid from typing import Any, Dict, List, Optional, Union, Tuple, Set from datetime import 
datetime @@ -133,12 +134,15 @@ def __init__( callbacks: Optional[List[Callback]] = None, metric: Optional[str] = None, trial_checkpoint_config: Optional[CheckpointConfig] = None, + _trainer_api: bool = False, ): self._search_alg = search_alg or BasicVariantGenerator() self._placeholder_resolvers = placeholder_resolvers self._scheduler_alg = scheduler or FIFOScheduler() self._callbacks = CallbackList(callbacks or []) - self._insufficient_resources_manager = _InsufficientResourcesManager() + self._insufficient_resources_manager = _InsufficientResourcesManager( + for_train=_trainer_api + ) self._pending_trial_queue_times = {} self._max_pending_trials = _get_max_pending_trials(self._search_alg) @@ -315,10 +319,15 @@ def experiment_state_file_name(self) -> str: @property def experiment_state_path(self) -> str: + """Returns the local experiment checkpoint path.""" return os.path.join( self._local_experiment_path, self.experiment_state_file_name ) + @property + def experiment_path(self) -> str: + return self._remote_experiment_path or self._local_experiment_path + def _create_checkpoint_manager(self): return _ExperimentCheckpointManager( local_checkpoint_dir=self._local_experiment_path, @@ -328,12 +337,6 @@ def _create_checkpoint_manager(self): sync_every_n_trial_checkpoints=self._trial_checkpoint_config.num_to_keep, ) - @property - def _remote_checkpoint_dir(self): - if self._sync_config.upload_dir and self._experiment_dir_name: - return str(URI(self._sync_config.upload_dir) / self._experiment_dir_name) - return None - @classmethod def checkpoint_exists(cls, directory: str) -> bool: if not os.path.exists(directory): @@ -365,7 +368,9 @@ def save_to_dir(self, experiment_dir: Optional[str] = None): }, } - tmp_file_name = os.path.join(experiment_dir, ".tmp_experiment_state") + tmp_file_name = os.path.join( + experiment_dir, f".tmp_experiment_state_{uuid.uuid4()}" + ) with open(tmp_file_name, "w") as f: json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) @@ -517,6 
+522,11 @@ def resume( trial_to_add.status = Trial.TERMINATED self.add_trial(trial_to_add) + def update_max_pending_trials(self, max_pending_trials: Optional[int] = None): + self._max_pending_trials = max_pending_trials or _get_max_pending_trials( + self._search_alg + ) + def update_pending_trial_resources( self, resources: Union[dict, PlacementGroupFactory] ): @@ -1250,6 +1260,7 @@ def __init__( callbacks: Optional[List[Callback]] = None, metric: Optional[str] = None, trial_checkpoint_config: Optional[CheckpointConfig] = None, + _trainer_api: bool = False, # Deprecated local_checkpoint_dir: Optional[str] = None, ): @@ -1285,6 +1296,7 @@ def __init__( callbacks=callbacks, metric=metric, trial_checkpoint_config=trial_checkpoint_config, + _trainer_api=_trainer_api, ) self.trial_executor.setup( @@ -1306,6 +1318,10 @@ def _wrapped(self): executor_whitelist_attr={"has_resources_for_trial", "pause_trial", "save"}, ) + def update_max_pending_trials(self, max_pending_trials: Optional[int] = None): + super().update_max_pending_trials(max_pending_trials=max_pending_trials) + self.trial_executor._max_staged_actors = self._max_pending_trials + def _used_resources_string(self) -> str: return self.trial_executor.debug_string() @@ -1602,7 +1618,9 @@ def _get_max_pending_trials(search_alg: SearchAlgorithm) -> int: # Use a minimum of 16 to trigger fast autoscaling # Scale up to at most the number of available cluster CPUs cluster_cpus = ray.cluster_resources().get("CPU", 1.0) - max_pending_trials = max(16, int(cluster_cpus * 1.1)) + max_pending_trials = min( + max(search_alg.total_samples, 16), max(16, int(cluster_cpus * 1.1)) + ) if max_pending_trials > 128: logger.warning( diff --git a/python/ray/tune/execution/tune_controller.py b/python/ray/tune/execution/tune_controller.py index 8ad4aebcc288..95d2fa11121c 100644 --- a/python/ray/tune/execution/tune_controller.py +++ b/python/ray/tune/execution/tune_controller.py @@ -14,6 +14,7 @@ from ray.air._internal.checkpoint_manager 
import CheckpointStorage, _TrackedCheckpoint from ray.air.execution import ResourceManager, PlacementGroupResourceManager from ray.air.execution._internal import RayActorManager, TrackedActor +from ray.exceptions import RayActorError from ray.tune.error import _AbortTrialExecution from ray.tune.execution.ray_trial_executor import _class_cache from ray.tune.execution.trial_runner import _TuneControllerBase, TrialRunnerWrapper @@ -34,6 +35,7 @@ from ray.tune.syncer import SyncConfig from ray.tune.experiment import Trial from ray.tune.utils import warn_if_slow +from ray.tune.utils.log import _dedup_logs from ray.tune.utils.object_cache import _ObjectCache from ray.tune.utils.resource_updater import _ResourceUpdater from ray.util.annotations import DeveloperAPI @@ -65,13 +67,14 @@ def __init__( chdir_to_trial_dir: bool = False, reuse_actors: bool = False, resource_manager_factory: Optional[Callable[[], ResourceManager]] = None, + _trainer_api: bool = False, ): if resource_manager_factory: - self._resource_manager = resource_manager_factory() + resource_manager = resource_manager_factory() else: - self._resource_manager = PlacementGroupResourceManager() + resource_manager = PlacementGroupResourceManager() - self._actor_manager = RayActorManager(resource_manager=self._resource_manager) + self._actor_manager = RayActorManager(resource_manager=resource_manager) self._class_cache = _class_cache @@ -143,6 +146,7 @@ def __init__( callbacks=callbacks, metric=metric, trial_checkpoint_config=trial_checkpoint_config, + _trainer_api=_trainer_api, ) def _wrapped(self): @@ -215,6 +219,11 @@ def _cleanup_stopping_actors(self, force_all: bool = False): continue _, tracked_actor = times.popleft() + + if tracked_actor not in self._stopping_actors: + # Actor stopping has been handled by the block above + continue + if self._actor_manager.is_actor_started(tracked_actor=tracked_actor): logger.debug(f"Forcefully killing actor: {tracked_actor}") 
self._actor_manager.remove_actor(tracked_actor=tracked_actor, kill=True) @@ -376,8 +385,6 @@ def _maybe_update_trial_queue(self): def _cleanup_trials(self): logger.debug("CLEANING UP all trials") - self._cleanup_cached_actors(force_all=True) - for tracked_actor in list(self._actor_to_trial): trial = self._actor_to_trial[tracked_actor] logger.debug( @@ -386,9 +393,15 @@ def _cleanup_trials(self): ) self._schedule_trial_stop(trial) + # Clean up cached actors now + self._cleanup_cached_actors(force_all=True) + start = time.monotonic() while time.monotonic() - start < 5 and self._actor_manager.num_total_actors: - logger.debug("Waiting for actor manager to clean up final state") + if _dedup_logs("actor_manager_cleanup", str(start)): + logger.debug( + "Waiting for actor manager to clean up final state [dedup]" + ) self._actor_manager.next(timeout=1) logger.debug("Force cleanup of remaining actors") @@ -431,19 +444,27 @@ def _maybe_add_actors(self) -> None: trial_to_run = self._scheduler_alg.choose_trial_to_run(self._wrapped()) if trial_to_run: - logger.debug(f"Chose trial to run from scheduler: {trial_to_run}") + if _dedup_logs("trial_to_run_chosen", trial_to_run.trial_id): + logger.debug( + f"Chose trial to run from scheduler: {trial_to_run} [dedup]" + ) if ( trial_to_run not in self._staged_trials and trial_to_run not in self._trial_to_actor ): logger.debug(f"Staging trial to run: {trial_to_run}") + self._set_trial_status(trial_to_run, Trial.PENDING) self._staged_trials.add(trial_to_run) self._actor_cache.increase_max(trial_to_run.placement_group_factory) # schedule_trial_actor also potentially uses cached actors self._schedule_trial_actor(trial_to_run) else: # Otherwise, only try to use the cached actor - logger.debug(f"Trying to re-use actor for trial to run: {trial_to_run}") + if _dedup_logs("trial_to_run_reuse", trial_to_run.trial_id): + logger.debug( + f"Trying to re-use actor for trial to run: {trial_to_run} " + f"[dedup]" + ) 
self._maybe_reuse_cached_actor(trial_to_run) ### @@ -452,7 +473,7 @@ def _maybe_add_actors(candidates: List[Trial]): new_candidates = [] while candidates: - if len(self._staged_trials) >= self._max_pending_trials: + if self._actor_manager.num_pending_actors >= self._max_pending_trials: break trial = candidates.pop(0) @@ -460,7 +481,7 @@ def _maybe_add_actors(candidates: List[Trial]): # If the trial is part of the list, but not of the set, # we just ignore it. Removing it from the list on status # change is too expensive. - if trial not in (self._pending_trials | self._paused_trials): + if trial not in self._pending_trials: continue if trial in self._trial_to_actor: @@ -517,6 +538,7 @@ def _maybe_reuse_cached_actor(self, trial: Trial) -> bool: if trial in self._trial_to_actor: original_actor = self._trial_to_actor.pop(trial) self._actor_to_trial.pop(original_actor) + logger.debug(f"Removing ORIGINAL ACTOR for trial {trial}: {original_actor}") self._remove_actor(tracked_actor=original_actor) @@ -541,7 +563,7 @@ def _schedule_trial_actor(self, trial: Trial): """ logger.debug(f"Trying to schedule new ACTOR for trial {trial}") - self._set_trial_status(trial, Trial.PENDING) + assert trial.status == Trial.PENDING trial.init_logdir() # We checkpoint metadata here to try mitigating logdir duplication @@ -563,10 +585,13 @@ def _schedule_trial_actor(self, trial: Trial): trainable_cls = trial.get_trainable_cls() if not trainable_cls: - raise _AbortTrialExecution( + exception = _AbortTrialExecution( f"Invalid trainable: {trial.trainable_name}. If you passed " f"a string, make sure the trainable was registered before." 
) + self._schedule_trial_stop(trial, exception=exception) + return + _actor_cls = self._class_cache.get(trainable_cls) trial.set_location(_Location()) @@ -741,6 +766,14 @@ def _actor_failed(self, tracked_actor: TrackedActor, exception: Exception): self._unstage_trial_with_resources(trial) self._trial_task_failure(trial, exception=exception) + self._actor_manager.clear_actor_task_futures(tracked_actor) + + # Clean up actor + tracked_actor.set_on_stop(None) + tracked_actor.set_on_error(None) + self._actor_manager.remove_actor(tracked_actor, kill=False) + + # Trigger actor stopped callback self._actor_stopped(tracked_actor) def _schedule_trial_task( @@ -793,7 +826,12 @@ def _on_result(tracked_actor: TrackedActor, *args, **kwargs): if on_error: def _on_error(tracked_actor: TrackedActor, exception: Exception): - assert trial == self._actor_to_trial[tracked_actor] + # If the actor failed, it has already been cleaned up. + if tracked_actor not in self._actor_to_trial: + assert isinstance(exception, RayActorError), type(exception) + else: + assert trial == self._actor_to_trial[tracked_actor] + logger.debug( f"Future {method_name.upper()} FAILED for trial {trial}: " f"{exception}" @@ -832,7 +870,7 @@ def _trial_task_failure(self, trial: Trial, exception: Exception): raise exception else: if self._print_trial_errors: - logger.error("Trial task failed", exc_info=exception) + logger.error(f"Trial task failed for trial {trial}", exc_info=exception) self._process_trial_failure(trial, exception=exception) def _schedule_trial_stop(self, trial: Trial, exception: Optional[Exception] = None): @@ -971,13 +1009,6 @@ def _schedule_trial_save( dir_or_data=future, storage_mode=storage, metrics=result, - local_to_remote_path_fn=partial( - TrainableUtil.get_remote_storage_path, - logdir=trial.logdir, - remote_checkpoint_dir=trial.remote_checkpoint_dir, - ) - if trial.uses_cloud_checkpointing - else None, ) trial.saving_to = checkpoint @@ -1120,7 +1151,6 @@ def _on_trial_reset(self, trial: 
Trial, success: bool): def __getstate__(self): state = super().__getstate__() for exclude in [ - "_resource_manager", "_actor_manager", "_class_cache", "_resource_updater", diff --git a/python/ray/tune/experiment/trial.py b/python/ray/tune/experiment/trial.py index 453aa9b06ab9..c67bb045be42 100644 --- a/python/ray/tune/experiment/trial.py +++ b/python/ray/tune/experiment/trial.py @@ -16,6 +16,7 @@ import uuid import ray +from ray._private.dict import unflatten_dict from ray.air import CheckpointConfig from ray.air._internal.uri_utils import URI from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage @@ -626,6 +627,7 @@ def last_result(self) -> dict: @last_result.setter def last_result(self, val: dict): self._last_result = val + self.invalidate_json_state() def get_runner_ip(self) -> Optional[str]: if self.location.hostname: @@ -987,7 +989,8 @@ def on_checkpoint(self, checkpoint: _TrackedCheckpoint): def on_restore(self): """Handles restoration completion.""" assert self.is_restoring - self.last_result = self.restoring_from.metrics + self.last_result = unflatten_dict(self.restoring_from.metrics) + self.last_result.setdefault("config", self.config) self.restoring_from = None self.num_restore_failures = 0 self.invalidate_json_state() @@ -1058,7 +1061,8 @@ def update_last_result(self, result): self.metric_analysis[metric][key] = sum( self.metric_n_steps[metric][str(n)] ) / len(self.metric_n_steps[metric][str(n)]) - self.invalidate_json_state() + + # json state is invalidated in last_result.setter def get_trainable_cls(self): if self.stub: diff --git a/python/ray/tune/experimental/output.py b/python/ray/tune/experimental/output.py index 726e4696b01c..cbd75aa45dfe 100644 --- a/python/ray/tune/experimental/output.py +++ b/python/ray/tune/experimental/output.py @@ -1,4 +1,15 @@ -from typing import List, Dict, Optional, Tuple, Any, TYPE_CHECKING +import sys +from typing import ( + Any, + Collection, + Dict, + Iterable, + List, + Optional, + 
Tuple, + Union, + TYPE_CHECKING, +) import contextlib import collections @@ -11,10 +22,12 @@ import numpy as np import os import pandas as pd -from tabulate import tabulate import textwrap import time +from ray.tune.search.sample import Domain +from ray.tune.utils.log import Verbosity + try: import rich import rich.layout @@ -23,10 +36,15 @@ rich = None import ray -from ray._private.dict import unflattened_lookup +from ray._private.dict import unflattened_lookup, flatten_dict +from ray._private.thirdparty.tabulate.tabulate import ( + tabulate, + TableFormat, + Line, + DataRow, +) from ray.air._internal.checkpoint_manager import _TrackedCheckpoint from ray.tune.callback import Callback -from ray.tune.logger import pretty_print from ray.tune.result import ( AUTO_RESULT_KEYS, EPISODE_REWARD_MEAN, @@ -85,10 +103,35 @@ class AirVerbosity(IntEnum): IS_NOTEBOOK = ray.widgets.util.in_notebook() -def get_air_verbosity() -> Optional[AirVerbosity]: - verbosity = os.environ.get("AIR_VERBOSITY", None) - if verbosity: - return AirVerbosity(int(verbosity)) if verbosity else None +def get_air_verbosity( + verbose: Union[int, AirVerbosity, Verbosity] +) -> Optional[AirVerbosity]: + if os.environ.get("RAY_AIR_NEW_OUTPUT", "0") == "0": + return None + + if isinstance(verbose, AirVerbosity): + return verbose + + verbose_int = verbose if isinstance(verbose, int) else verbose.value + + # Verbosity 2 and 3 both map to AirVerbosity 2 + verbose_int = min(2, verbose_int) + + return AirVerbosity(verbose_int) + + +def _infer_params(config: Dict[str, Any]) -> List[str]: + params = [] + flat_config = flatten_dict(config) + for key, val in flat_config.items(): + if isinstance(val, Domain): + params.append(key) + # Grid search is a special named field. 
Because we flattened + # the whole config, we look it up per string + if key.endswith("/grid_search"): + # Truncate `/grid_search` + params.append(key[:-12]) + return params def _get_time_str(start_time: float, current_time: float) -> Tuple[str, str]: @@ -106,22 +149,27 @@ def _get_time_str(start_time: float, current_time: float) -> Tuple[str, str]: delta: datetime.timedelta = current_time_dt - start_time_dt rest = delta.total_seconds() - days = rest // (60 * 60 * 24) + days = int(rest // (60 * 60 * 24)) rest -= days * (60 * 60 * 24) - hours = rest // (60 * 60) + hours = int(rest // (60 * 60)) rest -= hours * (60 * 60) - minutes = rest // 60 + minutes = int(rest // 60) - seconds = rest - minutes * 60 + seconds = int(rest - minutes * 60) + running_for_str = "" if days > 0: - running_for_str = f"{days:.0f} days, " - else: - running_for_str = "" + running_for_str += f"{days:d}d " + + if hours > 0 or running_for_str: + running_for_str += f"{hours:d}hr " - running_for_str += f"{hours:02.0f}:{minutes:02.0f}:{seconds:05.2f}" + if minutes > 0 or running_for_str: + running_for_str += f"{minutes:d}min " + + running_for_str += f"{seconds:d}s" return f"{current_time_dt:%Y-%m-%d %H:%M:%S}", running_for_str @@ -230,17 +278,31 @@ def _max_len(value: Any, max_len: int = 20, wrap: bool = False) -> Any: return result -def _get_trial_info(trial: Trial, metric_keys: List[str]) -> List[str]: +def _get_trial_info( + trial: Trial, param_keys: List[str], metric_keys: List[str] +) -> List[str]: """Returns the following information about a trial: name | status | metrics... Args: trial: Trial to get information for. + param_keys: Names of parameters to include. metric_keys: Names of metrics to include. 
""" result = trial.last_result trial_info = [str(trial), trial.status] + + # params + trial_info.extend( + [ + _max_len( + unflattened_lookup(param, trial.config, default=None), + ) + for param in param_keys + ] + ) + # metrics trial_info.extend( [ _max_len( @@ -255,6 +317,7 @@ def _get_trial_info(trial: Trial, metric_keys: List[str]) -> List[str]: def _get_trial_table_data_per_status( status: str, trials: List[Trial], + param_keys: List[str], metric_keys: List[str], force_max_rows: bool = False, ) -> Optional[_PerStatusTrialTableData]: @@ -263,6 +326,7 @@ def _get_trial_table_data_per_status( Args: status: The trial status of interest. trials: all the trials of that status. + param_keys: *Ordered* list of parameters to be displayed in the table. metric_keys: *Ordered* list of metrics to be displayed in the table. Including both default and user defined. force_max_rows: Whether or not to enforce a max row number for this status. @@ -280,23 +344,28 @@ def _get_trial_table_data_per_status( more_info = None for t in trials: if len(trial_infos) >= max_row: - more_info = f"... and {str(len(trials) - max_row)} more {status} ..." + remaining = len(trials) - max_row + more_info = f"{remaining} more {status}" break - trial_infos.append(_get_trial_info(t, metric_keys)) + trial_infos.append(_get_trial_info(t, param_keys, metric_keys)) return _PerStatusTrialTableData(trial_infos, more_info) def _get_trial_table_data( trials: List[Trial], + param_keys: List[str], metric_keys: List[str], + all_rows: bool = False, ) -> _TrialTableData: """Generate a table showing the current progress of tuning trials. Args: trials: List of trials for which progress is to be shown. + param_keys: Ordered list of parameters to be displayed in the table. metric_keys: Ordered list of metrics to be displayed in the table. Including both default and user defined. Will only be shown if at least one trial is having the key. + all_rows: Force to show all rows. 
Returns: Trial table data, including header and trial table per each status. @@ -320,19 +389,29 @@ def _get_trial_table_data( formatted_metric_columns = [ _max_len(k, max_len=max_column_length, wrap=True) for k in metric_keys ] - # Map to the abbreviated version if necessary. - header = ["Trial name", "status"] + [ - DEFAULT_COLUMNS[key] if key in DEFAULT_COLUMNS else key - for key in formatted_metric_columns + + formatted_param_columns = [ + _max_len(k, max_len=max_column_length, wrap=True) for k in param_keys + ] + + metric_header = [ + DEFAULT_COLUMNS[metric] if metric in DEFAULT_COLUMNS else formatted + for metric, formatted in zip(metric_keys, formatted_metric_columns) ] + param_header = formatted_param_columns + + # Map to the abbreviated version if necessary. + header = ["Trial name", "status"] + param_header + metric_header + trial_data = list() for t_status in ORDER: trial_data_per_status = _get_trial_table_data_per_status( t_status, trials_by_state[t_status], - metric_keys=formatted_metric_columns, - force_max_rows=len(trials) > max_trial_num_to_show, + param_keys=param_keys, + metric_keys=metric_keys, + force_max_rows=not all_rows and len(trials) > max_trial_num_to_show, ) if trial_data_per_status: trial_data.append(trial_data_per_status) @@ -356,6 +435,92 @@ def _best_trial_str( ) +def _render_table_item( + key: str, item: Any, prefix: str = "" +) -> Iterable[Tuple[str, str]]: + key = prefix + key + if isinstance(item, float): + # tabulate does not work well with mixed-type columns, so we format + # numbers ourselves. 
+ yield key, f"{item:.5f}".rstrip("0") + else: + yield key, _max_len(item, 20) + + +def _get_dict_as_table_data( + data: Dict, + include: Optional[Collection] = None, + exclude: Optional[Collection] = None, + upper_keys: Optional[Collection] = None, +): + include = include or set() + exclude = exclude or set() + upper_keys = upper_keys or set() + + upper = [] + lower = [] + + flattened = flatten_dict(data) + + for key, value in sorted(flattened.items()): + if include and key not in include: + continue + if key in exclude: + continue + + for k, v in _render_table_item(str(key), value): + if key in upper_keys: + upper.append([k, v]) + else: + lower.append([k, v]) + + if not upper: + return lower + elif not lower: + return upper + else: + return upper + lower + + +# Copied/adjusted from tabulate +AIR_TABULATE_TABLEFMT = TableFormat( + lineabove=Line("╭", "─", "─", "╮"), + linebelowheader=Line("├", "─", "─", "┤"), + linebetweenrows=None, + linebelow=Line("╰", "─", "─", "╯"), + headerrow=DataRow("│", " ", "│"), + datarow=DataRow("│", " ", "│"), + padding=1, + with_header_hide=None, +) + + +def _print_dict_as_table( + data: Dict, + header: Optional[str] = None, + include: Optional[Collection[str]] = None, + exclude: Optional[Collection[str]] = None, + division: Optional[Collection[str]] = None, +): + table_data = _get_dict_as_table_data( + data=data, include=include, exclude=exclude, upper_keys=division + ) + + headers = [header, ""] if header else [] + + if not table_data: + return + + print( + tabulate( + table_data, + headers=headers, + colalign=("left", "right"), + tablefmt=AIR_TABULATE_TABLEFMT, + ) + ) + + class ProgressReporter: """Periodically prints out status update.""" @@ -374,19 +539,43 @@ def __init__(self, verbosity: AirVerbosity): self._start_time = time.time() self._last_heartbeat_time = 0 + def experiment_started( + self, + experiment_name: str, + experiment_path: str, + searcher_str: str, + scheduler_str: str, + total_num_samples: int, + 
tensorboard_path: Optional[str] = None, + **kwargs, + ): + print(f"\nView detailed results here: {experiment_path}") + + if tensorboard_path: + print( + f"To visualize your results with TensorBoard, run: " + f"`tensorboard --logdir {tensorboard_path}`" + ) + + print("") + @property def _time_heartbeat_str(self): - current_time_str, running_for_str = _get_time_str(self._start_time, time.time()) - return f"Current time: {current_time_str} " f"(running for {running_for_str})" + current_time_str, running_time_str = _get_time_str( + self._start_time, time.time() + ) + return ( + f"Current time: {current_time_str}. Total running time: " + running_time_str + ) def print_heartbeat(self, trials, *args, force: bool = False): if self._verbosity < self._heartbeat_threshold: return if force or time.time() - self._last_heartbeat_time > self._heartbeat_freq: - self._print_heartbeat(trials, *args) + self._print_heartbeat(trials, *args, force=force) self._last_heartbeat_time = time.time() - def _print_heartbeat(self, trials, *args): + def _print_heartbeat(self, trials, *args, force: bool = False): raise NotImplementedError @@ -395,19 +584,32 @@ def _detect_reporter( num_samples: int, metric: Optional[str] = None, mode: Optional[str] = None, + config: Optional[Dict] = None, ): # TODO: Add JupyterNotebook and Ray Client case later. - rich_enabled = "ENABLE_RICH" in os.environ + rich_enabled = bool(int(os.environ.get("RAY_AIR_RICH_LAYOUT", "0"))) if num_samples and num_samples > 1: if rich_enabled: if not rich: raise ImportError("Please run `pip install rich`. 
") - reporter = TuneRichReporter(verbosity, num_samples, metric, mode) + reporter = TuneRichReporter( + verbosity, + num_samples=num_samples, + metric=metric, + mode=mode, + config=config, + ) else: - reporter = TuneTerminalReporter(verbosity, num_samples, metric, mode) + reporter = TuneTerminalReporter( + verbosity, + num_samples=num_samples, + metric=metric, + mode=mode, + config=config, + ) else: if rich_enabled: - logger.warning("`ENABLE_RICH` is only effective with Tune usecase.") + logger.warning("`RAY_AIR_RICH_LAYOUT` is only effective with Tune usecase.") reporter = TrainReporter(verbosity) return reporter @@ -421,12 +623,14 @@ def __init__( num_samples: int, metric: Optional[str] = None, mode: Optional[str] = None, + config: Optional[Dict] = None, ): self._num_samples = num_samples self._metric = metric self._mode = mode # will be populated when first result comes in. self._inferred_metric = None + self._inferred_params = _infer_params(config) super(TuneReporterBase, self).__init__(verbosity=verbosity) def _get_overall_trial_progress_str(self, trials): @@ -439,7 +643,9 @@ def _get_overall_trial_progress_str(self, trials): return f"Trial status: {result}" # TODO: Return a more structured type to share code with Jupyter flow. 
- def _get_heartbeat(self, trials, *sys_args) -> Tuple[List[str], _TrialTableData]: + def _get_heartbeat( + self, trials, *sys_args, force_full_output: bool = False + ) -> Tuple[List[str], _TrialTableData]: result = list() # Trial status: 1 RUNNING | 7 PENDING result.append(self._get_overall_trial_progress_str(trials)) @@ -460,36 +666,83 @@ def _get_heartbeat(self, trials, *sys_args) -> Tuple[List[str], _TrialTableData] all_metrics = list(DEFAULT_COLUMNS.keys()) + self._inferred_metric - trial_table_data = _get_trial_table_data(trials, all_metrics) + trial_table_data = _get_trial_table_data( + trials, + param_keys=self._inferred_params, + metric_keys=all_metrics, + all_rows=force_full_output, + ) return result, trial_table_data - def _print_heartbeat(self, trials, *sys_args): + def _print_heartbeat(self, trials, *sys_args, force: bool = False): raise NotImplementedError class TuneTerminalReporter(TuneReporterBase): - def _print_heartbeat(self, trials, *sys_args): - if self._verbosity < self._heartbeat_threshold: + def experiment_started( + self, + experiment_name: str, + experiment_path: str, + searcher_str: str, + scheduler_str: str, + total_num_samples: int, + tensorboard_path: Optional[str] = None, + **kwargs, + ): + if total_num_samples > sys.maxsize: + total_num_samples_str = "infinite" + else: + total_num_samples_str = str(total_num_samples) + + print( + tabulate( + [ + ["Search algorithm", searcher_str], + ["Scheduler", scheduler_str], + ["Number of trials", total_num_samples_str], + ], + headers=["Configuration for experiment", experiment_name], + tablefmt=AIR_TABULATE_TABLEFMT, + ) + ) + super().experiment_started( + experiment_name=experiment_name, + experiment_path=experiment_path, + searcher_str=searcher_str, + scheduler_str=scheduler_str, + total_num_samples=total_num_samples, + tensorboard_path=tensorboard_path, + **kwargs, + ) + + def _print_heartbeat(self, trials, *sys_args, force: bool = False): + if self._verbosity < self._heartbeat_threshold and 
not force: return - heartbeat_strs, table_data = self._get_heartbeat(trials, *sys_args) + heartbeat_strs, table_data = self._get_heartbeat( + trials, *sys_args, force_full_output=force + ) + for s in heartbeat_strs: print(s) # now print the table using Tabulate - all_infos = [] + more_infos = [] + all_data = [] header = table_data.header - table_data_list = table_data.data - for table in table_data_list: - all_infos.extend(table.trial_infos) - if table.more_info: - all_infos.append(table.more_info) + for sub_table in table_data.data: + all_data.extend(sub_table.trial_infos) + if sub_table.more_info: + more_infos.append(sub_table.more_info) + print( tabulate( - all_infos, + all_data, headers=header, - tablefmt="simple", + tablefmt=AIR_TABULATE_TABLEFMT, showindex=False, ) ) + if more_infos: + print(", ".join(more_infos)) print() @@ -554,7 +807,7 @@ def _render_layout(self, heartbeat_strs: List[str], table_data: _TrialTableData) self._live.update(table) - def _print_heartbeat(self, trials, *args): + def _print_heartbeat(self, trials, *args, force: bool = False): if not rich: return if not self._live: @@ -563,7 +816,9 @@ def _print_heartbeat(self, trials, *args): "be called without `with_live` context manager." ) return - heartbeat_strs, table_data = self._get_heartbeat(trials, *args) + heartbeat_strs, table_data = self._get_heartbeat( + trials, *args, force_full_output=force + ) self._render_layout(heartbeat_strs, table_data) @@ -571,7 +826,7 @@ class TrainReporter(ProgressReporter): # the minimal verbosity threshold at which heartbeat starts getting printed. _heartbeat_threshold = AirVerbosity.VERBOSE - def _get_heartbeat(self, trials: List[Trial]): + def _get_heartbeat(self, trials: List[Trial], force_full_output: bool = False): # Training on iteration 1. 
Current time: 2023-03-22 15:29:25 (running for 00:00:03.24) # noqa if len(trials) == 0: return @@ -588,12 +843,13 @@ def _get_heartbeat(self, trials: List[Trial]): [f"Training on iteration {iter_num}.", self._time_heartbeat_str] ) - def _print_heartbeat(self, trials, *args): - print(self._get_heartbeat(trials)) + def _print_heartbeat(self, trials, *args, force: bool = False): + print(self._get_heartbeat(trials, force_full_output=force)) # These keys are blacklisted for printing out training/tuning intermediate/final result! BLACKLISTED_KEYS = { + "config", "date", "done", "hostname", @@ -611,9 +867,10 @@ def _print_heartbeat(self, trials, *args): class AirResultCallbackWrapper(Callback): # This is only to bypass the issue that by the time default callbacks # are added, there is no information on `num_samples` yet. - def __init__(self, verbosity): + def __init__(self, verbosity: AirVerbosity, metrics: Collection[str] = ()): self._verbosity = verbosity self._callback = None + self._metrics = metrics def setup( self, @@ -623,9 +880,9 @@ def setup( **info, ): self._callback = ( - TuneResultProgressCallback(self._verbosity) + TuneResultProgressCallback(self._verbosity, metrics=self._metrics) if total_num_samples > 1 - else TrainResultProgressCallback(self._verbosity) + else TrainResultProgressCallback(self._verbosity, metrics=self._metrics) ) # everything ELSE is just passing through.. 
@@ -647,15 +904,33 @@ class AirResultProgressCallback(Callback): _intermediate_result_verbosity = None _addressing_tmpl = None - def __init__(self, verbosity): + def __init__(self, verbosity: AirVerbosity, metrics: Collection[str] = ()): self._verbosity = verbosity self._start_time = time.time() - - def _print_result(self, trial, result=None): - print(pretty_print(result or trial.last_result, BLACKLISTED_KEYS)) + self._trial_last_printed_results = {} + self._metrics = metrics + + def _print_result(self, trial, result: Optional[Dict] = None, force: bool = False): + """Only print result if a different result has been reported, or force=True""" + result = result or trial.last_result + + last_result_iter = self._trial_last_printed_results.get(trial.trial_id, -1) + this_iter = result.get(TRAINING_ITERATION, 0) + + if this_iter != last_result_iter or force: + _print_dict_as_table( + result, + header=f"{self._addressing_tmpl.format(trial)} result", + include=self._metrics, + exclude=BLACKLISTED_KEYS, + division=AUTO_RESULT_KEYS, + ) + self._trial_last_printed_results[trial.trial_id] = this_iter def _print_config(self, trial): - print(pretty_print(trial.config)) + _print_dict_as_table( + trial.config, header=f"{self._addressing_tmpl.format(trial)} config" + ) def on_trial_result( self, @@ -667,15 +942,11 @@ def on_trial_result( ): if self._verbosity < self._intermediate_result_verbosity: return - curr_time, running_time = _get_time_str(self._start_time, time.time()) + curr_time_str, running_time_str = _get_time_str(self._start_time, time.time()) print( - " ".join( - [ - self._addressing_tmpl.format(trial), - f"finished iter {result[TRAINING_ITERATION]} " - f"at {curr_time} (running for {running_time})", - ] - ) + f"{self._addressing_tmpl.format(trial)} " + f"finished iteration {result[TRAINING_ITERATION]} " + f"at {curr_time_str}. 
Total running time: " + running_time_str ) self._print_result(trial, result) @@ -684,18 +955,14 @@ def on_trial_complete( ): if self._verbosity < self._start_end_verbosity: return - curr_time, running_time = _get_time_str(self._start_time, time.time()) + curr_time_str, running_time_str = _get_time_str(self._start_time, time.time()) finished_iter = 0 if trial.last_result and TRAINING_ITERATION in trial.last_result: finished_iter = trial.last_result[TRAINING_ITERATION] print( - " ".join( - [ - self._addressing_tmpl.format(trial), - f"({finished_iter} iters) " - f"finished at {curr_time} (running for {running_time})", - ] - ) + f"{self._addressing_tmpl.format(trial)} " + f"completed training after {finished_iter} iterations " + f"at {curr_time_str}. Total running time: " + running_time_str ) self._print_result(trial) @@ -714,29 +981,26 @@ def on_checkpoint( if trial.last_result and TRAINING_ITERATION in trial.last_result: saved_iter = trial.last_result[TRAINING_ITERATION] print( - " ".join( - [ - self._addressing_tmpl.format(trial), - f"saved checkpoint for iter {saved_iter}" - f" at {checkpoint.dir_or_data}", - ] - ) + f"{self._addressing_tmpl.format(trial)} " + f"saved a checkpoint for iteration {saved_iter} " + f"at: {checkpoint.dir_or_data}" ) - print() def on_trial_start(self, iteration: int, trials: List[Trial], trial: Trial, **info): if self._verbosity < self._start_end_verbosity: return has_config = bool(trial.config) - print( - " ".join( - [ - self._addressing_tmpl.format(trial), - "started with configuration:" if has_config else "started.", - ] + + if has_config: + print( + f"{self._addressing_tmpl.format(trial)} " f"started with configuration:" + ) + self._print_config(trial) + else: + print( + f"{self._addressing_tmpl.format(trial)} " + f"started without custom configuration." 
) - ) - self._print_config(trial) class TuneResultProgressCallback(AirResultProgressCallback): diff --git a/python/ray/tune/impl/tuner_internal.py b/python/ray/tune/impl/tuner_internal.py index 4730c22dc103..33134812bacd 100644 --- a/python/ray/tune/impl/tuner_internal.py +++ b/python/ray/tune/impl/tuner_internal.py @@ -7,7 +7,17 @@ import shutil import tempfile from pathlib import Path -from typing import Any, Callable, Dict, Optional, Type, Union, TYPE_CHECKING, Tuple +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Type, + Union, + TYPE_CHECKING, + Tuple, +) import ray import ray.cloudpickle as pickle @@ -23,13 +33,13 @@ from ray.tune.trainable import Trainable from ray.tune.tune import run from ray.tune.tune_config import TuneConfig +from ray.tune.utils import flatten_dict if TYPE_CHECKING: from ray.train.trainer import BaseTrainer from ray.util.queue import Queue -_TRAINABLE_PKL = "trainable.pkl" _TUNER_PKL = "tuner.pkl" _TRAINABLE_KEY = "_trainable" _CONVERTED_TRAINABLE_KEY = "_converted_trainable" @@ -80,6 +90,7 @@ def __init__( tune_config: Optional[TuneConfig] = None, run_config: Optional[RunConfig] = None, _tuner_kwargs: Optional[Dict] = None, + _trainer_api: bool = False, ): from ray.train.trainer import BaseTrainer @@ -90,29 +101,17 @@ def __init__( param_space=param_space, ) - self.trainable = trainable - param_space = param_space or {} - if isinstance(param_space, _Config): - param_space = param_space.to_dict() - if not isinstance(param_space, dict): - raise ValueError( - "The `param_space` passed to the Tuner` must be a dict. " - f"Got '{type(param_space)}' instead." - ) - self.param_space = param_space - self._tune_config = tune_config or TuneConfig() self._run_config = run_config or RunConfig() - - self._missing_params_error_message = None + self._trainer_api = _trainer_api # Restore from Tuner checkpoint. 
if restore_path: self._restore_from_path_or_uri( path_or_uri=restore_path, - resume_config=resume_config, - overwrite_trainable=trainable, + trainable=trainable, overwrite_param_space=param_space, + resume_config=resume_config, ) return @@ -120,14 +119,18 @@ def __init__( if not trainable: raise TuneError("You need to provide a trainable to tune.") - self._is_restored = False - self._resume_config = None + self.trainable = trainable + assert self.converted_trainable + self._validate_trainable(self.converted_trainable) + + self.param_space = param_space + self._resume_config = None + self._is_restored = False self._tuner_kwargs = copy.deepcopy(_tuner_kwargs) or {} self._experiment_checkpoint_dir = self.setup_create_experiment_checkpoint_dir( self.converted_trainable, self._run_config ) - self._experiment_analysis = None # This needs to happen before `tune.run()` is kicked in. @@ -138,22 +141,7 @@ def __init__( # to restore from. experiment_checkpoint_path = Path(self._experiment_checkpoint_dir) with open(experiment_checkpoint_path / _TUNER_PKL, "wb") as fp: - pickle.dump(self, fp) - - try: - with open(experiment_checkpoint_path / _TRAINABLE_PKL, "wb") as fp: - pickle.dump(self.trainable, fp) - except TypeError as e: - sio = io.StringIO() - inspect_serializability(self.trainable, print_file=sio) - msg = ( - "The provided trainable is not serializable, which is a requirement " - "since the trainable is serialized and deserialized when transferred " - "to remote workers. 
See below for a trace of the non-serializable " - "objects that were found in your trainable:\n" - f"{sio.getvalue()}" - ) - raise TypeError(msg) from e + pickle.dump(self.__getstate__(), fp) self._maybe_warn_resource_contention() @@ -225,129 +213,211 @@ def _maybe_warn_resource_contention(self): stacklevel=4, ) - def _validate_overwrite_trainable( - self, - original_trainable: TrainableTypeOrTrainer, - overwrite_trainable: Optional[TrainableTypeOrTrainer], + def _validate_trainable( + self, trainable: TrainableType, required_trainable_name: Optional[str] = None ): - """Determines whether the new `overwrite_trainable` is compatible - with the restored experiment with some basic sanity checks - (ensuring same type and name as the original trainable). - """ + """Determines whether or not the trainable is valid. - # TODO(ml-team): Remove (https://github.com/ray-project/ray/issues/33546) - # Check if the trainable was wrapped with `tune.with_parameters`, - # Set the Tuner to fail on fit if the trainable is not re-specified. - trainable_wrapped_params = getattr( - original_trainable, "_attached_param_names", None - ) - if trainable_wrapped_params and not overwrite_trainable: - self._missing_params_error_message = ( - "The original trainable cannot be used to resume training, since " - "`tune.with_parameters` attached references to objects " - "in the Ray object store that may not exist anymore. " - "You must re-supply the trainable with the same parameters " - f"{trainable_wrapped_params} attached:\n\n" - "from ray import tune\n\n" - "# Reconstruct the trainable with the same parameters\n" - "trainable_with_params = tune.with_parameters(trainable, ...)\n" - "tuner = tune.Tuner.restore(\n" - " ..., trainable=trainable_with_params\n" - ")\n\nSee https://docs.ray.io/en/latest/tune/api/doc/" - "ray.tune.with_parameters.html for more details." 
+ This includes checks on the serializability of the trainable, as well + asserting that the trainable name is as expected on restoration. + + This trainable name validation is needed due to an implementation detail + where the trainable name (which is differently generated depending on + the trainable type) is saved in the Trial metadata and needs to match + upon restoration. This does not affect the typical path, since `Tuner.restore` + expects the exact same trainable (which will have the same name). + + Raises: + ValueError: if the trainable name does not match or if the trainable + is not serializable. + """ + try: + pickle.dumps(trainable) + except TypeError as e: + sio = io.StringIO() + inspect_serializability(trainable, print_file=sio) + msg = ( + "The provided trainable is not serializable, which is a requirement " + "since the trainable is serialized and deserialized when transferred " + "to remote workers. See below for a trace of the non-serializable " + "objects that were found in your trainable:\n" + f"{sio.getvalue()}" ) - if not overwrite_trainable: + raise TypeError(msg) from e + + if not required_trainable_name: return - error_message = ( - "Invalid trainable input. To avoid errors, pass in the same trainable " - "that was used to initialize the Tuner." - ) + trainable_name = Experiment.get_trainable_name(trainable) - if type(original_trainable) != type(overwrite_trainable): + if trainable_name != required_trainable_name: raise ValueError( - f"{error_message}\n" - f"Got new trainable of type {type(overwrite_trainable)} " - f"but expected {type(original_trainable)}." + "Invalid `trainable` input to `Tuner.restore()`. To fix this error, " + "pass in the same trainable that was used to initialize the Tuner. " + "Got a trainable with identifier " + f"'{trainable_name}' but expected '{required_trainable_name}'." 
) - from ray.train.trainer import BaseTrainer + def _set_trainable_on_restore( + self, trainable: TrainableType, old_trainable_name: Optional[str] + ): + from ray.train.base_trainer import BaseTrainer - if isinstance(overwrite_trainable, BaseTrainer): - if overwrite_trainable.run_config != original_trainable.run_config: - warnings.warn( - "Overwriting the AIR Trainer with a new `RunConfig` is not " - "supported - the restored experiment will continue with the old " - "config. To avoid this warning, revert changes made to `RunConfig`." - ) - overwrite_trainable.run_config = original_trainable.run_config - else: - original_name = Experiment.get_trainable_name(original_trainable) - overwrite_name = Experiment.get_trainable_name(overwrite_trainable) - if original_name != overwrite_name: - raise ValueError( - f"{error_message}\nGot new trainable with identifier " - f"{overwrite_name} but expected {original_name}." + self.trainable = trainable + assert self.converted_trainable + self._validate_trainable( + trainable=self.converted_trainable, + required_trainable_name=old_trainable_name, + ) + + if isinstance(self.trainable, BaseTrainer): + # Log a warning in case the user tries to modify the + # `RunConfig` from the Trainer + trainer: BaseTrainer = self.trainable + + # Only log if the Trainer has a non-default RunConfig + if trainer.run_config != RunConfig(): + logger.warning( + "The Tune experiment will restore using the original run's " + "`RunConfig`. If you made any changes to the `RunConfig` " + "within the Trainer you passed into `Tuner.restore`, " + "they will be ignored in the resumed run." 
) - def _restore_from_path_or_uri( + trainer.run_config = self._run_config + + def _validate_param_space_on_restore( self, - path_or_uri: str, - resume_config: Optional[_ResumeConfig], - overwrite_trainable: Optional[TrainableTypeOrTrainer], - overwrite_param_space: Optional[Dict[str, Any]], + new_param_space: Dict[str, Any], + flattened_param_space_keys: Optional[List[str]], ): - # Sync down from cloud storage if needed - synced, experiment_checkpoint_dir = self._maybe_sync_down_tuner_state( - path_or_uri - ) - experiment_checkpoint_path = Path(experiment_checkpoint_dir) + """Determines whether the (optionally) re-specified `param_space` is valid. - if ( - not (experiment_checkpoint_path / _TRAINABLE_PKL).exists() - or not (experiment_checkpoint_path / _TUNER_PKL).exists() - ): + This method performs very loose validation on the new param_space to + prevent users from trying to specify new hyperparameters to tune over. + + Raises: + ValueError: if not all keys match the original param_space. + """ + if flattened_param_space_keys is None: + # Backwards compatibility: skip validation + return + + keys = sorted(flatten_dict(new_param_space).keys()) + if keys != flattened_param_space_keys: + raise ValueError( + "Invalid `param_space` input to `Tuner.restore()`. To fix this error, " + "pass in the same `param_space` that was used to initialize the Tuner. " + "Only re-specify the `param_space` to refresh Ray object references " + "that no longer exist due to restoring from a new Ray cluster session. " + "It should not be used to introduce new hyperparameters to tune." 
+ f"\n\nGot: {keys}\nExpected: {flattened_param_space_keys}" + ) + + def _set_param_space_on_restore( + self, + param_space: Optional[Dict[str, Any]], + flattened_param_space_keys: Optional[List[str]], + ): + self.param_space = param_space + + if self.param_space is not None: + # param_space = None -> use the original param_space + self._validate_param_space_on_restore( + new_param_space=self.param_space, + flattened_param_space_keys=flattened_param_space_keys, + ) + + def _load_tuner_state( + self, tuner_pkl_path: Path + ) -> Tuple[Optional[str], Optional[List[str]]]: + """Loads Tuner state from the previously saved `tuner.pkl`. + + Args: + tuner_pkl_path: pathlib.Path of the `tuner.pkl` file saved during the + original Tuner initialization. + + Returns: + tuple: of `(old_trainable_name, flattened_param_space_keys)` used for + validating the re-specified `trainable` and `param_space`. + """ + if not tuner_pkl_path.exists(): raise RuntimeError( f"Could not find Tuner state in restore directory. Did you pass" - f"the correct path (including experiment directory?) Got: " - f"{path_or_uri}" + f"the correct path (the top-level experiment directory?) Got: " + f"{tuner_pkl_path.parent}" ) - # Load trainable and tuner state - with open(experiment_checkpoint_path / _TRAINABLE_PKL, "rb") as fp: - trainable = pickle.load(fp) + with open(tuner_pkl_path, "rb") as fp: + tuner_state = pickle.load(fp) - with open(experiment_checkpoint_path / _TUNER_PKL, "rb") as fp: - tuner = pickle.load(fp) - self.__dict__.update(tuner.__dict__) + if isinstance(tuner_state, TunerInternal): + # TODO(ml-team): Remove in 2.7. + # Backwards compatibility: ray<=2.4 pickles the full Tuner object + # within `tuner.pkl`. ray>=2.5 pickles the object state as a dict. 
+ tuner: TunerInternal = tuner_state + self.__setstate__(tuner.__getstate__()) - self._validate_overwrite_trainable(trainable, overwrite_trainable) - if overwrite_trainable: - trainable = overwrite_trainable + logger.warning( + "You are restoring a Tune experiment that was run with an older " + "version of Ray. Note that backwards compatibility of restoring " + "this experiment will only be guaranteed until Ray 2.7." + ) - self._is_restored = True - self.trainable = trainable - if overwrite_param_space: - self.param_space = overwrite_param_space - self._resume_config = resume_config + old_trainable_name, flattened_param_space_keys = None, None + else: + # NOTE: These are magic keys used for validating restore args. + old_trainable_name = tuner_state.pop("__trainable_name", None) + flattened_param_space_keys = tuner_state.pop( + "__flattened_param_space_keys", None + ) - if not synced: - # If we didn't sync, use the restore_path local dir - self._experiment_checkpoint_dir = os.path.abspath( - os.path.expanduser(path_or_uri) - ) + self.__setstate__(tuner_state) - # Update local_dir to use the parent of the experiment path - # provided to `Tuner.restore` - experiment_path = Path(self._experiment_checkpoint_dir) - self._run_config.storage_path = str(experiment_path.parent) - self._run_config.name = experiment_path.name - else: - # Set the experiment `name` and `storage_path` according to the URI - uri = URI(path_or_uri) - self._run_config.name = uri.name - self._run_config.storage_path = str(uri.parent) + return old_trainable_name, flattened_param_space_keys + def _restore_from_path_or_uri( + self, + path_or_uri: str, + trainable: TrainableTypeOrTrainer, + overwrite_param_space: Optional[Dict[str, Any]], + resume_config: _ResumeConfig, + ): + # Sync down from cloud storage if needed + ( + restoring_from_cloud, + local_experiment_checkpoint_dir, + ) = self._maybe_sync_down_tuner_state(path_or_uri) + experiment_checkpoint_path = Path(local_experiment_checkpoint_dir) + + 
old_trainable_name, flattened_param_space_keys = self._load_tuner_state( + experiment_checkpoint_path / _TUNER_PKL + ) + + # Perform validation and set the re-specified `trainable` and `param_space` + self._set_trainable_on_restore( + trainable=trainable, old_trainable_name=old_trainable_name + ) + self._set_param_space_on_restore( + param_space=overwrite_param_space, + flattened_param_space_keys=flattened_param_space_keys, + ) + + # Update RunConfig to reflect changes in the experiment directory + path_or_uri_obj: Union[Path, URI] = ( + URI(path_or_uri) if restoring_from_cloud else experiment_checkpoint_path + ) + # Infer the `storage_path` and run `name` of the restored run using the + # experiment directory. + # Ex: ~/ray_results/exp_name -> ~/ray_results, exp_name + # Ex: s3://bucket/exp_name -> s3://bucket, exp_name + self._run_config.name = path_or_uri_obj.name + self._run_config.storage_path = str(path_or_uri_obj.parent) + + # Set the experiment directory + if not restoring_from_cloud: + self._experiment_checkpoint_dir = local_experiment_checkpoint_dir + else: # If we synced, `experiment_checkpoint_dir` will contain a temporary # directory. Create an experiment checkpoint dir instead and move # our data there. @@ -361,15 +431,19 @@ def _restore_from_path_or_uri( shutil.rmtree(experiment_checkpoint_path) self._experiment_checkpoint_dir = str(new_exp_path) + # Load the experiment results at the point where it left off. try: self._experiment_analysis = ExperimentAnalysis( - self._experiment_checkpoint_dir, + experiment_checkpoint_path=path_or_uri, default_metric=self._tune_config.metric, default_mode=self._tune_config.mode, ) except Exception: self._experiment_analysis = None + self._resume_config = resume_config + self._is_restored = True + def _maybe_sync_down_tuner_state(self, restore_path: str) -> Tuple[bool, str]: """Sync down trainable state from remote storage. 
@@ -377,14 +451,11 @@ def _maybe_sync_down_tuner_state(self, restore_path: str) -> Tuple[bool, str]: Tuple of (downloaded from remote, local_dir) """ if not is_non_local_path_uri(restore_path): - return False, os.path.expanduser(restore_path) + return False, os.path.abspath(os.path.expanduser(restore_path)) tempdir = Path(tempfile.mkdtemp("tmp_experiment_dir")) restore_uri = URI(restore_path) - download_from_uri( - str(restore_uri / _TRAINABLE_PKL), str(tempdir / _TRAINABLE_PKL) - ) download_from_uri(str(restore_uri / _TUNER_PKL), str(tempdir / _TUNER_PKL)) return True, str(tempdir) @@ -476,13 +547,26 @@ def trainable(self, trainable: TrainableTypeOrTrainer): self._converted_trainable = self._convert_trainable(trainable) @property - def param_space(self) -> Dict[str, Any]: + def param_space(self) -> Optional[Dict[str, Any]]: return self._param_space @param_space.setter - def param_space(self, param_space: Dict[str, Any]): + def param_space(self, param_space: Optional[Dict[str, Any]]): + # Handle any configs that adhere to the `to_dict` interface. + # Ex: AlgorithmConfig from RLlib + if isinstance(param_space, _Config): + param_space = param_space.to_dict() + + if not isinstance(param_space, dict) and param_space is not None: + raise ValueError( + "The `param_space` passed to the `Tuner` must be a dict. " + f"Got '{type(param_space)}' instead." 
+ ) + self._param_space = param_space - self._process_scaling_config() + + if param_space: + self._process_scaling_config() def _convert_trainable(self, trainable: TrainableTypeOrTrainer) -> TrainableType: """Converts an AIR Trainer to a Tune trainable and saves the converted @@ -587,6 +671,12 @@ def _get_tune_run_arguments(self, trainable: TrainableType) -> Dict[str, Any]: ), checkpoint_freq=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, + checkpoint_keep_all_ranks=( + self._run_config.checkpoint_config._checkpoint_keep_all_ranks + ), + checkpoint_upload_from_workers=( + self._run_config.checkpoint_config._checkpoint_upload_from_workers + ), _experiment_checkpoint_dir=self._experiment_checkpoint_dir, raise_on_failed_trial=False, fail_fast=(self._run_config.failure_config.fail_fast), @@ -599,6 +689,7 @@ def _get_tune_run_arguments(self, trainable: TrainableType) -> Dict[str, Any]: trial_dirname_creator=self._tune_config.trial_dirname_creator, chdir_to_trial_dir=self._tune_config.chdir_to_trial_dir, _tuner_api=True, + _trainer_api=self._trainer_api, ) def _fit_internal( @@ -628,9 +719,6 @@ def _fit_resume( self, trainable: TrainableType, param_space: Optional[Dict[str, Any]] ) -> ExperimentAnalysis: """Fitting for a restored Tuner.""" - if self._missing_params_error_message: - raise ValueError(self._missing_params_error_message) - resume = "AUTO" if self._resume_config: @@ -665,10 +753,24 @@ def __getstate__(self): state["_tuner_kwargs"] = state["_tuner_kwargs"].copy() state["_tuner_kwargs"].pop("_remote_string_queue", None) state.pop(_TRAINABLE_KEY, None) - state.pop(_CONVERTED_TRAINABLE_KEY, None) - state.pop(_PARAM_SPACE_KEY, None) + trainable = state.pop(_CONVERTED_TRAINABLE_KEY, None) + param_space = state.pop(_PARAM_SPACE_KEY, None) state.pop(_EXPERIMENT_ANALYSIS_KEY, None) + + state["__trainable_name"] = ( + Experiment.get_trainable_name(trainable) if trainable else None + ) + state["__flattened_param_space_keys"] = ( + 
sorted(flatten_dict(param_space).keys()) + if param_space is not None + else None + ) + return state def __setstate__(self, state): + # Make sure the magic metadata gets removed first. + state.pop("__flattened_param_space_keys", None) + state.pop("__trainable_name", None) + self.__dict__.update(state) diff --git a/python/ray/tune/integration/comet.py b/python/ray/tune/integration/comet.py index 4e61bdd7af28..1a741968a4f9 100644 --- a/python/ray/tune/integration/comet.py +++ b/python/ray/tune/integration/comet.py @@ -24,5 +24,5 @@ def __init__( save_checkpoints: bool = False, **experiment_kwargs ): - logging.warning(callback_deprecation_message) - super().__init__(online, tags, save_checkpoints, **experiment_kwargs) + # TODO(ml-team): Remove in 2.6. + raise DeprecationWarning(callback_deprecation_message) diff --git a/python/ray/tune/logger/csv.py b/python/ray/tune/logger/csv.py index f8509990b3a7..b9357c7dd872 100644 --- a/python/ray/tune/logger/csv.py +++ b/python/ray/tune/logger/csv.py @@ -4,10 +4,10 @@ from typing import TYPE_CHECKING, Dict, TextIO -from ray.tune.logger.logger import Logger, LoggerCallback +from ray.tune.logger.logger import _LOGGER_DEPRECATION_WARNING, Logger, LoggerCallback from ray.tune.result import EXPR_PROGRESS_FILE from ray.tune.utils import flatten_dict -from ray.util.annotations import PublicAPI +from ray.util.annotations import Deprecated, PublicAPI if TYPE_CHECKING: from ray.tune.experiment.trial import Trial # noqa: F401 @@ -15,6 +15,12 @@ logger = logging.getLogger(__name__) +@Deprecated( + message=_LOGGER_DEPRECATION_WARNING.format( + old="CSVLogger", new="ray.tune.csv.CSVLoggerCallback" + ), + warning=True, +) @PublicAPI class CSVLogger(Logger): """Logs results to progress.csv under the trial directory. 
diff --git a/python/ray/tune/logger/json.py b/python/ray/tune/logger/json.py index ef59a455a3ba..efd04d431cd1 100644 --- a/python/ray/tune/logger/json.py +++ b/python/ray/tune/logger/json.py @@ -7,14 +7,14 @@ import ray.cloudpickle as cloudpickle -from ray.tune.logger.logger import Logger, LoggerCallback +from ray.tune.logger.logger import _LOGGER_DEPRECATION_WARNING, Logger, LoggerCallback from ray.tune.utils.util import SafeFallbackEncoder from ray.tune.result import ( EXPR_PARAM_FILE, EXPR_PARAM_PICKLE_FILE, EXPR_RESULT_FILE, ) -from ray.util.annotations import PublicAPI +from ray.util.annotations import Deprecated, PublicAPI if TYPE_CHECKING: from ray.tune.experiment.trial import Trial # noqa: F401 @@ -25,6 +25,12 @@ VALID_SUMMARY_TYPES = [int, float, np.float32, np.float64, np.int32, np.int64] +@Deprecated( + message=_LOGGER_DEPRECATION_WARNING.format( + old="JsonLogger", new="ray.tune.json.JsonLoggerCallback" + ), + warning=True, +) @PublicAPI class JsonLogger(Logger): """Logs trial results in json format. diff --git a/python/ray/tune/logger/logger.py b/python/ray/tune/logger/logger.py index 64dbfe7d909a..7540d3f02d24 100644 --- a/python/ray/tune/logger/logger.py +++ b/python/ray/tune/logger/logger.py @@ -7,7 +7,7 @@ import yaml from ray.air._internal.json import SafeFallbackEncoder from ray.tune.callback import Callback -from ray.util.annotations import PublicAPI, DeveloperAPI +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI if TYPE_CHECKING: from ray.tune.experiment.trial import Trial # noqa: F401 @@ -18,7 +18,17 @@ # Apply flow style for sequences of this length _SEQUENCE_LEN_FLOW_STYLE = 3 +_LOGGER_DEPRECATION_WARNING = ( + "The `{old} interface is deprecated in favor of the " + "`{new}` interface and will be removed in Ray 2.7." +) + +@Deprecated( + message=_LOGGER_DEPRECATION_WARNING.format( + old="Logger", new="ray.tune.logger.LoggerCallback" + ), +) @DeveloperAPI class Logger(abc.ABC): """Logging interface for ray.tune. 
diff --git a/python/ray/tune/logger/noop.py b/python/ray/tune/logger/noop.py index 00a3b8f28fbd..a9bae96b7cd7 100644 --- a/python/ray/tune/logger/noop.py +++ b/python/ray/tune/logger/noop.py @@ -1,7 +1,8 @@ from ray.tune.logger.logger import Logger -from ray.util.annotations import PublicAPI +from ray.util.annotations import Deprecated, PublicAPI +@Deprecated(message="`NoopLogger` will be removed in Ray 2.7.") @PublicAPI class NoopLogger(Logger): def on_result(self, result): diff --git a/python/ray/tune/logger/tensorboardx.py b/python/ray/tune/logger/tensorboardx.py index 9e083319c630..e4e3e25e8872 100644 --- a/python/ray/tune/logger/tensorboardx.py +++ b/python/ray/tune/logger/tensorboardx.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Dict -from ray.tune.logger.logger import Logger, LoggerCallback +from ray.tune.logger.logger import _LOGGER_DEPRECATION_WARNING, Logger, LoggerCallback from ray.util.debug import log_once from ray.tune.result import ( TRAINING_ITERATION, @@ -11,7 +11,7 @@ TIMESTEPS_TOTAL, ) from ray.tune.utils import flatten_dict -from ray.util.annotations import PublicAPI +from ray.util.annotations import Deprecated, PublicAPI if TYPE_CHECKING: from ray.tune.experiment.trial import Trial # noqa: F401 @@ -21,6 +21,12 @@ VALID_SUMMARY_TYPES = [int, float, np.float32, np.float64, np.int32, np.int64] +@Deprecated( + message=_LOGGER_DEPRECATION_WARNING.format( + old="TBXLogger", new="ray.tune.tensorboardx.TBXLoggerCallback" + ), + warning=True, +) @PublicAPI class TBXLogger(Logger): """TensorBoardX Logger. 
diff --git a/python/ray/tune/logger/unified.py b/python/ray/tune/logger/unified.py index 73aaf00ce081..ede689829c40 100644 --- a/python/ray/tune/logger/unified.py +++ b/python/ray/tune/logger/unified.py @@ -4,7 +4,8 @@ from ray.tune.logger import DEFAULT_LOGGERS from ray.tune.logger.json import JsonLogger from ray.tune.logger.logger import Logger -from ray.util import log_once, PublicAPI +from ray.util import log_once +from ray.util.annotations import Deprecated, PublicAPI logger = logging.getLogger(__name__) @@ -13,6 +14,7 @@ from ray.tune.experiment.trial import Trial # noqa: F401 +@Deprecated(message="`UnifiedLogger` will be removed in Ray 2.7.", warning=True) @PublicAPI class UnifiedLogger(Logger): """Unified result logger for TensorBoard, rllab/viskit, plain json. diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py index 9bc46281fe9f..d829793ce0b7 100644 --- a/python/ray/tune/progress_reporter.py +++ b/python/ray/tune/progress_reporter.py @@ -9,13 +9,14 @@ import textwrap import time import warnings -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd import ray from ray._private.dict import flatten_dict +from ray._private.thirdparty.tabulate.tabulate import tabulate from ray.experimental.tqdm_ray import safe_print from ray.air.util.node import _force_on_current_node from ray.tune.callback import Callback @@ -49,14 +50,6 @@ except ImportError: from collections import Mapping, MutableMapping -try: - from tabulate import tabulate -except ImportError: - raise ImportError( - "ray.tune in ray > 0.7.5 requires 'tabulate'. " - "Please re-run 'pip install ray[tune]' or " - "'pip install ray[rllib]'." 
- ) IS_NOTEBOOK = ray.widgets.util.in_notebook() @@ -1524,7 +1517,7 @@ def _detect_reporter(**kwargs) -> TuneReporterBase: def _detect_progress_metrics( trainable: Optional[Union["Trainable", Callable]] -) -> Optional[List[str]]: +) -> Optional[Collection[str]]: """Detect progress metrics to report.""" if not trainable: return None diff --git a/python/ray/tune/requirements-dev.txt b/python/ray/tune/requirements-dev.txt index 1122f99be905..e4432a5471c6 100644 --- a/python/ray/tune/requirements-dev.txt +++ b/python/ray/tune/requirements-dev.txt @@ -4,7 +4,6 @@ gym>=0.21.0,<0.24.0 scikit-image pandas requests -tabulate tensorflow black==22.10.0 yq diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 098035627450..f1e495ef7c45 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,7 +1,7 @@ +from functools import partial import os -from typing import Optional, Union - import pandas as pd +from typing import Optional, Union from ray.air.result import Result from ray.cloudpickle import cloudpickle @@ -9,6 +9,7 @@ from ray.tune.analysis import ExperimentAnalysis from ray.tune.error import TuneError from ray.tune.experiment import Trial +from ray.tune.trainable.util import TrainableUtil from ray.util import PublicAPI @@ -243,9 +244,23 @@ def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError] return None def _trial_to_result(self, trial: Trial) -> Result: - checkpoint = trial.checkpoint.to_air_checkpoint() + local_to_remote_path_fn = ( + partial( + TrainableUtil.get_remote_storage_path, + local_path_prefix=trial.local_path, + remote_path_prefix=trial.remote_path, + ) + if trial.uses_cloud_checkpointing + else None + ) + checkpoint = trial.checkpoint.to_air_checkpoint( + local_to_remote_path_fn, + ) best_checkpoints = [ - (checkpoint.to_air_checkpoint(), checkpoint.metrics) + ( + checkpoint.to_air_checkpoint(local_to_remote_path_fn), + checkpoint.metrics, + ) for checkpoint in 
trial.get_trial_checkpoints() ] diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 402259258863..ce8c9644812b 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -1,4 +1,5 @@ import abc +import urllib.parse from functools import partial import threading from typing import ( @@ -18,7 +19,18 @@ import time from dataclasses import dataclass +try: + import fsspec +except Exception: + fsspec = None + +try: + import s3fs +except Exception: + s3fs = None + import ray +from ray._private.thirdparty.tabulate.tabulate import tabulate from ray.air._internal.checkpoint_manager import CheckpointStorage, _TrackedCheckpoint from ray.air._internal.remote_storage import ( fs_hint, @@ -33,6 +45,7 @@ from ray.tune.callback import Callback from ray.tune.result import TRAINING_ITERATION, TIME_TOTAL_S from ray.tune.utils.file_transfer import sync_dir_between_nodes +from ray.util import log_once from ray.util.annotations import PublicAPI, DeveloperAPI from ray.widgets import Template @@ -145,14 +158,6 @@ def _repr_html_(self) -> str: Note that self.syncer is omitted here; seems to have some overlap with existing configuration settings here in the SyncConfig class. """ - try: - from tabulate import tabulate - except ImportError: - return ( - "Tabulate isn't installed. Run " - "`pip install tabulate` for rich notebook output." 
- ) - return Template("scrollableTable.html.j2").render( table=tabulate( { @@ -198,6 +203,31 @@ def validate_upload_dir(self, upload_dir: Optional[str] = None) -> bool: if not upload_dir and isinstance(self.syncer, Syncer): raise ValueError("Must specify an `upload_dir` to use a custom `syncer`.") + parsed = urllib.parse.urlparse(upload_dir) + # Todo: Only warn for pyarrow versions that are affected by + # https://github.com/apache/arrow/issues/32372#issuecomment-1421097792 + if ( + parsed.scheme + and not s3fs + and parsed.scheme.startswith("s3") + and log_once("fsspec_missing") + ): + logger.warning( + "You are using S3 for remote storage, but you don't have `s3fs` " + "installed. Due to a bug in PyArrow, this can lead to significant " + "slowdowns. To avoid this, install s3fs with " + "`pip install fsspec s3fs`." + ) + elif not fsspec and log_once("fsspec_missing"): + logger.warning( + "You are using remote storage, but you don't have `fsspec` " + "installed. This can lead to inefficient syncing behavior. " + "To avoid this, install fsspec with " + "`pip install fsspec`. Depending on your remote storage provider, " + "consider installing the respective fsspec-package " + "(see https://github.com/fsspec)." 
+ ) + if isinstance(self.syncer, Syncer): return self.syncer.validate_upload_dir(upload_dir or self.upload_dir) else: diff --git a/python/ray/tune/tests/conftest.py b/python/ray/tune/tests/conftest.py index ad1b6d49c3bd..fd6fd0563759 100644 --- a/python/ray/tune/tests/conftest.py +++ b/python/ray/tune/tests/conftest.py @@ -1,3 +1,31 @@ # Trigger pytest hook to automatically zip test cluster logs to archive dir on failure from ray.tests.conftest import pytest_runtest_makereport # noqa from ray.tests.conftest import propagate_logs # noqa + + +import logging +import boto3 +import pytest + +from ray.air._internal.uri_utils import URI +from ray._private.test_utils import simulate_storage + + +@pytest.fixture +def mock_s3_bucket_uri(): + port = 5002 + region = "us-west-2" + with simulate_storage("s3", port=port, region=region) as s3_uri: + s3 = boto3.client( + "s3", region_name=region, endpoint_url=f"http://localhost:{port}" + ) + # Bucket name will be autogenerated/unique per test + bucket_name = URI(s3_uri).name + s3.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={"LocationConstraint": region}, + ) + # Disable server HTTP request logging + logging.getLogger("werkzeug").setLevel(logging.WARNING) + yield s3_uri + logging.getLogger("werkzeug").setLevel(logging.INFO) diff --git a/python/ray/tune/tests/execution/test_controller_callback_integration.py b/python/ray/tune/tests/execution/test_controller_callback_integration.py new file mode 100644 index 000000000000..84befbaa1d8e --- /dev/null +++ b/python/ray/tune/tests/execution/test_controller_callback_integration.py @@ -0,0 +1,67 @@ +from typing import Dict, Optional + +import pytest +import sys + +import ray +from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager +from ray.tune import Callback +from ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + 
address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +class StatefulCallback(Callback): + CKPT_FILE_TMPL = "test-callback-state-{}.json" + + def __init__(self): + self.counter = 0 + + def on_trial_result(self, iteration, trials, trial, result, **info): + self.counter += 1 + + def get_state(self) -> Optional[Dict]: + return {"counter": self.counter} + + def set_state(self, state: Dict): + self.counter = state["counter"] + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_callback_save_restore( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir +): + """Check that callback state is restored correctly. + + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testCallbackSaveRestore + """ + runner = TuneController( + callbacks=[StatefulCallback()], + experiment_path=str(tmpdir), + ) + runner.add_trial(Trial("__fake", stub=True)) + for i in range(3): + runner._callbacks.on_trial_result( + iteration=i, trials=None, trial=None, result=None + ) + runner.checkpoint(force=True) + callback = StatefulCallback() + runner2 = TuneController( + callbacks=[callback], + experiment_path=str(tmpdir), + ) + assert callback.counter == 0 + runner2.resume() + assert callback.counter == 3 + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py new file mode 100644 index 000000000000..2e13f5d84574 --- /dev/null +++ b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py @@ -0,0 +1,303 @@ +import json +import os +import shutil + +import pytest +import sys + +import ray +from ray.air import CheckpointConfig +from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage +from ray.air.execution import FixedResourceManager, 
PlacementGroupResourceManager +from ray.tune import PlacementGroupFactory +from ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial +from ray.tune.result import TRAINING_ITERATION, DONE +from ray.tune.schedulers import FIFOScheduler +from ray.tune.search import BasicVariantGenerator +from ray.tune.trainable import TrainableUtil + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +def create_mock_components(): + class _MockScheduler(FIFOScheduler): + errored_trials = [] + + def on_trial_error(self, trial_runner, trial): + self.errored_trials += [trial] + + class _MockSearchAlg(BasicVariantGenerator): + errored_trials = [] + + def on_trial_complete(self, trial_id, error=False, **kwargs): + if error: + self.errored_trials += [trial_id] + + searchalg = _MockSearchAlg() + scheduler = _MockScheduler() + return searchalg, scheduler + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_checkpoint_save_restore( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir +): + """Test that a checkpoint is saved and can be used to restore a trainable. + + The trainable saves a checkpoint and terminates. We then start another trial + that should restore from the saved checkpoint and assert that it picks up + the state and continues to run to termination. 
+ + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testCheckpointing + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testRestoreMetricsAfterCheckpointing # noqa + """ + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + experiment_path=str(tmpdir), + ) + kwargs = { + "stopping_criterion": {"training_iteration": 1}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + } + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + runner.step() # Start trial + + while trials[0].status != Trial.RUNNING: + runner.step() + + # Set some state that will be saved in the checkpoint + assert ray.get(trials[0].runner.set_info.remote(1)) == 1 + + while trials[0].status != Trial.TERMINATED: + runner.step() + + assert trials[0].checkpoint.metrics[TRAINING_ITERATION] == 1 + assert trials[0].last_result[TRAINING_ITERATION] == 1 + assert trials[0].last_result["iterations_since_restore"] == 1 + + # Prepare new trial + kwargs["restore_path"] = trials[0].checkpoint.dir_or_data + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + assert trials[1].status == Trial.PENDING + + # Start trial, restore, run to termination + while trials[1].status != Trial.RUNNING: + runner.step() + + # Restore + runner.step() + + assert ray.get(trials[1].runner.get_info.remote()) == 1 + + # Run to termination + while trials[1].status != Trial.TERMINATED: + runner.step() + + assert trials[1].checkpoint.metrics[TRAINING_ITERATION] == 2 + assert trials[1].last_result[TRAINING_ITERATION] == 2 + assert trials[1].last_result["iterations_since_restore"] == 1 + assert trials[1].last_result["time_since_restore"] > 0 + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_checkpoint_at_end(ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir): + """Test that a 
checkpoint is saved at end for class trainables with that config. + + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testCheckpointingAtEnd + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testResultDone + """ + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + experiment_path=str(tmpdir), + ) + kwargs = { + "stopping_criterion": {"training_iteration": 2}, + "checkpoint_config": CheckpointConfig(checkpoint_at_end=True), + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + } + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + while not runner.is_finished(): + runner.step() + + assert trials[0].has_checkpoint() + assert trials[0].last_result[DONE] + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_pause_resume_trial( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir +): + """Test that trial that is paused and resumed picks up its last checkpoint. 
+ + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testPauseThenResume + """ + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + experiment_path=str(tmpdir), + ) + kwargs = { + "stopping_criterion": {"training_iteration": 2}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + } + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + while trials[0].status != Trial.RUNNING: + runner.step() + + assert ray.get(trials[0].runner.get_info.remote()) is None + assert ray.get(trials[0].runner.set_info.remote(1)) == 1 + + runner._schedule_trial_pause(trials[0], should_checkpoint=True) + + while trials[0].status != Trial.PAUSED: + runner.step() + + assert trials[0].has_checkpoint() + assert DONE not in trials[0].last_result + + # Start again + runner._set_trial_status(trials[0], Trial.PENDING) + + while trials[0].status != Trial.RUNNING: + runner.step() + + assert ray.get(trials[0].runner.get_info.remote()) == 1 + + while trials[0].status != Trial.TERMINATED: + runner.step() + + assert trials[0].checkpoint.metrics[TRAINING_ITERATION] == 2 + assert trials[0].last_result[TRAINING_ITERATION] == 2 + assert trials[0].last_result["iterations_since_restore"] == 1 + assert trials[0].last_result["time_since_restore"] > 0 + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_checkpoint_num_to_keep( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir +): + """Test that only num_to_keep checkpoints are kept. + + This should also hold true when the experiment is resumed. 
+ + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testPauseResumeCheckpointCount + """ + trial = Trial( + "__fake", + experiment_path=str(tmpdir), + checkpoint_config=CheckpointConfig(num_to_keep=2), + ) + trial.init_local_path() + trial.checkpoint_manager.set_delete_fn(lambda cp: shutil.rmtree(cp.dir_or_data)) + + def write_checkpoint(trial: Trial, index: int): + checkpoint_dir = TrainableUtil.make_checkpoint_dir( + trial.local_path, index=index + ) + result = {"training_iteration": index} + with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f: + json.dump(result, f) + + tune_cp = _TrackedCheckpoint( + dir_or_data=checkpoint_dir, + storage_mode=CheckpointStorage.PERSISTENT, + metrics=result, + ) + trial.saving_to = tune_cp + + return checkpoint_dir + + def get_checkpoint_dirs(trial: Trial): + return [d for d in os.listdir(trial.local_path) if d.startswith("checkpoint_")] + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + experiment_path=str(tmpdir), + ) + + runner.add_trial(trial) + + # Write 1 checkpoint + result = write_checkpoint(trial, 1) + runner._on_saving_result(trial, result) + + # Expect 1 checkpoint + cp_dirs = get_checkpoint_dirs(trial) + assert len(cp_dirs) == 1, f"Checkpoint dirs: {cp_dirs}" + + # Write second checkpoint + result = write_checkpoint(trial, 2) + runner._on_saving_result(trial, result) + + # Expect 2 checkpoints + cp_dirs = get_checkpoint_dirs(trial) + assert len(cp_dirs) == 2, f"Checkpoint dirs: {cp_dirs}" + + # Write third checkpoint + result = write_checkpoint(trial, 3) + runner._on_saving_result(trial, result) + + # Expect 2 checkpoints because num_to_keep = 2 + cp_dirs = get_checkpoint_dirs(trial) + assert len(cp_dirs) == 2, f"Checkpoint dirs: {cp_dirs}" + + # Re-instantiate trial runner and resume + runner.checkpoint(force=True) + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + experiment_path=str(tmpdir), + ) + runner.resume() + + trial = 
runner.get_trials()[0] + trial.checkpoint_manager.set_delete_fn(lambda cp: shutil.rmtree(cp.dir_or_data)) + + # Write fourth checkpoint + result = write_checkpoint(trial, 4) + runner._on_saving_result(trial, result) + + # Expect 2 checkpoints because num_to_keep = 2 + cp_dirs = get_checkpoint_dirs(trial) + assert len(cp_dirs) == 2, f"Checkpoint dirs: {cp_dirs}" + + # Write fifth checkpoint + result = write_checkpoint(trial, 5) + runner._on_saving_result(trial, result) + + # Expect 2 checkpoints because num_to_keep = 2 + cp_dirs = get_checkpoint_dirs(trial) + assert len(cp_dirs) == 2, f"Checkpoint dirs: {cp_dirs}" + + # Checkpoints before restore should be deleted + assert "checkpoint_000004" in cp_dirs + assert "checkpoint_000005" in cp_dirs + + assert "checkpoint_000002" not in cp_dirs + assert "checkpoint_000003" not in cp_dirs + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/test_controller_control_integration.py b/python/ray/tune/tests/execution/test_controller_control_integration.py new file mode 100644 index 000000000000..dc677ed94eb8 --- /dev/null +++ b/python/ray/tune/tests/execution/test_controller_control_integration.py @@ -0,0 +1,91 @@ +from collections import Counter + +import pytest +import sys + +import ray +from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager +from ray.tune import PlacementGroupFactory +from ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_stop_trial(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Stopping a trial while RUNNING or PENDING should work. 
+ + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testStopTrial + """ + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + ) + kwargs = { + "stopping_criterion": {"training_iteration": 10}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 2, "GPU": 1}]), + "config": {"sleep": 1}, + } + trials = [ + Trial("__fake", **kwargs), + Trial("__fake", **kwargs), + Trial("__fake", **kwargs), + Trial("__fake", **kwargs), + ] + for t in trials: + runner.add_trial(t) + + counter = Counter(t.status for t in trials) + + # Wait until 2 trials started + while counter.get("RUNNING", 0) != 2: + runner.step() + counter = Counter(t.status for t in trials) + + assert counter.get("RUNNING", 0) == 2 + assert counter.get("PENDING", 0) == 2 + + # Stop trial that is running + for trial in trials: + if trial.status == Trial.RUNNING: + runner._schedule_trial_stop(trial) + break + + counter = Counter(t.status for t in trials) + + # Wait until the next trial started + while counter.get("RUNNING", 0) < 2: + runner.step() + counter = Counter(t.status for t in trials) + + assert counter.get("RUNNING", 0) == 2 + assert counter.get("TERMINATED", 0) == 1 + assert counter.get("PENDING", 0) == 1 + + # Stop trial that is pending + for trial in trials: + if trial.status == Trial.PENDING: + runner._schedule_trial_stop(trial) + break + + counter = Counter(t.status for t in trials) + + # Wait until 2 trials are running again + while counter.get("RUNNING", 0) < 2: + runner.step() + counter = Counter(t.status for t in trials) + + assert counter.get("RUNNING", 0) == 2 + assert counter.get("TERMINATED", 0) == 2 + assert counter.get("PENDING", 0) == 0 + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/test_controller_errors_integration.py b/python/ray/tune/tests/execution/test_controller_errors_integration.py new file mode 100644 index 000000000000..6b69fd272343 --- /dev/null +++ 
b/python/ray/tune/tests/execution/test_controller_errors_integration.py @@ -0,0 +1,196 @@ +import os +from collections import Counter + +import pytest +import sys + +import ray +from ray.air import CheckpointConfig +from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager +from ray.tune import PlacementGroupFactory, TuneError +from ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial +from ray.tune.registry import TRAINABLE_CLASS, _global_registry +from ray.tune.schedulers import FIFOScheduler +from ray.tune.search import BasicVariantGenerator +from ray.tune.tests.execution.utils import BudgetResourceManager + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +def create_mock_components(): + class _MockScheduler(FIFOScheduler): + errored_trials = [] + + def on_trial_error(self, trial_runner, trial): + self.errored_trials += [trial] + + class _MockSearchAlg(BasicVariantGenerator): + errored_trials = [] + + def on_trial_complete(self, trial_id, error=False, **kwargs): + if error: + self.errored_trials += [trial_id] + + searchalg = _MockSearchAlg() + scheduler = _MockScheduler() + return searchalg, scheduler + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_invalid_trainable(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """An invalid trainable should make the trial fail on startup. + + The controller itself should continue. Other trials should run. 
+ + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testErrorHandling + """ + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + ) + kwargs = { + "stopping_criterion": {"training_iteration": 1}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + } + _global_registry.register(TRAINABLE_CLASS, "asdf", None) + trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)] + for t in trials: + runner.add_trial(t) + + while not trials[1].status == Trial.RUNNING: + runner.step() + assert trials[0].status == Trial.ERROR + assert trials[1].status == Trial.RUNNING + + +def test_overstep(ray_start_4_cpus_2_gpus_extra): + """Stepping when trials are finished should raise a TuneError. + + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testThrowOnOverstep + """ + os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" + runner = TuneController( + resource_manager_factory=lambda: BudgetResourceManager({"CPU": 4}), + ) + runner.step() + with pytest.raises(TuneError): + runner.step() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +@pytest.mark.parametrize("max_failures_persistent", [(0, False), (1, False), (2, True)]) +def test_failure_recovery( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, max_failures_persistent +): + """Test failure recover with `max_failures`. + + Trials should be retried up to `max_failures` times. 
+ + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testFailureRecoveryDisabled + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testFailureRecoveryEnabled + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testFailureRecoveryMaxFailures + """ + max_failures, persistent_error = max_failures_persistent + searchalg, scheduler = create_mock_components() + + runner = TuneController( + search_alg=searchalg, + scheduler=scheduler, + resource_manager_factory=lambda: resource_manager_cls(), + ) + kwargs = { + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + "stopping_criterion": {"training_iteration": 2}, + "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + "max_failures": max_failures, + "config": {"mock_error": True, "persistent_error": persistent_error}, + } + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + while not runner.is_finished(): + runner.step() + + if persistent_error or not max_failures: + assert trials[0].status == Trial.ERROR + + num_failures = max_failures + 1 + assert trials[0].num_failures == num_failures + # search alg receives on_complete, so only after the max failures + # have been exhausted. Thus, it only has errored_trials if the + # trial fails even in the last try. + assert len(searchalg.errored_trials) == 1 + # search alg receives on_error, so every failure is registered. + assert len(scheduler.errored_trials) == num_failures + else: + assert trials[0].status == Trial.TERMINATED + assert trials[0].num_failures == 1 + assert len(searchalg.errored_trials) == 0 + assert len(scheduler.errored_trials) == 1 + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +@pytest.mark.parametrize("fail_fast", [True, TuneController.RAISE]) +def test_fail_fast(ray_start_4_cpus_2_gpus_extra, resource_manager_cls, fail_fast): + """Test fail_fast feature. 
+ + If fail_fast=True, after the first failure, all other trials should be terminated + (because we end the experiment). + + If fail_fast=RAISE, after the first failure, we should raise an error. + + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testFailFast + Legacy test: test_trial_runner_2.py::TrialRunnerTest::testFailFastRaise + """ + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), fail_fast=fail_fast + ) + kwargs = { + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + "max_failures": 0, + "config": { + "mock_error": True, + "persistent_error": True, + }, + } + runner.add_trial(Trial("__fake", **kwargs)) + runner.add_trial(Trial("__fake", **kwargs)) + trials = runner.get_trials() + + if fail_fast == TuneController.RAISE: + with pytest.raises(Exception): + while not runner.is_finished(): + runner.step() + runner.cleanup() + return + else: + while not runner.is_finished(): + runner.step() + + status_count = Counter(t.status for t in trials) + + # One trial failed + assert status_count.get(Trial.ERROR) == 1 + # The other one was pre-empted + assert status_count.get(Trial.TERMINATED) == 1 + + # Controller finished + with pytest.raises(TuneError): + runner.step() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/test_controller_resources_integration.py b/python/ray/tune/tests/execution/test_controller_resources_integration.py new file mode 100644 index 000000000000..40f775c066c0 --- /dev/null +++ b/python/ray/tune/tests/execution/test_controller_resources_integration.py @@ -0,0 +1,259 @@ +import os +import time +from collections import Counter + +import pytest +import sys + +import ray +from ray import tune +from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager +from ray.tune import PlacementGroupFactory, TuneError +from 
ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial +from ray.tune.schedulers import FIFOScheduler, TrialScheduler +from ray.tune.search import BasicVariantGenerator +from ray.tune.utils.mock import TrialStatusSnapshot, TrialStatusSnapshotTaker + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +@pytest.mark.parametrize( + "bundles", + [ + [{"CPU": 1}, {"CPU": 3, "GPU": 1}], + [{"CPU": 1, "a": 2}], + [{"CPU": 1}, {"a": 2}], + [{"CPU": 1, "GPU": 1}], + ], +) +def test_resource_parallelism_single( + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, bundles +): + """Test that extra and custom resources are respected for parallelism. + + We schedule two trials with resources according to the bundle. If only + the head bundle or only CPU/GPU resources were considered, both trials + could run in parallel. + + However, we assert that the resources in child bundles and extra resources + are respected and only one trial runs in parallel. 
+ + Legacy test: test_trial_runner.py::TrialRunnerTest::testExtraResources + Legacy test: test_trial_runner.py::TrialRunnerTest::testCustomResources + Legacy test: test_trial_runner.py::TrialRunnerTest::testExtraCustomResources + Legacy test: test_trial_runner.py::TrialRunnerTest::testResourceScheduler + """ + snapshot = TrialStatusSnapshot() + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + callbacks=[TrialStatusSnapshotTaker(snapshot)], + ) + kwargs = { + "stopping_criterion": {"training_iteration": 1}, + "placement_group_factory": PlacementGroupFactory(bundles), + } + trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)] + for t in trials: + runner.add_trial(t) + + while not runner.is_finished(): + runner.step() + + assert snapshot.max_running_trials() == 1 + assert snapshot.all_trials_are_terminated() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_fractional_gpus(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Test that fractional GPUs lead to more parallelism. + + We schedule four trials with 0.75 GPUs each. Since our cluster has 2 GPUs, + we should be able to run 2 trials in parallel. 
+ + Legacy test: test_trial_runner.py::TrialRunnerTest::testFractionalGpus + """ + snapshot = TrialStatusSnapshot() + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + callbacks=[TrialStatusSnapshotTaker(snapshot)], + ) + kwargs = { + "stopping_criterion": {"training_iteration": 1}, + "placement_group_factory": PlacementGroupFactory([{"GPU": 0.75}]), + "config": { + "sleep": 1, + }, + } + trials = [Trial("__fake", **kwargs) for i in range(4)] + for t in trials: + runner.add_trial(t) + + while not runner.is_finished(): + runner.step() + + assert snapshot.max_running_trials() == 2 + assert snapshot.all_trials_are_terminated() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_multi_step(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Test that trials can run for more than one iteration. + + Todo (krfricke): This is not a resource test, so it should be moved. + + Legacy test: test_trial_runner.py::TrialRunnerTest::testMultiStepRun + Legacy test: test_trial_runner.py::TrialRunnerTest::testMultiStepRun2 + """ + snapshot = TrialStatusSnapshot() + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + callbacks=[TrialStatusSnapshotTaker(snapshot)], + ) + kwargs = { + "stopping_criterion": {"training_iteration": 5}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + } + trials = [Trial("__fake", **kwargs) for i in range(2)] + for t in trials: + runner.add_trial(t) + + while not runner.is_finished(): + runner.step() + + # Overstepping should throw error + # test_trial_runner.py::TrialRunnerTest::testMultiStepRun2 + with pytest.raises(TuneError): + runner.step() + + assert snapshot.all_trials_are_terminated() + assert all(t.last_result["training_iteration"] == 5 for t in runner.get_trials()) + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] 
+) +def test_resources_changing(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Checks that resource requirements can be changed on fly. + + Legacy test: test_trial_runner.py::TrialRunnerTest::testChangeResources + """ + + class ChangingScheduler(FIFOScheduler): + def __init__(self): + self._has_received_one_trial_result = False + + # For figuring out how many runner.step there are. + def has_received_one_trial_result(self): + return self._has_received_one_trial_result + + def on_trial_result(self, trial_runner, trial, result): + if result["training_iteration"] == 1: + self._has_received_one_trial_result = True + executor = trial_runner.trial_executor + executor.pause_trial(trial) + trial.update_resources(dict(cpu=4, gpu=0)) + return TrialScheduler.NOOP + + scheduler = ChangingScheduler() + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), scheduler=scheduler + ) + kwargs = { + "stopping_criterion": {"training_iteration": 2}, + "placement_group_factory": PlacementGroupFactory([{"CPU": 2, "GPU": 0}]), + } + trials = [Trial("__fake", **kwargs)] + for t in trials: + runner.add_trial(t) + + while not trials[0].status == Trial.RUNNING: + runner.step() + + assert trials[0].status == Trial.RUNNING + assert runner._actor_manager.get_live_actors_resources().get("CPU") == 2 + + with pytest.raises(ValueError): + trials[0].update_resources(dict(cpu=4, gpu=0)) + + while not scheduler.has_received_one_trial_result(): + runner.step() + + assert trials[0].status == Trial.PAUSED + + while not trials[0].status == Trial.RUNNING: + runner.step() + + assert runner._actor_manager.get_live_actors_resources().get("CPU") == 4 + + runner.step() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_queue_filling(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Checks that the trial queue is filled even if only 1 pending trial is allowed. 
+ + Legacy test: test_trial_runner.py::TrialRunnerTest::testQueueFilling + """ + os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" + + def f1(config): + for i in range(10): + yield i + time.sleep(1) + + tune.register_trainable("f1", f1) + + search_alg = BasicVariantGenerator() + search_alg.add_configurations( + { + "foo": { + "run": "f1", + "num_samples": 100, + "config": { + "a": tune.sample_from(lambda spec: 5.0 / 7), + "b": tune.sample_from(lambda spec: "long" * 40), + }, + "resources_per_trial": {"cpu": 2}, + } + } + ) + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), search_alg=search_alg + ) + + while len(runner.get_trials()) < 3: + runner.step() + + # All trials are enqueued + assert len(runner.get_trials()) == 3 + + status_count = Counter(t.status for t in runner.get_trials()) + while status_count.get(Trial.RUNNING, 0) < 2 and not runner.is_finished(): + runner.step() + status_count = Counter(t.status for t in runner.get_trials()) + + assert len(runner.get_trials()) == 3 + + status_count = Counter(t.status for t in runner.get_trials()) + assert status_count.get(Trial.RUNNING) == 2 + assert status_count.get(Trial.PENDING) == 1 + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/test_controller_search_alg_integration.py b/python/ray/tune/tests/execution/test_controller_search_alg_integration.py new file mode 100644 index 000000000000..fc3e3e2cd0ed --- /dev/null +++ b/python/ray/tune/tests/execution/test_controller_search_alg_integration.py @@ -0,0 +1,366 @@ +import os +import pickle +from collections import Counter + +import pytest +import sys + +import ray +from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager +from ray.tune import Experiment, PlacementGroupFactory +from ray.tune.execution.tune_controller import TuneController +from ray.tune.experiment import Trial +from ray.tune.result import TRAINING_ITERATION +from 
ray.tune.schedulers import FIFOScheduler, TrialScheduler +from ray.tune.search import Searcher, ConcurrencyLimiter, Repeater, SearchGenerator +from ray.tune.search._mock import _MockSuggestionAlgorithm + + +@pytest.fixture(scope="function") +def ray_start_8_cpus(): + address_info = ray.init(num_cpus=8, num_gpus=0) + yield address_info + ray.shutdown() + + +@pytest.fixture(scope="function") +def ray_start_4_cpus_2_gpus_extra(): + address_info = ray.init(num_cpus=4, num_gpus=2, resources={"a": 2}) + yield address_info + ray.shutdown() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_search_alg_notification(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Check that the searchers gets notified of trial results + completions. + + Also check that the searcher is "finished" before the runner, i.e. the runner + continues processing trials when the searcher finished. + + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearchAlgNotification + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearchAlgFinished + """ + + experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}} + experiments = [Experiment.from_json("test", experiment_spec)] + search_alg = _MockSuggestionAlgorithm() + searcher = search_alg.searcher + search_alg.add_configurations(experiments) + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), search_alg=search_alg + ) + + # Run until trial is running + while not search_alg.is_finished(): + runner.step() + + trials = runner.get_trials() + + # Make sure trial started + while trials[0].status != Trial.RUNNING: + runner.step() + + assert trials[0].status == Trial.RUNNING + assert search_alg.is_finished() + assert not runner.is_finished() + + # Run until everything finished + while not runner.is_finished(): + runner.step() + + assert trials[0].status == Trial.TERMINATED + assert search_alg.is_finished() + assert 
runner.is_finished() + + assert searcher.counter["result"] == 1 + assert searcher.counter["complete"] == 1 + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_search_alg_scheduler_stop(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Check that a scheduler-issued stop also notifies the search algorithm. + + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearchAlgSchedulerInteraction # noqa + """ + + class _MockScheduler(FIFOScheduler): + def on_trial_result(self, *args, **kwargs): + return TrialScheduler.STOP + + experiment_spec = {"run": "__fake", "stop": {"training_iteration": 5}} + experiments = [Experiment.from_json("test", experiment_spec)] + search_alg = _MockSuggestionAlgorithm() + searcher = search_alg.searcher + search_alg.add_configurations(experiments) + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + search_alg=search_alg, + scheduler=_MockScheduler(), + ) + + trials = runner.get_trials() + + while not runner.is_finished(): + runner.step() + + # Result is not processed because trial stop takes precedence + assert searcher.counter["result"] == 0 + # But on_trial_complete is triggered... + assert searcher.counter["complete"] == 1 + # ... and still updates the last result. + assert trials[0].last_result[TRAINING_ITERATION] == 1 + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_search_alg_stalled(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Checks that runner and searcher state is maintained when stalled. + + We use a concurrency limit of 1, meaning each trial is added one-by-one + from the searchers. + + We then run three samples. During the second trial, we stall the searcher, + which means we don't suggest new trials after it finished. + + In this case, the runner should still be considered "running". 
Once we unstall, + the experiment finishes regularly. + + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearchAlgStalled + """ + experiment_spec = { + "run": "__fake", + "num_samples": 3, + "stop": {"training_iteration": 1}, + } + experiments = [Experiment.from_json("test", experiment_spec)] + search_alg = _MockSuggestionAlgorithm(max_concurrent=1) + search_alg.add_configurations(experiments) + searcher = search_alg.searcher + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + search_alg=search_alg, + ) + runner.step() + trials = runner.get_trials() + while trials[0].status != Trial.TERMINATED: + runner.step() + + # On next step, trials[1] is created + runner.step() + + trials = runner.get_trials() + + while trials[1].status != Trial.RUNNING: + runner.step() + + assert trials[1].status == Trial.RUNNING + assert len(searcher.live_trials) == 1 + + # Stall: We don't suggest new algorithms + searcher.stall = True + + while trials[1].status != Trial.TERMINATED: + runner.step() + + assert trials[1].status == Trial.TERMINATED + assert len(searcher.live_trials) == 0 + + assert all(trial.is_finished() for trial in trials) + assert not search_alg.is_finished() + assert not runner.is_finished() + + # Unstall + searcher.stall = False + + # Create trials[2] + runner.step() + + trials = runner.get_trials() + + while trials[2].status != Trial.RUNNING: + runner.step() + + assert trials[2].status == Trial.RUNNING + assert len(searcher.live_trials) == 1 + + while trials[2].status != Trial.TERMINATED: + runner.step() + + assert len(searcher.live_trials) == 0 + assert search_alg.is_finished() + assert runner.is_finished() + + +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_search_alg_finishes(ray_start_4_cpus_2_gpus_extra, resource_manager_cls): + """Empty SearchAlg changing state in `next_trials` does not crash. 
+ + The search algorithm changes to ``finished`` mid-run. This should not + affect processing of the experiment. + + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearchAlgFinishes + """ + os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1" + + class FinishFastAlg(_MockSuggestionAlgorithm): + _index = 0 + + def next_trial(self): + spec = self._experiment.spec + trial = None + if self._index < spec["num_samples"]: + trial = Trial(spec.get("run"), stopping_criterion=spec.get("stop")) + self._index += 1 + + if self._index > 4: + self.set_finished() + + return trial + + def suggest(self, trial_id): + return {} + + experiment_spec = { + "run": "__fake", + "num_samples": 2, + "stop": {"training_iteration": 1}, + } + searcher = FinishFastAlg() + experiments = [Experiment.from_json("test", experiment_spec)] + searcher.add_configurations(experiments) + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + search_alg=searcher, + ) + + assert not runner.is_finished() + + while len(runner.get_trials()) < 2: + runner.step() # Launch 2 runs + + assert not searcher.is_finished() + assert not runner.is_finished() + + searcher_finished_before = False + while not runner.is_finished(): + runner.step() + searcher_finished_before = searcher.is_finished() + + # searcher_finished_before will be True if the searcher was finished before + # the controller. + assert searcher_finished_before + + +# Todo (krfricke): Fix in next batch +@pytest.mark.skip("This test is currently flaky as it can fail due to timing issues.") +@pytest.mark.parametrize( + "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] +) +def test_searcher_save_restore(ray_start_8_cpus, resource_manager_cls, tmpdir): + """Searchers state should be saved and restored in the experiment checkpoint. 
+ + Legacy test: test_trial_runner_3.py::TrialRunnerTest::testSearcherSaveRestore + """ + + def create_searcher(): + class TestSuggestion(Searcher): + def __init__(self, index): + self.index = index + self.returned_result = [] + super().__init__(metric="episode_reward_mean", mode="max") + + def suggest(self, trial_id): + self.index += 1 + return {"test_variable": self.index} + + def on_trial_complete(self, trial_id, result=None, **kwargs): + self.returned_result.append(result) + + def save(self, checkpoint_path): + with open(checkpoint_path, "wb") as f: + pickle.dump(self.__dict__, f) + + def restore(self, checkpoint_path): + with open(checkpoint_path, "rb") as f: + self.__dict__.update(pickle.load(f)) + + searcher = TestSuggestion(0) + searcher = ConcurrencyLimiter(searcher, max_concurrent=2) + searcher = Repeater(searcher, repeat=3, set_index=False) + search_alg = SearchGenerator(searcher) + experiment_spec = { + "run": "__fake", + "num_samples": 20, + "config": {"sleep": 10}, + "stop": {"training_iteration": 2}, + "resources_per_trial": PlacementGroupFactory([{"CPU": 1}]), + } + experiments = [Experiment.from_json("test", experiment_spec)] + search_alg.add_configurations(experiments) + return search_alg + + searcher = create_searcher() + + runner = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + search_alg=searcher, + checkpoint_period=-1, + experiment_path=str(tmpdir), + ) + + while len(runner.get_trials()) < 6: + runner.step() + + assert len(runner.get_trials()) == 6, [t.config for t in runner.get_trials()] + runner.checkpoint() + trials = runner.get_trials() + [runner._schedule_trial_stop(t) for t in trials if t.status is not Trial.ERROR] + + runner.cleanup() + + del runner + + searcher = create_searcher() + + runner2 = TuneController( + resource_manager_factory=lambda: resource_manager_cls(), + search_alg=searcher, + experiment_path=str(tmpdir), + resume="LOCAL", + ) + + assert len(runner2.get_trials()) == 6, [t.config for t in 
runner2.get_trials()] + + def trial_statuses(): + return [t.status for t in runner2.get_trials()] + + def num_running_trials(): + return sum(t.status == Trial.RUNNING for t in runner2.get_trials()) + + while num_running_trials() < 6: + runner2.step() + + assert len(set(trial_statuses())) == 1 + assert Trial.RUNNING in trial_statuses() + + for i in range(20): + runner2.step() + assert 1 <= num_running_trials() <= 6 + + evaluated = [t.evaluated_params["test_variable"] for t in runner2.get_trials()] + count = Counter(evaluated) + assert all(v <= 3 for v in count.values()) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/execution/utils.py b/python/ray/tune/tests/execution/utils.py index e284d1a80e19..62eaaa232d06 100644 --- a/python/ray/tune/tests/execution/utils.py +++ b/python/ray/tune/tests/execution/utils.py @@ -1,9 +1,9 @@ import os import uuid -from collections import Counter from typing import Any, Callable, Dict, Optional, Tuple, Type, Union import ray +from ray.air.execution import FixedResourceManager from ray.air.execution._internal import RayActorManager from ray.air.execution.resources import ( ResourceManager, @@ -20,22 +20,12 @@ def get(self, trainable_name: str): return trainable_name -class NoopResourceManager(ResourceManager): - def __init__(self): - self.requested_resources = [] - self.canceled_resource_requests = [] - self.currently_requested_resources = Counter() - - def request_resources(self, resource_request: ResourceRequest): - self.requested_resources.append(resource_request) - self.currently_requested_resources[resource_request] += 1 - - def cancel_resource_request(self, resource_request: ResourceRequest): - self.canceled_resource_requests.append(resource_request) - self.currently_requested_resources[resource_request] -= 1 - - def has_resources_ready(self, resource_request: ResourceRequest) -> bool: - return True +class BudgetResourceManager(FixedResourceManager): + def 
__init__(self, total_resources: Dict[str, float]): + self._allow_strict_pack = True + self._total_resources = total_resources + self._requested_resources = [] + self._used_resources = [] class NoopActorManager(RayActorManager): @@ -68,6 +58,7 @@ def remove_actor( self, tracked_actor: TrackedActor, kill: bool = False, + stop_future: Optional[ray.ObjectRef] = None, ) -> None: self.removed_actors.append(tracked_actor) @@ -106,14 +97,18 @@ def create_placement_group_factory(self): pass -def create_execution_test_objects(tmpdir, max_pending_trials: int = 8): +def create_execution_test_objects( + tmpdir, max_pending_trials: int = 8, resources: Optional[Dict[str, float]] = None +): os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = str(max_pending_trials) + resources = resources or {"CPU": 4} + tune_controller = TuneController( experiment_path=str(tmpdir), reuse_actors=True, ) - resource_manager = NoopResourceManager() + resource_manager = BudgetResourceManager(total_resources=resources) actor_manger = NoopActorManager(resource_manager) tune_controller._actor_manager = actor_manger tune_controller._class_cache = NoopClassCache() diff --git a/python/ray/tune/tests/output/test_output.py b/python/ray/tune/tests/output/test_output.py index c13248b79068..4aa225d05205 100644 --- a/python/ray/tune/tests/output/test_output.py +++ b/python/ray/tune/tests/output/test_output.py @@ -1,9 +1,9 @@ import pytest import sys -import time from freezegun import freeze_time +from ray import tune from ray.tune.experimental.output import ( _get_time_str, _get_trials_by_state, @@ -13,6 +13,8 @@ _current_best_trial, _best_trial_str, _get_trial_table_data, + _get_dict_as_table_data, + _infer_params, ) from ray.tune.experiment.trial import Trial @@ -54,8 +56,28 @@ @freeze_time("Mar 27th, 2023", auto_tick_seconds=15) def test_get_time_str(): - result = _get_time_str(time.time(), time.time()) - assert result == ("2023-03-27 00:00:15", "00:00:15.00") + base = 1679875200 # 2023-03-27 00:00:00 + + assert 
_get_time_str(base, base) == ("2023-03-27 00:00:00", "0s") + assert _get_time_str(base, base + 15) == ("2023-03-27 00:00:15", "15s") + assert _get_time_str(base, base + 60) == ("2023-03-27 00:01:00", "1min 0s") + assert _get_time_str(base, base + 65) == ("2023-03-27 00:01:05", "1min 5s") + assert _get_time_str(base, base + 3600) == ( + "2023-03-27 01:00:00", + "1hr 0min 0s", + ) + assert _get_time_str(base, base + 3605) == ( + "2023-03-27 01:00:05", + "1hr 0min 5s", + ) + assert _get_time_str(base, base + 3660) == ( + "2023-03-27 01:01:00", + "1hr 1min 0s", + ) + assert _get_time_str(base, base + 86400) == ( + "2023-03-28 00:00:00", + "1d 0hr 0min 0s", + ) def test_get_trials_by_state(): @@ -115,7 +137,8 @@ def test_get_trial_info(): t.last_result = LAST_RESULT assert _get_trial_info( t, - [ + param_keys=[], + metric_keys=[ "episode_reward_mean", "episode_reward_max", "episode_reward_min", @@ -132,10 +155,11 @@ def test_get_trial_table_data_less_than_20(): t.trial_id = str(i) t.set_status(Trial.RUNNING) t.last_result = {"episode_reward_mean": 100 + i} + t.config = {"param": i} trials.append(t) - table_data = _get_trial_table_data(trials, ["episode_reward_mean"]) + table_data = _get_trial_table_data(trials, ["param"], ["episode_reward_mean"]) header = table_data.header - assert header == ["Trial name", "status", "reward"] + assert header == ["Trial name", "status", "param", "reward"] table_data = table_data.data assert len(table_data) == 1 # only the running category assert len(table_data[0].trial_infos) == 20 @@ -151,17 +175,87 @@ def test_get_trial_table_data_more_than_20(): t.trial_id = str(i) t.set_status(status) t.last_result = {"episode_reward_mean": 100 + i} + t.config = {"param": i} trials.append(t) - table_data = _get_trial_table_data(trials, ["episode_reward_mean"]) + table_data = _get_trial_table_data(trials, ["param"], ["episode_reward_mean"]) header = table_data.header - assert header == ["Trial name", "status", "reward"] + assert header == ["Trial 
name", "status", "param", "reward"] table_data = table_data.data assert len(table_data) == 3 # only the running category for i in range(3): assert len(table_data[i].trial_infos) == 5 - assert table_data[0].more_info == "... and 5 more RUNNING ..." - assert table_data[1].more_info == "... and 5 more TERMINATED ..." - assert table_data[2].more_info == "... and 5 more PENDING ..." + assert table_data[0].more_info == "5 more RUNNING" + assert table_data[1].more_info == "5 more TERMINATED" + assert table_data[2].more_info == "5 more PENDING" + + +def test_infer_params(): + assert _infer_params({}) == [] + assert _infer_params({"some": "val"}) == [] + assert _infer_params({"some": "val", "param": tune.uniform(0, 1)}) == ["param"] + assert _infer_params({"some": "val", "param": tune.grid_search([0, 1])}) == [ + "param" + ] + assert sorted( + _infer_params( + { + "some": "val", + "param": tune.grid_search([0, 1]), + "other": tune.choice([0, 1]), + } + ) + ) == ["other", "param"] + + +def test_result_table_no_divison(): + data = _get_dict_as_table_data( + { + "b": 6, + "a": 8, + "x": 19.123123123, + "c": 5, + "ignore": 9, + "y": 20, + "z": {"m": 4, "n": {"o": "p"}}, + }, + exclude={"ignore"}, + ) + + assert data == [ + ["a", 8], + ["b", 6], + ["c", 5], + ["x", "19.12312"], + ["y", 20], + ["z/m", 4], + ["z/n/o", "p"], + ] + + +def test_result_table_divison(): + data = _get_dict_as_table_data( + { + "b": 6, + "a": 8, + "x": 19.123123123, + "c": 5, + "ignore": 9, + "y": 20, + "z": {"m": 4, "n": {"o": "p"}}, + }, + exclude={"ignore"}, + upper_keys={"x", "y", "z", "z/m", "z/n/o"}, + ) + + assert data == [ + ["x", "19.12312"], + ["y", 20], + ["z/m", 4], + ["z/n/o", "p"], + ["a", 8], + ["b", 6], + ["c", 5], + ] if __name__ == "__main__": diff --git a/python/ray/tune/tests/test_client.py b/python/ray/tune/tests/test_client.py index 6252574fead1..b8eaf0e26136 100644 --- a/python/ray/tune/tests/test_client.py +++ b/python/ray/tune/tests/test_client.py @@ -29,6 +29,17 @@ def 
start_client_server_2_cpus(): ray.shutdown() +@pytest.fixture +def legacy_progress_reporter(): + old_val = os.environ.get("RAY_AIR_NEW_OUTPUT") + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" + yield + if old_val is None: + os.environ.pop("RAY_AIR_NEW_OUTPUT") + else: + os.environ["RAY_AIR_NEW_OUTPUT"] = old_val + + @pytest.fixture def start_client_server_4_cpus(): ray.init(num_cpus=4) @@ -37,49 +48,51 @@ def start_client_server_4_cpus(): ray.shutdown() -def test_pbt_function(start_client_server_2_cpus): +def test_pbt_function(legacy_progress_reporter, start_client_server_2_cpus): assert ray.util.client.ray.is_connected() from ray.tune.examples.pbt_function import run_tune_pbt run_tune_pbt() -def test_optuna_example(start_client_server): +def test_optuna_example(legacy_progress_reporter, start_client_server): assert ray.util.client.ray.is_connected() from ray.tune.examples.optuna_example import run_optuna_tune run_optuna_tune(smoke_test=True) -def test_cifar10_pytorch(start_client_server_2_cpus): +def test_cifar10_pytorch(legacy_progress_reporter, start_client_server_2_cpus): assert ray.util.client.ray.is_connected() from ray.tune.examples.cifar10_pytorch import main main(num_samples=1, max_num_epochs=1, gpus_per_trial=0) -def test_tune_mnist_keras(start_client_server_4_cpus): +def test_tune_mnist_keras(legacy_progress_reporter, start_client_server_4_cpus): assert ray.util.client.ray.is_connected() from ray.tune.examples.tune_mnist_keras import tune_mnist tune_mnist(num_training_iterations=5) -def test_mnist_ptl_mini(start_client_server): +def test_mnist_ptl_mini(legacy_progress_reporter, start_client_server): assert ray.util.client.ray.is_connected() from ray.tune.examples.mnist_ptl_mini import tune_mnist tune_mnist(num_samples=1, num_epochs=1, gpus_per_trial=0) -def test_xgboost_example(start_client_server): +def test_xgboost_example(legacy_progress_reporter, start_client_server): assert ray.util.client.ray.is_connected() from ray.tune.examples.xgboost_example import 
tune_xgboost tune_xgboost() -def test_xgboost_dynamic_resources_example(start_client_server): +def test_xgboost_dynamic_resources_example( + legacy_progress_reporter, start_client_server +): assert ray.util.client.ray.is_connected() from ray.tune.examples.xgboost_dynamic_resources_example import tune_xgboost @@ -87,7 +100,7 @@ def test_xgboost_dynamic_resources_example(start_client_server): tune_xgboost(use_class_trainable=False) -def test_mlflow_example(start_client_server): +def test_mlflow_example(legacy_progress_reporter, start_client_server): assert ray.util.client.ray.is_connected() from ray.tune.examples.mlflow_example import tune_with_callback, tune_with_setup @@ -96,14 +109,14 @@ def test_mlflow_example(start_client_server): tune_with_setup(mlflow_tracking_uri, finish_fast=True) -def test_pbt_transformers(start_client_server): +def test_pbt_transformers(legacy_progress_reporter, start_client_server): assert ray.util.client.ray.is_connected() from ray.tune.examples.pbt_transformers.pbt_transformers import tune_transformer tune_transformer(num_samples=1, gpus_per_trial=0, smoke_test=True) -def test_jupyter_rich_output(start_client_server_4_cpus): +def test_jupyter_rich_output(legacy_progress_reporter, start_client_server_4_cpus): assert ray.util.client.ray.is_connected() def dummy_objective(config): diff --git a/python/ray/tune/tests/test_cluster_searcher.py b/python/ray/tune/tests/test_cluster_searcher.py index b10312d1ccc0..5a3095019792 100644 --- a/python/ray/tune/tests/test_cluster_searcher.py +++ b/python/ray/tune/tests/test_cluster_searcher.py @@ -40,7 +40,7 @@ def start_connected_cluster(): @pytest.mark.skipif( - os.environ.get("TUNE_NEW_EXECUTION") == "1", + os.environ.get("TUNE_NEW_EXECUTION") != "0", reason=( "This test uses the TrialRunner directly and needs to be rewritten " "for the new execution backend." 
diff --git a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py index dc17d26e5b03..0d966fff3e73 100644 --- a/python/ray/tune/tests/test_experiment_analysis.py +++ b/python/ray/tune/tests/test_experiment_analysis.py @@ -9,6 +9,7 @@ import ray from ray import tune +from ray.air._internal.remote_storage import upload_to_uri from ray.tune import ExperimentAnalysis import ray.tune.registry from ray.tune.tests.utils.experiment import create_test_experiment_checkpoint @@ -355,6 +356,21 @@ def train(config): self.assertEqual(var, 1) +def run_test_exp(path: str) -> ExperimentAnalysis: + with create_test_experiment_checkpoint(path) as creator: + for i in range(10): + trial = creator.create_trial(f"trial_{i}", config={"id": i, "hparam": 1}) + creator.trial_result( + trial, + { + "training_iteration": 1, + "episode_reward_mean": 10 + int(90 * random.random()), + }, + ) + + return ExperimentAnalysis(path, trials=creator.get_trials()) + + class ExperimentAnalysisStubSuite(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp() @@ -362,27 +378,12 @@ def setUp(self): self.num_samples = 2 self.metric = "episode_reward_mean" self.test_path = os.path.join(self.test_dir, self.test_name) - self.run_test_exp() def tearDown(self): shutil.rmtree(self.test_dir, ignore_errors=True) - def run_test_exp(self): - with create_test_experiment_checkpoint(self.test_path) as creator: - for i in range(10): - trial = creator.create_trial(f"trial_{i}", config={}) - creator.trial_result( - trial, - { - "training_iteration": 1, - "episode_reward_mean": 10 + int(90 * random.random()), - }, - ) - - return ExperimentAnalysis(self.test_dir, trials=creator.get_trials()) - def testPickling(self): - analysis = self.run_test_exp() + analysis = run_test_exp(self.test_path) pickle_path = os.path.join(self.test_dir, "analysis.pickle") with open(pickle_path, "wb") as f: pickle.dump(analysis, f) @@ -394,8 +395,8 @@ def testPickling(self): 
self.assertTrue(analysis.get_best_trial(metric=self.metric, mode="max")) - def testFromPath(self): - self.run_test_exp() + def testFromLocalPath(self): + run_test_exp(self.test_path) analysis = ExperimentAnalysis(self.test_path) self.assertTrue(analysis.get_best_trial(metric=self.metric, mode="max")) @@ -433,6 +434,37 @@ def testEmptyCheckpoint(self): assert len(ea.trials) == 10 +def test_create_from_remote_path(tmp_path, mock_s3_bucket_uri): + run_test_exp(str(tmp_path)) + upload_to_uri(str(tmp_path), mock_s3_bucket_uri) + + local_analysis = ExperimentAnalysis(str(tmp_path)) + remote_analysis = ExperimentAnalysis(mock_s3_bucket_uri) + + metric = "episode_reward_mean" + mode = "max" + + # Tracked metric data is the same + assert ( + local_analysis.get_best_trial(metric=metric, mode=mode).trial_id + == remote_analysis.get_best_trial(metric=metric, mode=mode).trial_id + ) + + # Trial result dataframes are the same + assert all( + local_df.equals(remote_df) + for local_df, remote_df in zip( + local_analysis.trial_dataframes.values(), + remote_analysis.trial_dataframes.values(), + ) + ) + + # Trial configs are the same + assert list(local_analysis.get_all_configs().values()) == list( + remote_analysis.get_all_configs().values() + ) + + if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tune/tests/test_progress_reporter.py b/python/ray/tune/tests/test_progress_reporter.py index e04bb33d7181..d1e8a0c93119 100644 --- a/python/ray/tune/tests/test_progress_reporter.py +++ b/python/ray/tune/tests/test_progress_reporter.py @@ -325,12 +325,10 @@ def train(config): # Add "verbose=3)" etc -@pytest.mark.skipif( - "AIR_VERBOSITY" in os.environ, reason="console v2 doesn't work with this v1 test." 
-) class ProgressReporterTest(unittest.TestCase): def setUp(self) -> None: os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "auto" + os.environ["RAY_AIR_NEW_OUTPUT"] = "0" def mock_trial(self, status, i): mock = MagicMock() @@ -402,7 +400,7 @@ def test(config): for i in range(3): tune.report(**test_result) - analysis = tune.run(test, num_samples=3) + analysis = tune.run(test, num_samples=3, verbose=3) all_trials = analysis.trials inferred_results = reporter._infer_user_metrics(all_trials) for metric in inferred_results: @@ -421,7 +419,7 @@ def report(self, *args, **kwargs): self._output.append(progress_str) reporter = TestReporter() - analysis = tune.run(test, num_samples=3, progress_reporter=reporter) + analysis = tune.run(test, num_samples=3, progress_reporter=reporter, verbose=3) found = {k: False for k in test_result} for output in reporter._output: for key in test_result: @@ -688,7 +686,7 @@ def testEndToEndReporting(self): output = run_string_as_driver(END_TO_END_COMMAND) try: # New execution path is too fast, trials are already terminated - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": assert EXPECTED_END_TO_END_START in output assert EXPECTED_END_TO_END_END in output assert "(raylet)" not in output, "Unexpected raylet log messages" @@ -713,7 +711,7 @@ def testVerboseReporting(self): self.assertIsNone(re.search(VERBOSE_TRIAL_NORM_2_PATTERN, output)) self.assertNotIn(VERBOSE_TRIAL_NORM_3, output) self.assertNotIn(VERBOSE_TRIAL_NORM_4, output) - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": self.assertNotIn(VERBOSE_TRIAL_DETAIL, output) except Exception: print("*** BEGIN OUTPUT ***") @@ -725,14 +723,14 @@ def testVerboseReporting(self): output = run_string_as_driver(verbose_1_cmd) try: # New execution path is too fast, trials are already terminated - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": 
self.assertIn(VERBOSE_EXP_OUT_1, output) self.assertIn(VERBOSE_EXP_OUT_2, output) self.assertNotIn(VERBOSE_TRIAL_NORM_1, output) self.assertIsNone(re.search(VERBOSE_TRIAL_NORM_2_PATTERN, output)) self.assertNotIn(VERBOSE_TRIAL_NORM_3, output) self.assertNotIn(VERBOSE_TRIAL_NORM_4, output) - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": self.assertNotIn(VERBOSE_TRIAL_DETAIL, output) except Exception: print("*** BEGIN OUTPUT ***") @@ -743,7 +741,7 @@ def testVerboseReporting(self): verbose_2_cmd = VERBOSE_CMD + "verbose=2)" output = run_string_as_driver(verbose_2_cmd) try: - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": self.assertIn(VERBOSE_EXP_OUT_1, output) self.assertIn(VERBOSE_EXP_OUT_2, output) self.assertIn(VERBOSE_TRIAL_NORM_1, output) @@ -760,14 +758,14 @@ def testVerboseReporting(self): verbose_3_cmd = VERBOSE_CMD + "verbose=3)" output = run_string_as_driver(verbose_3_cmd) try: - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": self.assertIn(VERBOSE_EXP_OUT_1, output) self.assertIn(VERBOSE_EXP_OUT_2, output) self.assertNotIn(VERBOSE_TRIAL_NORM_1, output) self.assertIsNone(re.search(VERBOSE_TRIAL_NORM_2_PATTERN, output)) self.assertNotIn(VERBOSE_TRIAL_NORM_3, output) self.assertNotIn(VERBOSE_TRIAL_NORM_4, output) - if os.environ.get("TUNE_NEW_EXECUTION") != "1": + if os.environ.get("TUNE_NEW_EXECUTION") == "0": self.assertIn(VERBOSE_TRIAL_DETAIL, output) # Check that we don't print duplicate results at the end self.assertTrue(output.count(VERBOSE_TRIAL_WITH_ONCE_RESULT) == 1) @@ -799,7 +797,12 @@ def should_report(self, trials, done=False): def report(self, trials, done, *sys_info): pass - tune.run(lambda config: 2, num_samples=1, progress_reporter=CustomReporter()) + tune.run( + lambda config: 2, + num_samples=1, + progress_reporter=CustomReporter(), + verbose=3, + ) def testMaxLen(self): trials = [] diff 
--git a/python/ray/tune/tests/test_ray_trial_executor.py b/python/ray/tune/tests/test_ray_trial_executor.py index ad8a99d09b47..b289bcd2bd49 100644 --- a/python/ray/tune/tests/test_ray_trial_executor.py +++ b/python/ray/tune/tests/test_ray_trial_executor.py @@ -99,13 +99,14 @@ def train(config): ) msg = ( "Ignore this message if the cluster is autoscaling. " - "You asked for 5.0 cpu and 3.0 gpu per trial, " - "but the cluster only has 4.0 cpu and 2.0 gpu. " - "Stop the tuning job and " - "adjust the resources requested per trial " - "(possibly via `resources_per_trial` " - "or via `num_workers` for rllib) " - "and/or add more resources to your Ray runtime." + "No trial is running and no new trial has been started " + "within the last 0 seconds. This could be due to the cluster not having " + "enough resources available. You asked for 5.0 CPUs and 3.0 GPUs per " + "trial, but the cluster only has 4.0 CPUs and 2.0 GPUs available. " + "Stop the tuning and adjust the required resources " + "(e.g. via the `ScalingConfig` or `resources_per_trial`, " + "or `num_workers` for rllib), " + "or add more resources to your cluster." 
) mocked_warn.assert_called_once_with(msg) diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index 020a9ca127a0..c32fec617c2f 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -354,7 +354,7 @@ def test_num_errors_terminated(tmpdir): trials[i].status = Trial.TERMINATED create_tune_experiment_checkpoint(trials, local_checkpoint_dir=str(tmpdir)) - result_grid = ResultGrid(tune.ExperimentAnalysis(tmpdir)) + result_grid = ResultGrid(tune.ExperimentAnalysis(str(tmpdir))) assert len(result_grid.errors) == 3 assert result_grid.num_errors == 3 assert result_grid.num_terminated == 2 diff --git a/python/ray/tune/tests/test_syncer.py b/python/ray/tune/tests/test_syncer.py index 853b512e211b..e3bff59fb67e 100644 --- a/python/ray/tune/tests/test_syncer.py +++ b/python/ray/tune/tests/test_syncer.py @@ -6,22 +6,29 @@ import tempfile import time from typing import List, Optional +import unittest from unittest.mock import patch -import boto3 -import pytest from freezegun import freeze_time +import numpy as np +import pyarrow.fs +import pytest import ray import ray.cloudpickle as pickle from ray import tune from ray.air import session, Checkpoint, RunConfig +from ray.air.config import CheckpointConfig, ScalingConfig +from ray.air._internal.remote_storage import ( + upload_to_uri, + download_from_uri, + get_fs_and_path, +) from ray.air._internal.uri_utils import URI +from ray.train.torch import TorchTrainer from ray.tune import TuneError from ray.tune.syncer import _DefaultSyncer, Syncer, SyncConfig from ray.tune.utils.file_transfer import _pack_dir, _unpack_dir -from ray.air._internal.remote_storage import upload_to_uri, download_from_uri -from ray._private.test_utils import simulate_storage @pytest.fixture @@ -74,26 +81,6 @@ def temp_data_dirs(): shutil.rmtree(tmp_target) -@pytest.fixture -def mock_s3_bucket_uri(): - port = 5002 - region = "us-west-2" - with 
simulate_storage("s3", port=port, region=region) as s3_uri: - s3 = boto3.client( - "s3", region_name=region, endpoint_url=f"http://localhost:{port}" - ) - # Bucket name will be autogenerated/unique per test - bucket_name = URI(s3_uri).name - s3.create_bucket( - Bucket=bucket_name, - CreateBucketConfiguration={"LocationConstraint": region}, - ) - # Disable server HTTP request logging - logging.getLogger("werkzeug").setLevel(logging.WARNING) - yield s3_uri - logging.getLogger("werkzeug").setLevel(logging.INFO) - - def assert_file(exists: bool, root: str, path: str): full_path = os.path.join(root, path) @@ -919,7 +906,7 @@ def train_func(config): ) -def test_sync_folder_with_many_files_s3(mock_s3_bucket_uri, tmp_path): +def _test_sync_folder_with_many_files_s3(mock_s3_bucket_uri, tmp_path): source_dir = tmp_path / "source" check_dir = tmp_path / "check" source_dir.mkdir() @@ -934,6 +921,28 @@ def test_sync_folder_with_many_files_s3(mock_s3_bucket_uri, tmp_path): assert (check_dir / "255").exists() +def test_sync_folder_with_many_files_s3_native(mock_s3_bucket_uri, tmp_path): + with patch("ray.air._internal.remote_storage.fsspec", None): + fs, path = get_fs_and_path(mock_s3_bucket_uri) + + assert isinstance(fs, pyarrow.fs.S3FileSystem) + + _test_sync_folder_with_many_files_s3(mock_s3_bucket_uri, tmp_path) + + +def test_sync_folder_with_many_files_s3_fsspec(mock_s3_bucket_uri, tmp_path): + try: + import s3fs # noqa: F401 + except Exception as exc: + raise AssertionError("This test requires s3fs to be installed") from exc + + fs, path = get_fs_and_path(mock_s3_bucket_uri) + + assert isinstance(fs, pyarrow.fs.PyFileSystem) + + _test_sync_folder_with_many_files_s3(mock_s3_bucket_uri, tmp_path) + + def test_sync_folder_with_many_files_fs(tmpdir): # Create 256 files to upload for i in range(256): @@ -1001,6 +1010,154 @@ def get_remote_trial_dir(trial_id: int): assert num_checkpoints == 2 # 1 before restore + 1 after +def test_distributed_checkpointing_to_s3( + 
ray_start_4_cpus, mock_s3_bucket_uri, tmp_path +): + """Tests a Tune run with distributed checkpointing to a mock s3 bucket. + + This test runs a Tune run with 3 distributed DDP workers. + We run 10 steps in total and checkpoint every 3 steps. + At the end of the test, we check the ranked index files are + available both locally and on the cloud. + We also make sure the model checkpoint files are only available + on the cloud. + """ + exp_name = "test_dist_ckpt_to_s3" + local_dir = os.path.join(tmp_path, "local_dir") + + def train_fn(config): + world_rank = session.get_world_rank() + for step in range(config["num_steps"]): + time.sleep(0.1) + checkpoint = None + if step % 3 == 0: + checkpoint_dir = tempfile.mkdtemp(dir=tmp_path) + path = os.path.join(checkpoint_dir, f"optim-{world_rank}.pt") + with open(path, "wb") as f: + f.write( + pickle.dumps( + { + "optimizer": "adam", + "lr": 0.001, + "optimizer_state": np.random.random((100, 100)), + } + ) + ) + path = os.path.join(checkpoint_dir, f"model-{world_rank}.pt") + with open(path, "wb") as f: + f.write( + pickle.dumps( + { + "model": "resnet", + "weights": np.random.random((100, 100)), + } + ) + ) + checkpoint = Checkpoint.from_directory(checkpoint_dir) + session.report({"score": step}, checkpoint=checkpoint) + + def _check_dir_content(checkpoint_dir, exist=True): + # Double check local checkpoint dir. + local_trial_data = os.listdir( + os.path.join(local_dir, "test_dist_ckpt_to_s3", "trial_0") + ) + if exist: + # checkpoint in local trial folder. + assert checkpoint_dir in local_trial_data + local_checkpoint_data = os.listdir( + os.path.join( + local_dir, "test_dist_ckpt_to_s3", "trial_0", checkpoint_dir + ) + ) + # Local folder has index files. + assert ".RANK_0.files" in local_checkpoint_data + assert ".RANK_1.files" in local_checkpoint_data + assert ".RANK_2.files" in local_checkpoint_data + # But no data files. 
+ assert "model-0.pt" not in local_checkpoint_data + assert "model-1.pt" not in local_checkpoint_data + assert "model-2.pt" not in local_checkpoint_data + else: + assert checkpoint_dir not in local_trial_data + + cloud_trial_data = os.listdir( + os.path.join(download_dir, "test_dist_ckpt_to_s3", "trial_0") + ) + if exist: + # Checkpoint in cloud trial folder. + assert checkpoint_dir in cloud_trial_data + cloud_checkpoint_data = os.listdir( + os.path.join( + download_dir, "test_dist_ckpt_to_s3", "trial_0", checkpoint_dir + ) + ) + # Cloud folder has index files. + assert ".RANK_0.files" in cloud_checkpoint_data + assert ".RANK_1.files" in cloud_checkpoint_data + assert ".RANK_2.files" in cloud_checkpoint_data + # And all the data files. + assert "model-0.pt" in cloud_checkpoint_data + assert "model-1.pt" in cloud_checkpoint_data + assert "model-2.pt" in cloud_checkpoint_data + else: + assert checkpoint_dir not in cloud_trial_data + + with unittest.mock.patch.dict(os.environ, {"RAY_AIR_LOCAL_CACHE_DIR": local_dir}): + trainer = TorchTrainer( + train_fn, + train_loop_config={"num_steps": 10}, + scaling_config=ScalingConfig( + num_workers=3, + use_gpu=False, + ), + # Note(jungong) : Trainers ignore the RunConfig specified via + # Tuner below. So to specify proper cloud paths and CheckpointConfig, + # we must pass another dummy RunConfig here. + # TODO(jungong) : this is extremely awkward. Refactor and clean up. + run_config=RunConfig( + storage_path=mock_s3_bucket_uri, + checkpoint_config=CheckpointConfig( + num_to_keep=3, + checkpoint_frequency=3, + _checkpoint_keep_all_ranks=True, + _checkpoint_upload_from_workers=True, + ), + ), + ) + + tuner = tune.Tuner( + trainer, + run_config=RunConfig( + name=exp_name, + storage_path=mock_s3_bucket_uri, + checkpoint_config=CheckpointConfig( + num_to_keep=3, + ), + ), + tune_config=tune.TuneConfig( + # Only running 1 trial. + trial_dirname_creator=lambda t: "trial_0" + ), + ) + result_grid = tuner.fit() + # Run was successful. 
+ assert not result_grid.errors + # Make sure checkpoint is backed by the full s3 checkpoint uri. + assert result_grid[0].checkpoint.uri.startswith("s3://") + + # Download remote dir locally to do some sanity checks + download_dir = os.path.join(tmp_path, "download") + + shutil.rmtree(download_dir, ignore_errors=True) + download_from_uri(uri=mock_s3_bucket_uri, local_path=str(download_dir)) + + # Step 0 checkpoint is deleted. + _check_dir_content("checkpoint_000000", exist=False) + _check_dir_content("checkpoint_000001") # Step 3 + _check_dir_content("checkpoint_000002") # Step 6 + _check_dir_content("checkpoint_000003") # Step 9 + + if __name__ == "__main__": import sys diff --git a/python/ray/tune/tests/test_trainable_util.py b/python/ray/tune/tests/test_trainable_util.py index f077938f1d75..65cf745ff998 100644 --- a/python/ray/tune/tests/test_trainable_util.py +++ b/python/ray/tune/tests/test_trainable_util.py @@ -10,7 +10,6 @@ import ray import ray._private.utils -import ray.cloudpickle as cloudpickle from ray.tune.utils.util import wait_for_gpu from ray.tune.utils.util import flatten_dict, unflatten_dict, unflatten_list_dict from ray.tune.trainable.util import TrainableUtil @@ -74,24 +73,6 @@ def testFindCheckpointDir(self): parent = os.path.dirname(found_dir) TrainableUtil.find_checkpoint_dir(parent) - def testPickleCheckpoint(self): - for i in range(5): - path = os.path.join(self.checkpoint_dir, str(i)) - with open(path, "w") as f: - f.write(str(i)) - - checkpoint_path = os.path.join(self.checkpoint_dir, "0") - - data_dict = TrainableUtil.pickle_checkpoint(checkpoint_path) - loaded = cloudpickle.loads(data_dict) - - checkpoint_name = os.path.basename(checkpoint_path) - self.assertEqual(loaded["checkpoint_name"], checkpoint_name) - - for i in range(5): - path = os.path.join(self.checkpoint_dir, str(i)) - self.assertEqual(loaded["data"][str(i)], open(path, "rb").read()) - class FlattenDictTest(unittest.TestCase): def test_output_type(self): diff --git 
a/python/ray/tune/tests/test_trial_relative_logdir.py b/python/ray/tune/tests/test_trial_relative_logdir.py index d6bf194ddb02..dc1425548ddd 100644 --- a/python/ray/tune/tests/test_trial_relative_logdir.py +++ b/python/ray/tune/tests/test_trial_relative_logdir.py @@ -310,5 +310,17 @@ def test_change_trial_local_dir(tmpdir): assert trial.get_trial_checkpoints()[0].dir_or_data.startswith(new_local_dir) +def test_trial_logdir_length(tmpdir): + """Test that trial local paths with a long logdir are truncated""" + trial = Trial( + trainable_name="none", + experiment_path=str(tmpdir), + stub=True, + config={"a" * 50: 5.0 / 7, "b" * 50: "long" * 40}, + ) + trial.init_local_path() + assert len(os.path.basename(trial.local_path)) < 200 + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/tests/test_trial_runner.py b/python/ray/tune/tests/test_trial_runner.py index 02ccad99f58e..92a9e35ba593 100644 --- a/python/ray/tune/tests/test_trial_runner.py +++ b/python/ray/tune/tests/test_trial_runner.py @@ -7,7 +7,7 @@ from ray.rllib import _register_all from ray import tune -from ray.tune import TuneError, register_trainable +from ray.tune import TuneError from ray.tune.execution.ray_trial_executor import RayTrialExecutor from ray.tune.schedulers import TrialScheduler, FIFOScheduler from ray.tune.search import BasicVariantGenerator @@ -27,36 +27,6 @@ def setUp(self): def tearDown(self): ray.shutdown() - def testExperimentTagTruncation(self): - ray.init(num_cpus=2) - - def train(config, reporter): - reporter(timesteps_total=1) - - trial_executor = RayTrialExecutor(resource_manager=self._resourceManager()) - register_trainable("f1", train) - - experiments = { - "foo": { - "run": "f1", - "config": { - "a" * 50: tune.sample_from(lambda spec: 5.0 / 7), - "b" * 50: tune.sample_from(lambda spec: "long" * 40), - }, - } - } - - for name, spec in experiments.items(): - trial_generator = BasicVariantGenerator() - 
trial_generator.add_configurations({name: spec}) - while not trial_generator.is_finished(): - trial = trial_generator.next_trial() - if not trial: - break - trial_executor.start_trial(trial) - self.assertLessEqual(len(os.path.basename(trial.local_path)), 200) - trial_executor.stop_trial(trial) - def testExtraResources(self): ray.init(num_cpus=4, num_gpus=2) snapshot = TrialStatusSnapshot() diff --git a/python/ray/tune/tests/test_trial_runner_3.py b/python/ray/tune/tests/test_trial_runner_3.py index fb5da5758474..35e8575e872b 100644 --- a/python/ray/tune/tests/test_trial_runner_3.py +++ b/python/ray/tune/tests/test_trial_runner_3.py @@ -1159,7 +1159,7 @@ def testPeriodicCloudCheckpointSyncTimeout(self): assert syncer.sync_up_counter == 2 def testExperimentCheckpointWithDatasets(self): - """Test trial runner checkpointing where trials contain Ray Datasets. + """Test trial runner checkpointing where trials contain Datasets. When possible, a dataset plan should be saved (for read_* APIs). See `Dataset.serialize_lineage` for more information. diff --git a/python/ray/tune/tests/test_trial_scheduler.py b/python/ray/tune/tests/test_trial_scheduler.py index 102b668277d3..26fb21becc67 100644 --- a/python/ray/tune/tests/test_trial_scheduler.py +++ b/python/ray/tune/tests/test_trial_scheduler.py @@ -807,10 +807,6 @@ def result(score, ts): [t.status for t in trials], [Trial.PAUSED, Trial.PENDING, Trial.PAUSED] ) - @pytest.mark.skipif( - os.environ.get("TUNE_NEW_EXECUTION") == "1", - reason="BOHB does not currently work with the new execution backend.", - ) def testNonstopBOHB(self): from ray.tune.search.bohb import TuneBOHB @@ -1903,7 +1899,7 @@ def testFastPerturb(self): shutil.rmtree(tmpdir) @pytest.mark.skipif( - os.environ.get("TUNE_NEW_EXECUTION") == "1", + os.environ.get("TUNE_NEW_EXECUTION") != "0", reason=( "This test is generally flaky: The print after writing `Cleanup` " "to the file is printed, but the data is not always written. 
" diff --git a/python/ray/tune/tests/test_tune_restore.py b/python/ray/tune/tests/test_tune_restore.py index e97162db3f4c..96295af8dc93 100644 --- a/python/ray/tune/tests/test_tune_restore.py +++ b/python/ray/tune/tests/test_tune_restore.py @@ -17,6 +17,7 @@ import ray from ray import tune from ray._private.test_utils import recursive_fnmatch, run_string_as_driver +from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage from ray.exceptions import RayTaskError from ray.rllib import _register_all from ray.tune import TuneError @@ -643,6 +644,27 @@ def training_func(config): tune.run(training_func) +@pytest.mark.parametrize( + "trial_config", [{}, {"attr": 4}, {"nested": {"key": "value"}}] +) +def test_trial_last_result_restore(trial_config): + metrics = {"metric1": 4, "nested2": {"metric3": 6}} + metrics["config"] = trial_config + + trial = Trial(trainable_name="stub", config=trial_config, stub=True) + trial.update_last_result(metrics) + + checkpoint = _TrackedCheckpoint( + dir_or_data="no_data", + storage_mode=CheckpointStorage.PERSISTENT, + metrics=metrics, + ) + + trial.restoring_from = checkpoint + trial.on_restore() + assert trial.last_result == metrics + + def test_stacktrace(): """Test proper stacktrace is printed for RayTaskError.""" CMD = """ diff --git a/python/ray/tune/tests/test_tune_restore_warm_start.py b/python/ray/tune/tests/test_tune_restore_warm_start.py index 9f9a402e1791..dcaeb8464ec3 100644 --- a/python/ray/tune/tests/test_tune_restore_warm_start.py +++ b/python/ray/tune/tests/test_tune_restore_warm_start.py @@ -470,10 +470,6 @@ def cost(space, reporter): return search_alg, cost -@pytest.mark.skipif( - os.environ.get("TUNE_NEW_EXECUTION") == "1", - reason="BOHB does not currently work with the new execution backend.", -) class BOHBWarmStartTest(AbstractWarmStartTest, unittest.TestCase): def set_basic_conf(self): space = {"width": tune.uniform(0, 20), "height": tune.uniform(-100, 100)} diff --git 
a/python/ray/tune/tests/test_tuner_restore.py b/python/ray/tune/tests/test_tuner_restore.py index 034dfe916c05..fcc9fb566618 100644 --- a/python/ray/tune/tests/test_tuner_restore.py +++ b/python/ray/tune/tests/test_tuner_restore.py @@ -1,4 +1,5 @@ import json +import logging import os from pathlib import Path import shutil @@ -8,6 +9,7 @@ import pytest import ray +import ray.cloudpickle as ray_pickle from ray import tune from ray.air import ( Checkpoint, @@ -416,7 +418,6 @@ def _test_tuner_restore_from_cloud(tmpdir, configure_storage_path, storage_path) remote_contents = os.listdir(check_path / "exp_dir") assert "tuner.pkl" in remote_contents - assert "trainable.pkl" in remote_contents prev_cp = _find_newest_experiment_checkpoint(str(check_path / "exp_dir")) prev_lstat = os.lstat(prev_cp) @@ -429,7 +430,6 @@ def _test_tuner_restore_from_cloud(tmpdir, configure_storage_path, storage_path) assert results[0].metrics["_metric"] == 1 local_contents = os.listdir(tmpdir / "ray_results" / "exp_dir") assert "tuner.pkl" in local_contents - assert "trainable.pkl" in local_contents after_cp = _find_newest_experiment_checkpoint( str(tmpdir / "ray_results" / "exp_dir") @@ -595,7 +595,7 @@ def load_checkpoint(self, checkpoint_path): assert result.metrics["score"] == 2 -def test_restore_overwrite_trainable(ray_start_2_cpus, tmpdir, caplog): +def test_restore_overwrite_trainable(ray_start_2_cpus, tmpdir): """Test validation for trainable compatibility, when re-specifying a trainable on restore.""" @@ -633,7 +633,7 @@ def train_func_2(config): resume_errored=True, ) - # Can still change trainable code, but logs a warning + # Can technically change trainable code (not recommended!) 
def train_func_1(config): checkpoint = session.get_checkpoint() assert checkpoint and checkpoint.to_dict()["data"] == config["data"] @@ -707,8 +707,8 @@ def create_trainable_with_params(): fail_marker.unlink() tuner = Tuner.restore( str(tmp_path / exp_name), - resume_errored=True, trainable=create_trainable_with_params(), + resume_errored=True, ) results = tuner.fit() assert not results.errors @@ -1053,7 +1053,40 @@ def test_tuner_can_restore(tmp_path, upload_dir): assert not Tuner.can_restore(tmp_path / "new_exp") -def testParamSpaceOverwrite(tmp_path, monkeypatch): +def testParamSpaceOverwriteValidation(ray_start_4_cpus, tmp_path): + """Check that validation on restore fails if we try adding or removing + hyperparameters to the param_space.""" + name = "test_param_space_valid" + param_space = {"a": 1, "b": {"c": tune.choice([0, 1])}, "d": tune.uniform(0, 1)} + tuner = Tuner( + _train_fn_sometimes_failing, + param_space=param_space, + run_config=RunConfig(storage_path=str(tmp_path), name=name), + ) + tuner.fit() + + bad_param_spaces = [ + {}, + {"a": 1, "b": {}, "d": 2}, + {"a": 1, "b": {"c": 2, "e": 3}, "d": 4}, + ] + for bad_param_space in bad_param_spaces: + with pytest.raises(ValueError): + Tuner.restore( + str(tmp_path / name), + trainable=_train_fn_sometimes_failing, + param_space=bad_param_space, + ) + + # Should work with the original param space + Tuner.restore( + str(tmp_path / name), + trainable=_train_fn_sometimes_failing, + param_space=param_space, + ) + + +def testParamSpaceOverwrite(ray_start_4_cpus, tmp_path, monkeypatch): """Test that overwriting param space on restore propagates new refs to existing trials and newly generated trials.""" @@ -1133,6 +1166,18 @@ def train_fn(config): assert r.config["test2"].name in ["11", "12", "13", "14"] +def test_tuner_pkl_backwards_compatibility(tmp_path, caplog): + tuner_internal = Tuner( + _train_fn_sometimes_failing, param_space={"a": 1} + )._local_tuner + with open(tmp_path / "tuner.pkl", "wb") as f: + 
ray_pickle.dump(tuner_internal, f) + + with caplog.at_level(logging.WARNING, "ray.tune.impl.tuner_internal"): + tuner_internal._load_tuner_state(tmp_path / "tuner.pkl") + assert "run with an older version of Ray" in caplog.text + + if __name__ == "__main__": import sys diff --git a/python/ray/tune/tests/tutorial.py b/python/ray/tune/tests/tutorial.py index ceb48366334f..d3b217ca7f8f 100644 --- a/python/ray/tune/tests/tutorial.py +++ b/python/ray/tune/tests/tutorial.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from ray import air, tune +from ray.air import session from ray.tune.schedulers import ASHAScheduler # __tutorial_imports_end__ # fmt: on @@ -105,7 +106,7 @@ def train_mnist(config): acc = test(model, test_loader) # Send the current training result back to Tune - tune.report(mean_accuracy=acc) + session.report({"mean_accuracy": acc}) if i % 5 == 0: # This saves the model to the trial directory diff --git a/python/ray/tune/tests/utils/experiment.py b/python/ray/tune/tests/utils/experiment.py index 754dd6cd2274..8237beb9cb78 100644 --- a/python/ray/tune/tests/utils/experiment.py +++ b/python/ray/tune/tests/utils/experiment.py @@ -1,14 +1,15 @@ import os import tempfile from contextlib import contextmanager -from functools import partial from pathlib import Path from typing import Any, Dict, Optional, Type from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage +from ray.tune import SyncConfig +from ray.tune.callback import CallbackList from ray.tune.execution.trial_runner import TrialRunner, _TuneControllerBase from ray.tune.experiment import Trial -from ray.tune.trainable import TrainableUtil +from ray.tune.utils.callback import _create_default_callbacks class _ExperimentCheckpointCreator: @@ -30,6 +31,11 @@ def _get_trial_checkpoints(self): experiment_path=experiment_path, experiment_dir_name=experiment_name ) + # Also, create any default logger callback artifacts. 
+ self.callbacks = CallbackList( + _create_default_callbacks([], sync_config=SyncConfig(syncer=None)) + ) + def save_checkpoint(self): self.runner.save_to_dir() @@ -44,6 +50,12 @@ def get_trial_checkpoints(self): def trial_result(self, trial: Trial, result: Dict): trial.update_last_result(result) trial.invalidate_json_state() + self.callbacks.on_trial_result( + iteration=-1, # Dummy value + trials=self.get_trials(), + trial=trial, + result=result, + ) def trial_checkpoint( self, @@ -56,11 +68,6 @@ def trial_checkpoint( dir_or_data=checkpoint_data, storage_mode=checkpoint_storage, metrics=trial.last_result, - local_to_remote_path_fn=partial( - TrainableUtil.get_remote_storage_path, - logdir=trial.local_path, - remote_checkpoint_dir=trial.remote_path, - ), ) trial.on_checkpoint(checkpoint) trial.invalidate_json_state() @@ -83,6 +90,11 @@ def create_trial( ) trial.init_local_path() self.runner.add_trial(trial) + self.callbacks.on_trial_start( + iteration=-1, # Dummy value + trials=self.get_trials(), + trial=trial, + ) return trial diff --git a/python/ray/tune/trainable/trainable.py b/python/ray/tune/trainable/trainable.py index f13e22ba6afe..63a5d997f4f4 100644 --- a/python/ray/tune/trainable/trainable.py +++ b/python/ray/tune/trainable/trainable.py @@ -9,7 +9,7 @@ import tempfile import time from contextlib import redirect_stderr, redirect_stdout -from typing import Any, Callable, Dict, List, Optional, Union, Type, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, Type import warnings import ray @@ -108,10 +108,10 @@ class Trainable: def __init__( self, config: Dict[str, Any] = None, - logger_creator: Callable[[Dict[str, Any]], "Logger"] = None, + logger_creator: Callable[[Dict[str, Any]], "Logger"] = None, # Deprecated (2.7) remote_checkpoint_dir: Optional[str] = None, - custom_syncer: Optional[Syncer] = None, # Deprecated - sync_timeout: Optional[int] = None, # Deprecated + custom_syncer: Optional[Syncer] = None, # 
Deprecated (2.6) + sync_timeout: Optional[int] = None, # Deprecated (2.6) sync_config: Optional[SyncConfig] = None, ): """Initialize a Trainable. @@ -125,7 +125,7 @@ def __init__( Args: config: Trainable-specific configuration data. By default will be saved as ``self.config``. - logger_creator: Function that creates a ray.tune.Logger + logger_creator: (Deprecated) Function that creates a ray.tune.Logger object. If unspecified, a default logger is created. remote_checkpoint_dir: Upload directory (S3 or GS path). This is **per trial** directory, @@ -140,6 +140,7 @@ def __init__( if self.is_actor(): disable_ipython() + # TODO(ml-team): Remove `logger_creator` in 2.7. self._result_logger = self._logdir = None self._create_logger(self.config, logger_creator) @@ -219,7 +220,9 @@ def _remote_storage_path(self, local_path): """Converts a `local_path` to be based off of `self.remote_checkpoint_dir`.""" return TrainableUtil.get_remote_storage_path( - local_path, self.logdir, self.remote_checkpoint_dir + local_path=local_path, + local_path_prefix=self.logdir, + remote_path_prefix=self.remote_checkpoint_dir, ) @classmethod diff --git a/python/ray/tune/trainable/util.py b/python/ray/tune/trainable/util.py index 6366338bd821..2adbb6bf3dc9 100644 --- a/python/ray/tune/trainable/util.py +++ b/python/ray/tune/trainable/util.py @@ -18,7 +18,6 @@ from ray.air.config import ScalingConfig from ray.tune.registry import _ParameterRegistry from ray.tune.utils import _detect_checkpoint_function -from ray.util import placement_group from ray.util.annotations import DeveloperAPI, PublicAPI if TYPE_CHECKING: @@ -42,27 +41,6 @@ def load_metadata(checkpoint_dir: str) -> Dict: with open(os.path.join(checkpoint_dir, _TUNE_METADATA_FILENAME), "rb") as f: return pickle.load(f) - @staticmethod - def pickle_checkpoint(checkpoint_path: str): - """Pickles checkpoint data.""" - checkpoint_dir = TrainableUtil.find_checkpoint_dir(checkpoint_path) - data = {} - for basedir, _, file_names in 
os.walk(checkpoint_dir): - for file_name in file_names: - path = os.path.join(basedir, file_name) - with open(path, "rb") as f: - data[os.path.relpath(path, checkpoint_dir)] = f.read() - # Use normpath so that a directory path isn't mapped to empty string. - name = os.path.relpath(os.path.normpath(checkpoint_path), checkpoint_dir) - name += os.path.sep if os.path.isdir(checkpoint_path) else "" - data_dict = pickle.dumps( - { - "checkpoint_name": name, - "data": data, - } - ) - return data_dict - @staticmethod def find_checkpoint_dir(checkpoint_path): """Returns the directory containing the checkpoint path. @@ -102,6 +80,14 @@ def find_rel_checkpoint_dir(logdir, checkpoint_path): tokens = rel_path.split(os.sep) return os.path.join(tokens[0]) + @staticmethod + def _make_checkpoint_dir_name(index: Union[int, str]): + """Get the name of the checkpoint directory suffix.""" + suffix = "checkpoint" + if index is not None: + suffix += f"_{index:06d}" if isinstance(index, int) else f"_{index}" + return suffix + @staticmethod def make_checkpoint_dir( checkpoint_dir: str, index: Union[int, str], override: bool = False @@ -115,9 +101,7 @@ def make_checkpoint_dir( override: Deletes checkpoint_dir before creating a new one. """ - suffix = "checkpoint" - if index is not None: - suffix += f"_{index:06d}" if isinstance(index, int) else f"_{index}" + suffix = TrainableUtil._make_checkpoint_dir_name(index) checkpoint_dir = os.path.join(checkpoint_dir, suffix) if override and os.path.exists(checkpoint_dir): @@ -201,65 +185,21 @@ def get_checkpoints_paths(logdir): @staticmethod def get_remote_storage_path( - local_path: str, logdir: str, remote_checkpoint_dir: str + local_path: str, local_path_prefix: str, remote_path_prefix: str ) -> str: """Converts a ``local_path`` to be based off of - ``remote_checkpoint_dir`` instead of ``logdir``. + ``remote_path_prefix`` instead of ``local_path_prefix``. 
- ``logdir`` is assumed to be a prefix of ``local_path``.""" - rel_local_path = os.path.relpath(local_path, logdir) - uri = URI(remote_checkpoint_dir) - return str(uri / rel_local_path) + ``local_path_prefix`` is assumed to be a prefix of ``local_path``. + Example: -@DeveloperAPI -class PlacementGroupUtil: - @staticmethod - def get_remote_worker_options( - num_workers: int, - num_cpus_per_worker: int, - num_gpus_per_worker: int, - num_workers_per_host: Optional[int], - timeout_s: Optional[int], - ) -> (Dict[str, Any], placement_group): - """Returns the option for remote workers. - - Args: - num_workers: Number of training workers to include in - world. - num_cpus_per_worker: Number of CPU resources to reserve - per training worker. - num_gpus_per_worker: Number of GPU resources to reserve - per training worker. - num_workers_per_host: Optional[int]: Number of workers to - colocate per host. - timeout_s: Seconds before the torch process group - times out. Useful when machines are unreliable. Defaults - to 60 seconds. This value is also reused for triggering - placement timeouts if forcing colocation. - - - Returns: - type: option that contains CPU/GPU count of - the remote worker and the placement group information. 
- pg: return a reference to the placement group + >>> TrainableUtil.get_remote_storage_path("/a/b/c", "/a", "s3://bucket/") + 's3://bucket/b/c' """ - pg = None - options = dict(num_cpus=num_cpus_per_worker, num_gpus=num_gpus_per_worker) - if num_workers_per_host: - num_hosts = int(num_workers / num_workers_per_host) - cpus_per_node = num_cpus_per_worker * num_workers_per_host - gpus_per_node = num_gpus_per_worker * num_workers_per_host - bundle = {"CPU": cpus_per_node, "GPU": gpus_per_node} - - all_bundles = [bundle] * num_hosts - pg = placement_group(all_bundles, strategy="STRICT_SPREAD") - logger.debug("Waiting for placement_group to start.") - ray.get(pg.ready(), timeout=timeout_s) - logger.debug("Placement_group started.") - options["placement_group"] = pg - - return options, pg + rel_local_path = os.path.relpath(local_path, local_path_prefix) + uri = URI(remote_path_prefix) + return str(uri / rel_local_path) @PublicAPI(stability="beta") @@ -332,35 +272,6 @@ def step(self): tune.with_parameters(MyTrainable, data=data), # ... ) - - .. note:: - When restoring a Tune experiment, you need to re-specify the trainable - wrapped with ``tune.with_parameters``. - The reasoning behind this is as follows: - - 1. ``tune.with_parameters`` stores parameters in the object store and - attaches object references to the trainable, but the objects they point to - may not exist anymore upon restoring in a new Ray cluster. - - 2. The attached objects could be arbitrarily large, so Tune does not save the - object data along with the trainable. - - To restore, Tune allows the trainable to be re-specified in - :meth:`Tuner.restore(path, trainable=...) `. - Continuing from the previous examples, here's an example of restoration: - - .. code-block:: python - - from ray.tune import Tuner - - data = HugeDataset(download=True) - - tuner = Tuner.restore( - "/path/to/experiment/", - trainable=tune.with_parameters(MyTrainable, data=data), - # ... 
- ) - """ from ray.tune.trainable import Trainable @@ -431,9 +342,6 @@ def _inner(config): trainable_with_params._resources = trainable._resources trainable_with_params.__name__ = trainable_name - - # Mark this trainable as being wrapped by saving the attached parameter names - trainable_with_params._attached_param_names = keys return trainable_with_params diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 48c7da2a2aa8..877d954439ac 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -4,7 +4,6 @@ import datetime import logging import os -from pathlib import Path import signal import sys import threading @@ -17,6 +16,7 @@ Mapping, Optional, Sequence, + Tuple, Type, Union, TYPE_CHECKING, @@ -25,6 +25,7 @@ import ray from ray._private.storage import _get_storage_uri from ray.air import CheckpointConfig +from ray.air._internal import usage as air_usage from ray.air.util.node import _force_on_current_node from ray.tune.analysis import ExperimentAnalysis from ray.tune.callback import Callback @@ -35,9 +36,11 @@ get_air_verbosity, _detect_reporter as _detect_air_reporter, IS_NOTEBOOK, + AirVerbosity, ) from ray.tune.impl.placeholder import create_resolvers_map, inject_placeholders +from ray.tune.logger import TBXLoggerCallback from ray.tune.progress_reporter import ( ProgressReporter, _detect_reporter, @@ -192,7 +195,7 @@ def signal_interrupt_tune_run(sig: int, frame): "to skip. " ) experiment_interrupted_event.set() - # Restore original signal handler to react to future SIGINT signals + # Restore original signal handler to react to future SIGINT signals. signal.signal(signal.SIGINT, original_handler) # We should only install the handler when it is safe to do so. 
@@ -226,6 +229,59 @@ def _ray_auto_init(entrypoint: str): ) +def _resolve_and_validate_storage_path( + storage_path: str, local_dir: Optional[str], sync_config: Optional[SyncConfig] +) -> Tuple[str, str, Optional[str], SyncConfig]: + # TODO(ml-team): Simplify/remove this in 2.6 when `local_dir` + # and `SyncConfig(upload_dir)` are hard-deprecated. + sync_config = sync_config or SyncConfig() + + # Resolve storage_path + local_path, remote_path = _resolve_storage_path( + storage_path, local_dir, sync_config.upload_dir, error_location="tune.run" + ) + + if sync_config.upload_dir: + assert remote_path == sync_config.upload_dir + warnings.warn( + "Setting a `SyncConfig.upload_dir` is deprecated and will be removed " + "in the future. Pass `RunConfig.storage_path` instead." + ) + # Set upload_dir to None to avoid further downstream resolution. + # Copy object first to not alter user input. + sync_config = copy.copy(sync_config) + sync_config.upload_dir = None + + if local_dir: + assert local_path == local_dir + warnings.warn( + "Passing a `local_dir` is deprecated and will be removed " + "in the future. Pass `storage_path` instead or set the" + "`RAY_AIR_LOCAL_CACHE_DIR` environment variable instead." 
+ ) + local_path = local_dir + + if not remote_path: + # If no remote path is set, try to get Ray Storage URI + remote_path = _get_storage_uri() + if remote_path: + logger.info( + "Using configured Ray storage URI as storage path: " f"{remote_path}" + ) + + sync_config.validate_upload_dir(remote_path) + + if not local_path: + local_path = _get_defaults_results_dir() + + storage_path = storage_path or remote_path or local_path + + if storage_path != local_path and local_path: + os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = local_path + + return storage_path, local_path, remote_path, sync_config + + class _Config(abc.ABC): def to_dict(self) -> dict: """Converts this configuration to a dict format.""" @@ -253,7 +309,9 @@ def run( checkpoint_score_attr: Optional[str] = None, checkpoint_freq: int = 0, checkpoint_at_end: bool = False, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, + checkpoint_keep_all_ranks: bool = False, + checkpoint_upload_from_workers: bool = False, + verbose: Optional[Union[int, AirVerbosity, Verbosity]] = None, progress_reporter: Optional[ProgressReporter] = None, log_to_file: bool = False, trial_name_creator: Optional[Callable[[Trial], str]] = None, @@ -278,7 +336,10 @@ def run( _remote: Optional[bool] = None, # Passed by the Tuner. _remote_string_queue: Optional[Queue] = None, + # Todo (krfricke): Find a better way to pass entrypoint information, e.g. + # a context object or similar. _tuner_api: bool = False, + _trainer_api: bool = False, ) -> ExperimentAnalysis: """Executes training. @@ -384,9 +445,16 @@ def run( checkpoint_at_end: Whether to checkpoint at the end of the experiment regardless of the checkpoint_freq. Default is False. This has no effect when using the Functional Training API. + checkpoint_keep_all_ranks: Whether to save checkpoints from all ranked + training workers. + checkpoint_upload_from_workers: Whether to upload checkpoint files + directly from distributed training workers. verbose: 0, 1, 2, or 3. Verbosity mode. 
- 0 = silent, 1 = only status updates, 2 = status and brief trial - results, 3 = status and detailed trial results. Defaults to 3. + 0 = silent, 1 = only status updates, 2 = status and brief + results, 3 = status and detailed results. Defaults to 3. + If the ``RAY_AIR_NEW_OUTPUT=1`` environment variable is set, + uses the new context-aware verbosity settings: + 0 = silent, 1 = default, 2 = verbose. progress_reporter: Progress reporter for reporting intermediate experiment progress. Defaults to CLIReporter if running in command-line, or JupyterNotebookReporter if running in @@ -487,19 +555,24 @@ class and registered trainables. remote_run_kwargs = locals().copy() remote_run_kwargs.pop("_remote") - error_message_map = ( - { + if _tuner_api and _trainer_api: + error_message_map = { + "entrypoint": "Trainer(...)", + "search_space_arg": "param_space", + "restore_entrypoint": 'Trainer.restore(path="{path}", ...)', + } + elif _tuner_api and not _trainer_api: + error_message_map = { "entrypoint": "Tuner(...)", "search_space_arg": "param_space", "restore_entrypoint": 'Tuner.restore(path="{path}", trainable=...)', } - if _tuner_api - else { + else: + error_message_map = { "entrypoint": "tune.run(...)", "search_space_arg": "config", "restore_entrypoint": "tune.run(..., resume=True)", } - ) _ray_auto_init(entrypoint=error_message_map["entrypoint"]) if _remote is None: @@ -514,11 +587,18 @@ class and registered trainables. DeprecationWarning, ) + if verbose is None: + # Default `verbose` value. For new output engine, this is AirVerbosity.VERBOSE. + # For old output engine, this is Verbosity.V3_TRIAL_DETAILS + verbose = get_air_verbosity(AirVerbosity.VERBOSE) or Verbosity.V3_TRIAL_DETAILS + if _remote: - if get_air_verbosity() is not None: - logger.warning( - "Ignoring AIR_VERBOSITY setting, " - "as it doesn't support ray client mode yet." 
+ if get_air_verbosity(verbose) is not None: + logger.info( + "[output] This uses the legacy output and progress reporter, " + "as Ray client is not supported by the new engine. " + "For more information, see " + "https://docs.ray.io/en/master/ray-air/experimental-features.html" ) remote_run = ray.remote(num_cpus=0)(run) @@ -557,6 +637,12 @@ class and registered trainables. ray._private.usage.usage_lib.record_library_usage("tune") + # Track environment variable usage here will also catch: + # 1.) Tuner.fit() usage + # 2.) Trainer.fit() usage + # 3.) Ray client usage (env variables are inherited by the Ray runtime env) + air_usage.tag_ray_air_env_vars() + all_start = time.time() if mode and mode not in ["min", "max"]: @@ -565,21 +651,28 @@ class and registered trainables. "must be one of ['min', 'max']" ) - air_verbosity = get_air_verbosity() + air_verbosity = get_air_verbosity(verbose) if air_verbosity is not None and IS_NOTEBOOK: - logger.warning( - "Ignoring AIR_VERBOSITY setting, " - "as it doesn't support JupyterNotebook mode yet." + logger.info( + "[output] This uses the legacy output and progress reporter, " + "as Jupyter notebooks are not supported by the new engine, yet. " + "For more information, please see " + "https://docs.ray.io/en/master/ray-air/experimental-features.html" ) air_verbosity = None if air_verbosity is not None: - logger.warning( - f"Testing new AIR console output flow with verbosity={air_verbosity}. " - f"This will also disable the old flow - setting it to 0 now." + logger.info( + f"[output] This will use the new output engine with verbosity " + f"{air_verbosity}. To disable the new output and use the legacy " + f"output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. 
" + f"For more information, please see " + f"https://docs.ray.io/en/master/ray-air/experimental-features.html" ) + # Disable old output engine set_verbosity(0) else: + # Use old output engine set_verbosity(verbose) config = config or {} @@ -592,50 +685,18 @@ class and registered trainables. f"Got '{type(config)}' instead." ) - sync_config = sync_config or SyncConfig() - - # Resolve storage_path - local_path, remote_path = _resolve_storage_path( - storage_path, local_dir, sync_config.upload_dir, error_location="tune.run" + ( + storage_path, + local_path, + remote_path, + sync_config, + ) = _resolve_and_validate_storage_path( + storage_path=storage_path, local_dir=local_dir, sync_config=sync_config ) - if sync_config.upload_dir: - assert remote_path == sync_config.upload_dir - warnings.warn( - "Setting a `SyncConfig.upload_dir` is deprecated and will be removed " - "in the future. Pass `RunConfig.storage_path` instead." - ) - # Set upload_dir to None to avoid further downstream resolution. - # Copy object first to not alter user input. - sync_config = copy.copy(sync_config) - sync_config.upload_dir = None - - if local_dir: - assert local_path == local_dir - warnings.warn( - "Passing a `local_dir` is deprecated and will be removed " - "in the future. Pass `storage_path` instead or set the" - "`RAY_AIR_LOCAL_CACHE_DIR` environment variable instead." 
- ) - local_path = local_dir - - if not remote_path: - # If no remote path is set, try to get Ray Storage URI - remote_path = _get_storage_uri() - if remote_path: - logger.info( - "Using configured Ray storage URI as storage path: " f"{remote_path}" - ) - - sync_config.validate_upload_dir(remote_path) - - if not local_path: - local_path = _get_defaults_results_dir() - - storage_path = storage_path or remote_path or local_path - - if storage_path != local_path and local_path: - os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = local_path + air_usage.tag_ray_air_storage_config( + local_path=local_path, remote_path=remote_path, sync_config=sync_config + ) checkpoint_score_attr = checkpoint_score_attr or "" if checkpoint_score_attr.startswith("min-"): @@ -650,6 +711,8 @@ class and registered trainables. checkpoint_score_order=checkpoint_score_order, checkpoint_frequency=checkpoint_freq, checkpoint_at_end=checkpoint_at_end, + _checkpoint_keep_all_ranks=checkpoint_keep_all_ranks, + _checkpoint_upload_from_workers=checkpoint_upload_from_workers, ) if num_samples == -1: @@ -847,7 +910,11 @@ class and registered trainables. progress_metrics = _detect_progress_metrics(_get_trainable(run_or_experiment)) - # Create syncer callbacks + # NOTE: Report callback telemetry before populating the list with default callbacks. + # This tracks user-specified callback usage. + air_usage.tag_callbacks(callbacks) + + # Create default logging + syncer callbacks callbacks = _create_default_callbacks( callbacks, sync_config=sync_config, @@ -908,9 +975,10 @@ class and registered trainables. callbacks=callbacks, metric=metric, trial_checkpoint_config=experiments[0].checkpoint_config, + _trainer_api=_trainer_api, ) - if bool(int(os.environ.get("TUNE_NEW_EXECUTION", "0"))): + if bool(int(os.environ.get("TUNE_NEW_EXECUTION", "1"))): trial_runner_cls = TuneController runner_kwargs.pop("trial_executor") runner_kwargs["reuse_actors"] = reuse_actors @@ -923,6 +991,9 @@ class and registered trainables. 
if not runner.resumed: for exp in experiments: search_alg.add_configurations([exp]) + # search_alg.total_samples has been updated, so we should + # update the number of pending trials + runner.update_max_pending_trials() else: logger.debug( "You have resumed the Tune run, which means that any newly specified " @@ -949,19 +1020,37 @@ class and registered trainables. ) else: air_progress_reporter = _detect_air_reporter( - air_verbosity, search_alg.total_samples, metric=metric, mode=mode + air_verbosity, + search_alg.total_samples, + metric=metric, + mode=mode, + config=config, ) - # rich live context manager has to be called encapsulting + # rich live context manager has to be called encapsulating # the while loop. For other kind of reporters, no op. # `ExitStack` allows us to *conditionally* apply context manager. with contextlib.ExitStack() as stack: from ray.tune.experimental.output import TuneRichReporter + if any(isinstance(cb, TBXLoggerCallback) for cb in callbacks): + tensorboard_path = runner._local_experiment_path + else: + tensorboard_path = None + if air_progress_reporter and isinstance( air_progress_reporter, TuneRichReporter ): stack.enter_context(air_progress_reporter.with_live()) + elif air_progress_reporter: + air_progress_reporter.experiment_started( + experiment_name=runner._experiment_dir_name, + experiment_path=runner.experiment_path, + searcher_str=search_alg.__class__.__name__, + scheduler_str=scheduler.__class__.__name__, + total_num_samples=search_alg.total_samples, + tensorboard_path=tensorboard_path, + ) try: while ( @@ -1015,12 +1104,18 @@ class and registered trainables. 
if experiment_interrupted_event.is_set(): restore_entrypoint = error_message_map["restore_entrypoint"].format( - path=Path(experiment_checkpoint).parent, - ) - logger.warning( - "Experiment has been interrupted, but the most recent state was saved.\n" - f"Continue running this experiment with: {restore_entrypoint}" + path=runner.experiment_path, ) + if _trainer_api: + logger.warning( + f"Training has been interrupted, but the most recent state was saved.\n" + f"Resume training with: {restore_entrypoint}" + ) + else: + logger.warning( + f"Experiment has been interrupted, but the most recent state was " + f"saved.\nResume experiment with: {restore_entrypoint}" + ) ea = ExperimentAnalysis( experiment_checkpoint, trials=all_trials, @@ -1038,7 +1133,7 @@ def run_experiments( experiments: Union[Experiment, Mapping, Sequence[Union[Experiment, Mapping]]], scheduler: Optional[TrialScheduler] = None, server_port: Optional[int] = None, - verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, + verbose: Optional[Union[int, AirVerbosity, Verbosity]] = None, progress_reporter: Optional[ProgressReporter] = None, resume: Union[bool, str] = False, reuse_actors: Optional[bool] = None, @@ -1072,11 +1167,18 @@ def run_experiments( if not trial_executor or isinstance(trial_executor, RayTrialExecutor): _ray_auto_init(entrypoint="tune.run_experiments(...)") + if verbose is None: + # Default `verbose` value. For new output engine, this is AirVerbosity.VERBOSE. + # For old output engine, this is Verbosity.V3_TRIAL_DETAILS + verbose = get_air_verbosity(AirVerbosity.VERBOSE) or Verbosity.V3_TRIAL_DETAILS + if _remote: - if get_air_verbosity() is not None: - logger.warning( - "Ignoring AIR_VERBOSITY setting, " - "as it doesn't support ray client mode yet." + if get_air_verbosity(verbose) is not None: + logger.info( + "[output] This uses the legacy output and progress reporter, " + "as Ray client is not supported by the new engine. 
" + "For more information, see " + "https://docs.ray.io/en/master/ray-air/experimental-features.html" ) remote_run = ray.remote(num_cpus=0)(run_experiments) diff --git a/python/ray/tune/tuner.py b/python/ray/tune/tuner.py index 88ccc109cf1e..08a1cd2de0a9 100644 --- a/python/ray/tune/tuner.py +++ b/python/ray/tune/tuner.py @@ -1,7 +1,6 @@ import logging from pathlib import Path from typing import Any, Callable, Dict, Optional, Type, Union, TYPE_CHECKING -import warnings import ray @@ -146,15 +145,20 @@ def __init__( # TODO(xwjiang): Remove this later. _tuner_kwargs: Optional[Dict] = None, _tuner_internal: Optional[TunerInternal] = None, + _trainer_api: bool = False, ): """Configure and construct a tune run.""" kwargs = locals().copy() self._is_ray_client = ray.util.client.ray.is_connected() - if self._is_ray_client and get_air_verbosity() is not None: - logger.warning( - "Ignoring AIR_VERBOSITY setting, " - "as it doesn't support ray client mode yet." - ) + if self._is_ray_client: + _run_config = run_config or RunConfig() + if get_air_verbosity(_run_config.verbose) is not None: + logger.info( + "[output] This uses the legacy output and progress reporter, " + "as Ray client is not supported by the new engine. " + "For more information, see " + "https://docs.ray.io/en/master/ray-air/experimental-features.html" + ) if _tuner_internal: if not self._is_ray_client: @@ -175,16 +179,10 @@ def __init__( def restore( cls, path: str, - trainable: Optional[ - Union[str, Callable, Type[Trainable], "BaseTrainer"] - ] = None, + trainable: Union[str, Callable, Type[Trainable], "BaseTrainer"], resume_unfinished: bool = True, resume_errored: bool = False, restart_errored: bool = False, - # Deprecated - overwrite_trainable: Optional[ - Union[str, Callable, Type[Trainable], "BaseTrainer"] - ] = None, param_space: Optional[Dict[str, Any]] = None, ) -> "Tuner": """Restores Tuner after a previously failed run. 
@@ -215,11 +213,10 @@ def restore( trainable: The trainable to use upon resuming the experiment. This should be the same trainable that was used to initialize the original Tuner. - NOTE: Starting in 2.5, this will be a required parameter. param_space: The same `param_space` that was passed to the original Tuner. This can be optionally re-specified due to the `param_space` potentially containing Ray object - references (tuning over Ray Datasets or tuning over + references (tuning over Datasets or tuning over several `ray.put` object references). **Tune expects the `param_space` to be unmodified**, and the only part that will be used during restore are the updated object references. @@ -230,30 +227,12 @@ def restore( restore from their latest checkpoints. restart_errored: If True, will re-schedule errored trials but force restarting them from scratch (no checkpoint will be loaded). - overwrite_trainable: Deprecated. Use the `trainable` argument instead. """ # TODO(xwjiang): Add some comments to clarify the config behavior across # retored runs. # For example, is callbacks supposed to be automatically applied # when a Tuner is restored and fit again? - if overwrite_trainable: - if not trainable: - trainable = overwrite_trainable - warning_message = ( - "`overwrite_trainable` has been renamed to `trainable`. " - "The old argument will be removed starting from version 2.5." - ) - warnings.warn(warning_message, DeprecationWarning) - - if not trainable: - warning_message = ( - "Passing in the experiment's `trainable` will be a required argument " - "to `Tuner.restore` starting from version 2.5. " - "Please specify the trainable to avoid this warning." 
- ) - warnings.warn(warning_message) - resume_config = _ResumeConfig( resume_unfinished=resume_unfinished, resume_errored=resume_errored, diff --git a/python/ray/tune/utils/callback.py b/python/ray/tune/utils/callback.py index 448084712652..478f559f5ed1 100644 --- a/python/ray/tune/utils/callback.py +++ b/python/ray/tune/utils/callback.py @@ -1,6 +1,6 @@ import logging import os -from typing import List, Optional, Type, Union, TYPE_CHECKING +from typing import Collection, List, Optional, Type, Union, TYPE_CHECKING from ray.tune.callback import Callback, CallbackList @@ -45,8 +45,8 @@ def _create_default_callbacks( sync_config: SyncConfig, air_verbosity: Optional["AirVerbosity"] = None, metric: Optional[str] = None, - progress_metrics: Optional[List[str]] = None, -): + progress_metrics: Optional[Collection[str]] = None, +) -> List[Callback]: """Create default callbacks for `Tuner.fit()`. This function takes a list of existing callbacks and adds default @@ -93,7 +93,9 @@ def _create_default_callbacks( if air_verbosity is not None: # new flow from ray.tune.experimental.output import AirResultCallbackWrapper - callbacks.append(AirResultCallbackWrapper(air_verbosity)) + callbacks.append( + AirResultCallbackWrapper(air_verbosity, metrics=progress_metrics) + ) elif not has_trial_progress_callback: # old flow trial_progress_callback = TrialProgressCallback( metric=metric, progress_metrics=progress_metrics diff --git a/python/ray/tune/utils/log.py b/python/ray/tune/utils/log.py index 44489f591f1d..a4b57e2d8da8 100644 --- a/python/ray/tune/utils/log.py +++ b/python/ray/tune/utils/log.py @@ -1,5 +1,6 @@ +import time from enum import Enum -from typing import Union +from typing import Dict, Tuple, Union from ray.util import PublicAPI from ray.util.annotations import DeveloperAPI @@ -49,3 +50,15 @@ def disable_ipython(): InteractiveShell.clear_instance() except Exception: pass + + +_log_cache_count: Dict[str, Tuple[str, float]] = {} + + +def _dedup_logs(domain: str, value: str, 
repeat_after_s: int = 5) -> bool: + cur_val, ts = _log_cache_count.get(domain, (None, None)) + if value == cur_val and time.monotonic() - repeat_after_s < ts: + return False + else: + _log_cache_count[domain] = value, time.monotonic() + return True diff --git a/python/ray/tune/utils/release_test_util.py b/python/ray/tune/utils/release_test_util.py index db7d848c12aa..5355b3a1b1d8 100644 --- a/python/ray/tune/utils/release_test_util.py +++ b/python/ray/tune/utils/release_test_util.py @@ -79,6 +79,7 @@ def function_trainable(config): checkpoint_iters = config["checkpoint_iters"] checkpoint_size_b = config["checkpoint_size_b"] checkpoint_num_items = checkpoint_size_b // 8 # np.float64 + checkpoint_num_files = config["checkpoint_num_files"] for i in range(num_iters): if ( @@ -87,10 +88,11 @@ def function_trainable(config): and i % checkpoint_iters == 0 ): with tune.checkpoint_dir(step=i) as dir: - checkpoint_file = os.path.join(dir, "bogus.ckpt") - checkpoint_data = np.random.uniform(0, 1, size=checkpoint_num_items) - with open(checkpoint_file, "wb") as fp: - pickle.dump(checkpoint_data, fp) + for i in range(checkpoint_num_files): + checkpoint_file = os.path.join(dir, f"bogus_{i}.ckpt") + checkpoint_data = np.random.uniform(0, 1, size=checkpoint_num_items) + with open(checkpoint_file, "wb") as fp: + pickle.dump(checkpoint_data, fp) tune.report(score=i + score) time.sleep(sleep_time) @@ -104,12 +106,16 @@ def timed_tune_run( max_runtime: int = 300, checkpoint_freq_s: int = -1, checkpoint_size_b: int = 0, + checkpoint_num_files: int = 1, **tune_kwargs, ): durable = ( "storage_path" in tune_kwargs and tune_kwargs["storage_path"] - and tune_kwargs["storage_path"].startswith("s3://") + and ( + tune_kwargs["storage_path"].startswith("s3://") + or tune_kwargs["storage_path"].startswith("gs://") + ) ) sleep_time = 1.0 / results_per_second @@ -124,6 +130,7 @@ def timed_tune_run( "sleep_time": sleep_time, "checkpoint_iters": checkpoint_iters, "checkpoint_size_b": 
checkpoint_size_b, + "checkpoint_num_files": checkpoint_num_files, } print(f"Starting benchmark with config: {config}") @@ -133,38 +140,8 @@ def timed_tune_run( _train = function_trainable - aws_key_id = os.getenv("AWS_ACCESS_KEY_ID", "") - aws_secret = os.getenv("AWS_SECRET_ACCESS_KEY", "") - aws_session = os.getenv("AWS_SESSION_TOKEN", "") - if durable: - - class AwsDurableTrainable(TestDurableTrainable): - AWS_ACCESS_KEY_ID = aws_key_id - AWS_SECRET_ACCESS_KEY = aws_secret - AWS_SESSION_TOKEN = aws_session - - def setup_env(self): - if self.AWS_ACCESS_KEY_ID: - os.environ["AWS_ACCESS_KEY_ID"] = self.AWS_ACCESS_KEY_ID - if self.AWS_SECRET_ACCESS_KEY: - os.environ["AWS_SECRET_ACCESS_KEY"] = self.AWS_SECRET_ACCESS_KEY - if self.AWS_SESSION_TOKEN: - os.environ["AWS_SESSION_TOKEN"] = self.AWS_SESSION_TOKEN - - if all( - os.getenv(k, "") - for k in [ - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", - ] - ): - print("Worker: AWS secrets found in env.") - else: - print("Worker: No AWS secrets found in env!") - - _train = AwsDurableTrainable + _train = TestDurableTrainable run_kwargs["checkpoint_freq"] = checkpoint_iters start_time = time.monotonic() diff --git a/python/ray/util/__init__.py b/python/ray/util/__init__.py index 15c0101c5e80..69388db4ffe9 100644 --- a/python/ray/util/__init__.py +++ b/python/ray/util/__init__.py @@ -2,6 +2,7 @@ import ray from ray._private.client_mode_hook import client_mode_hook +from ray._private.auto_init_hook import wrap_auto_init from ray._private.services import get_node_ip_address from ray.util import iter from ray.util import rpdb as pdb @@ -21,7 +22,8 @@ @PublicAPI(stability="beta") -@client_mode_hook(auto_init=True) +@wrap_auto_init +@client_mode_hook def list_named_actors(all_namespaces: bool = False) -> List[str]: """List all named actors in the system. 
diff --git a/python/ray/util/actor_pool.py b/python/ray/util/actor_pool.py index abfb66a8a1cd..bd8b88f3adde 100644 --- a/python/ray/util/actor_pool.py +++ b/python/ray/util/actor_pool.py @@ -79,8 +79,12 @@ def map(self, fn: Callable[[Any], Any], values: List[Any]): for v in values: self.submit(fn, v) - while self.has_next(): - yield self.get_next() + + def get_generator(): + while self.has_next(): + yield self.get_next() + + return get_generator() def map_unordered(self, fn: Callable[[Any], Any], values: List[Any]): """Similar to map(), but returning an unordered iterator. @@ -116,8 +120,12 @@ def map_unordered(self, fn: Callable[[Any], Any], values: List[Any]): for v in values: self.submit(fn, v) - while self.has_next(): - yield self.get_next_unordered() + + def get_generator(): + while self.has_next(): + yield self.get_next_unordered() + + return get_generator() def submit(self, fn, value): """Schedule a single task to run in the pool. diff --git a/python/ray/util/client/common.py b/python/ray/util/client/common.py index 873bc25ed68b..66825dc2fee1 100644 --- a/python/ray/util/client/common.py +++ b/python/ray/util/client/common.py @@ -79,8 +79,8 @@ CLIENT_SERVER_MAX_THREADS = float(os.getenv("RAY_CLIENT_SERVER_MAX_THREADS", 100)) -# Large objects are chunked into 64 MiB messages -OBJECT_TRANSFER_CHUNK_SIZE = 64 * 2**20 +# Large objects are chunked into 5 MiB messages, ref PR #35025 +OBJECT_TRANSFER_CHUNK_SIZE = 5 * 2**20 # Warn the user if the object being transferred is larger than 2 GiB OBJECT_TRANSFER_WARNING_SIZE = 2 * 2**30 diff --git a/python/ray/util/client/dataclient.py b/python/ray/util/client/dataclient.py index 449c76534381..5ce08117087d 100644 --- a/python/ray/util/client/dataclient.py +++ b/python/ray/util/client/dataclient.py @@ -39,7 +39,11 @@ def chunk_put(req: ray_client_pb2.DataRequest): into the result_queue, we would effectively double the memory needed on the client to handle the put. 
""" - total_size = len(req.put.data) + # When accessing a protobuf field, deserialization is performed, which will + # generate a copy. So we need to avoid accessing the `data` field multiple + # times in the loop + request_data = req.put.data + total_size = len(request_data) assert total_size > 0, "Cannot chunk object with missing data" if total_size >= OBJECT_TRANSFER_WARNING_SIZE and log_once( "client_object_put_size_warning" @@ -60,7 +64,7 @@ def chunk_put(req: ray_client_pb2.DataRequest): end = min(total_size, (chunk_id + 1) * OBJECT_TRANSFER_CHUNK_SIZE) chunk = ray_client_pb2.PutRequest( client_ref_id=req.put.client_ref_id, - data=req.put.data[start:end], + data=request_data[start:end], chunk_id=chunk_id, total_chunks=total_chunks, total_size=total_size, @@ -77,7 +81,11 @@ def chunk_task(req: ray_client_pb2.DataRequest): into the result_queue, we would effectively double the memory needed on the client to handle the task. """ - total_size = len(req.task.data) + # When accessing a protobuf field, deserialization is performed, which will + # generate a copy. 
So we need to avoid accessing the `data` field multiple + # times in the loop + request_data = req.task.data + total_size = len(request_data) assert total_size > 0, "Cannot chunk object with missing data" total_chunks = math.ceil(total_size / OBJECT_TRANSFER_CHUNK_SIZE) for chunk_id in range(0, total_chunks): @@ -91,7 +99,7 @@ def chunk_task(req: ray_client_pb2.DataRequest): options=req.task.options, baseline_options=req.task.baseline_options, namespace=req.task.namespace, - data=req.task.data[start:end], + data=request_data[start:end], chunk_id=chunk_id, total_chunks=total_chunks, ) diff --git a/python/ray/util/client/server/proxier.py b/python/ray/util/client/server/proxier.py index 0b3128d090ba..c648104a9968 100644 --- a/python/ray/util/client/server/proxier.py +++ b/python/ray/util/client/server/proxier.py @@ -294,8 +294,8 @@ def start_specific_server(self, client_id: str, job_config: JobConfig) -> bool: f"ray_client_server_{specific_server.port}", unique=True ) - serialized_runtime_env = job_config.get_serialized_runtime_env() - runtime_env_config = job_config.get_proto_runtime_env_config() + serialized_runtime_env = job_config._get_serialized_runtime_env() + runtime_env_config = job_config._get_proto_runtime_env_config() if not serialized_runtime_env or serialized_runtime_env == "{}": # TODO(edoakes): can we just remove this case and always send it # to the agent? 
diff --git a/python/ray/util/client/server/server.py b/python/ray/util/client/server/server.py index c07db1bc9a1c..962b8bbb1c19 100644 --- a/python/ray/util/client/server/server.py +++ b/python/ray/util/client/server/server.py @@ -118,7 +118,7 @@ def Init( ) -> ray_client_pb2.InitResponse: if request.job_config: job_config = pickle.loads(request.job_config) - job_config.client_job = True + job_config._client_job = True else: job_config = None current_job_config = None @@ -144,7 +144,7 @@ def Init( # that tests the behavior of multiple clients with the same job config # connecting to one server (test_client_init.py::test_num_clients), # so I'm leaving it here for now. - job_config = job_config.get_proto_job_config() + job_config = job_config._get_proto_job_config() # If the server has been initialized, we need to compare whether the # runtime env is compatible. if current_job_config: @@ -262,8 +262,8 @@ def ClusterInfo(self, request, context=None) -> ray_client_pb2.ClusterInfoRespon ctx = ray_client_pb2.ClusterInfoResponse.RuntimeContext() with disable_client_hook(): rtc = ray.get_runtime_context() - ctx.job_id = rtc.job_id.binary() - ctx.node_id = rtc.node_id.binary() + ctx.job_id = ray._private.utils.hex_to_binary(rtc.get_job_id()) + ctx.node_id = ray._private.utils.hex_to_binary(rtc.get_node_id()) ctx.namespace = rtc.namespace ctx.capture_client_tasks = ( rtc.should_capture_child_tasks_in_placement_group diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index 62d373ecd136..065db97af66d 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Union import ray +from ray._private.auto_init_hook import auto_init_ray from ray._private.client_mode_hook import client_mode_should_convert, client_mode_wrap from ray._private.utils import hex_to_binary, get_ray_doc_version from ray._raylet import PlacementGroupID @@ -321,7 +322,8 @@ def 
get_current_placement_group() -> Optional[PlacementGroup]: None if the current task or actor wasn't created with any placement group. """ - if client_mode_should_convert(auto_init=True): + auto_init_ray() + if client_mode_should_convert(): # Client mode is only a driver. return None worker = ray._private.worker.global_worker @@ -374,7 +376,6 @@ def _valid_resource_shape(resources, bundle_specs): def _validate_resource_shape( placement_group, resources, placement_resources, task_or_actor_repr ): - bundles = placement_group.bundle_specs resources_valid = _valid_resource_shape(resources, bundles) placement_resources_valid = _valid_resource_shape(placement_resources, bundles) diff --git a/python/ray/util/spark/databricks_hook.py b/python/ray/util/spark/databricks_hook.py index 404ff5a7211e..86d4b8a8f8a2 100644 --- a/python/ray/util/spark/databricks_hook.py +++ b/python/ray/util/spark/databricks_hook.py @@ -44,13 +44,16 @@ def display_databricks_driver_proxy_url(spark_context, port, title): orgId = commandContextTags.apply("orgId") clusterId = commandContextTags.apply("clusterId") - template = "/driver-proxy/o/{orgId}/{clusterId}/{port}/" - proxy_url = template.format(orgId=orgId, clusterId=clusterId, port=port) + proxy_link = f"/driver-proxy/o/{orgId}/{clusterId}/{port}/" + proxy_url = f"https://dbc-dp-{orgId}.cloud.databricks.com{proxy_link}" + + print("To monitor and debug Ray from Databricks, view the dashboard at ") + print(f" {proxy_url}") displayHTML( f""" diff --git a/python/ray/util/state/__init__.py b/python/ray/util/state/__init__.py new file mode 100644 index 000000000000..d74f9b650df3 --- /dev/null +++ b/python/ray/util/state/__init__.py @@ -0,0 +1,50 @@ +from ray.util.state.api import ( + get_actor, + get_log, + get_node, + get_objects, + get_placement_group, + get_task, + get_worker, + get_job, + list_actors, + list_jobs, + list_nodes, + list_placement_groups, + list_tasks, + list_workers, + list_objects, + list_runtime_envs, + list_logs, + 
list_cluster_events, + summarize_actors, + summarize_objects, + summarize_tasks, + StateApiClient, +) + + +__all__ = [ + "get_actor", + "get_log", + "get_node", + "get_objects", + "get_placement_group", + "get_task", + "get_worker", + "get_job", + "list_actors", + "list_jobs", + "list_nodes", + "list_placement_groups", + "list_tasks", + "list_workers", + "list_objects", + "list_runtime_envs", + "list_logs", + "list_cluster_events", + "summarize_actors", + "summarize_objects", + "summarize_tasks", + "StateApiClient", +] diff --git a/python/ray/util/state/api.py b/python/ray/util/state/api.py new file mode 100644 index 000000000000..0cef8d4cf58e --- /dev/null +++ b/python/ray/util/state/api.py @@ -0,0 +1,1443 @@ +import logging +import threading +import urllib +import warnings +from contextlib import contextmanager +from dataclasses import fields +from typing import Any, Dict, Generator, List, Optional, Tuple, Union + +import requests + +from ray.dashboard.modules.dashboard_sdk import SubmissionClient +from ray.dashboard.utils import ( + get_address_for_submission_client, + ray_address_to_api_server_url, +) +from ray.util.annotations import DeveloperAPI +from ray.util.state.common import ( + DEFAULT_LIMIT, + DEFAULT_RPC_TIMEOUT, + ActorState, + ClusterEventState, + GetApiOptions, + GetLogOptions, + JobState, + ListApiOptions, + NodeState, + ObjectState, + PlacementGroupState, + PredicateType, + RuntimeEnvState, + StateResource, + SummaryApiOptions, + SummaryResource, + SupportedFilterType, + TaskState, + WorkerState, + dict_to_state, +) +from ray.util.state.exception import RayStateApiException, ServerUnavailable + +logger = logging.getLogger(__name__) + + +@contextmanager +def warnings_on_slow_request( + *, address: str, endpoint: str, timeout: float, explain: bool +): + """A context manager to print warnings if the request is replied slowly. + + Warnings are printed 3 times + + Args: + address: The address of the endpoint. + endpoint: The name of the endpoint. 
+ timeout: Request timeout in seconds. + explain: Whether or not it will print the warning. + """ + # Do nothing if explain is not specified. + if not explain: + yield + return + + # Prepare timers to print warning. + # Print 3 times with exponential backoff. timeout / 2, timeout / 4, timeout / 8 + def print_warning(elapsed: float): + logger.info( + f"({round(elapsed, 2)} / {timeout} seconds) " + "Waiting for the response from the API server " + f"address {address}{endpoint}.", + ) + + warning_timers = [ + threading.Timer(timeout / i, print_warning, args=[timeout / i]) + for i in [2, 4, 8] + ] + + try: + for timer in warning_timers: + timer.start() + yield + finally: + # Make sure all timers are cancelled once request is terminated. + for timer in warning_timers: + timer.cancel() + + +""" +This file contains API client and methods for querying ray state. + +Usage: + 1. [Recommended] With StateApiClient: + ``` + client = StateApiClient(address="auto") + data = client.list(StateResource.NODES) + ... + ``` + + 2. With SDK APIs: + The API creates a `StateApiClient` for each invocation. So if multiple + invocations of listing are used, it is better to reuse the `StateApiClient` + as suggested above. + ``` + data = list_nodes(address="auto") + ``` +""" + + +@DeveloperAPI +class StateApiClient(SubmissionClient): + """State API Client issues REST GET requests to the server for resource states.""" + + def __init__( + self, + address: Optional[str] = None, + cookies: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, Any]] = None, + ): + """Initialize a StateApiClient and check the connection to the cluster. + + Args: + address: Ray bootstrap address (e.g. `127.0.0.0:6379`, `auto`), or Ray + Client address (e.g. `ray://:10001`), or Ray dashboard + address (e.g. `http://:8265`). + If not provided, it will be detected automatically from any running + local Ray cluster. + cookies: Cookies to use when sending requests to the HTTP job server. 
+ headers: Headers to use when sending requests to the HTTP job server, used + for cases like authentication to a remote cluster. + """ + if requests is None: + raise RuntimeError( + "The Ray state CLI & SDK require the ray[default] " + "installation: `pip install 'ray[default]'`" + ) + if not headers: + headers = {"Content-Type": "application/json"} + + # Resolve API server URL + api_server_url = get_address_for_submission_client(address) + + super().__init__( + address=api_server_url, + create_cluster_if_needed=False, + headers=headers, + cookies=cookies, + ) + + @classmethod + def _make_param(cls, options: Union[ListApiOptions, GetApiOptions]) -> Dict: + options_dict = {} + for field in fields(options): + # TODO(rickyyx): We will need to find a way to pass server side timeout + # TODO(rickyyx): We will have to convert filter option + # slightly differently for now. But could we do k,v pair rather than this? + # I see we are also converting dict to XXXApiOptions later on, we could + # probably organize the marshaling a bit better. + if field.name == "filters": + options_dict["filter_keys"] = [] + options_dict["filter_predicates"] = [] + options_dict["filter_values"] = [] + for filter in options.filters: + if len(filter) != 3: + raise ValueError( + f"The given filter has incorrect input type, {filter}. " + "Provide (key, predicate, value) tuples." 
+ ) + filter_k, filter_predicate, filter_val = filter + options_dict["filter_keys"].append(filter_k) + options_dict["filter_predicates"].append(filter_predicate) + options_dict["filter_values"].append(filter_val) + continue + + option_val = getattr(options, field.name) + if option_val is not None: + options_dict[field.name] = option_val + + return options_dict + + def _make_http_get_request( + self, + endpoint: str, + params: Dict, + timeout: float, + _explain: bool = False, + ) -> Dict: + with warnings_on_slow_request( + address=self._address, endpoint=endpoint, timeout=timeout, explain=_explain + ): + # Send a request. + response = None + try: + response = self._do_request( + "GET", + endpoint, + timeout=timeout, + params=params, + ) + # If we have a valid JSON error, don't raise a generic exception but + # instead let the caller parse it to raise a more precise exception. + if ( + response.status_code == 500 + and "application/json" + not in response.headers.get("Content-Type", "") + ): + response.raise_for_status() + except requests.exceptions.RequestException as e: + err_str = f"Failed to make request to {self._address}{endpoint}. " + + # Best-effort to give hints to users on potential reasons of connection + # failure. + err_str += ( + "Failed to connect to API server. Please check the API server " + "log for details. Make sure dependencies are installed with " + "`pip install ray[default]`. Please also check dashboard is " + "available, and included when starting ray cluster, " + "i.e. `ray start --include-dashboard=True --head`. " + ) + if response is None: + raise ServerUnavailable(err_str) + + err_str += f"Response(url={response.url},status={response.status_code})" + raise RayStateApiException(err_str) from e + + # Process the response. + response = response.json() + if response["result"] is False: + raise RayStateApiException( + "API server internal error. See dashboard.log file for more details. 
" + f"Error: {response['msg']}" + ) + + # Dictionary of `ListApiResponse` or `SummaryApiResponse` + return response["data"]["result"] + + def get( + self, + resource: StateResource, + id: str, + options: Optional[GetApiOptions], + _explain: bool = False, + ) -> Optional[ + Union[ + ActorState, + PlacementGroupState, + NodeState, + WorkerState, + TaskState, + List[ObjectState], + JobState, + ] + ]: + """Get resources states by id + + Args: + resource_name: Resource names, i.e. 'workers', 'actors', 'nodes', + 'placement_groups', 'tasks', 'objects'. + 'jobs' and 'runtime-envs' are not supported yet. + id: ID for the resource, i.e. 'node_id' for nodes. + options: Get options. See `GetApiOptions` for details. + _explain: Print the API information such as API + latency or failed query information. + + Returns: + None if not found, and if found: + - ActorState for actors + - PlacementGroupState for placement groups + - NodeState for nodes + - WorkerState for workers + - TaskState for tasks + - JobState for jobs + + Empty list for objects if not found, or list of ObjectState for objects + + Raises: + This doesn't catch any exceptions raised when the underlying request + call raises exceptions. For example, it could raise `requests.Timeout` + when timeout occurs. + + ValueError: + if the resource could not be GET by id, i.e. jobs and runtime-envs. 
+ + """ + # TODO(rickyyx): Make GET not using filters on list operation + params = self._make_param(options) + + RESOURCE_ID_KEY_NAME = { + StateResource.NODES: "node_id", + StateResource.ACTORS: "actor_id", + StateResource.PLACEMENT_GROUPS: "placement_group_id", + StateResource.WORKERS: "worker_id", + StateResource.TASKS: "task_id", + StateResource.OBJECTS: "object_id", + StateResource.JOBS: "submission_id", + } + if resource not in RESOURCE_ID_KEY_NAME: + raise ValueError(f"Can't get {resource.name} by id.") + + params["filter_keys"] = [RESOURCE_ID_KEY_NAME[resource]] + params["filter_predicates"] = ["="] + params["filter_values"] = [id] + params["detail"] = True + endpoint = f"/api/v0/{resource.value}" + + list_api_response = self._make_http_get_request( + endpoint=endpoint, + params=params, + timeout=options.timeout, + _explain=_explain, + ) + result = list_api_response["result"] + + # Empty result + if len(result) == 0: + return None + + result = [dict_to_state(d, resource) for d in result] + if resource == StateResource.OBJECTS: + # NOTE(rickyyx): + # There might be multiple object entries for a single object id + # because a single object could be referenced at different places + # e.g. pinned as local variable, used as parameter + return result + + if resource == StateResource.TASKS: + # There might be multiple task attempts given a task id due to + # task retries. + if len(result) == 1: + return result[0] + return result + + # For the rest of the resources, there should only be a single entry + # for a particular id. + assert len(result) == 1 + return result[0] + + def _print_api_warning( + self, + resource: StateResource, + api_response: dict, + warn_data_source_not_available: bool = True, + warn_data_truncation: bool = True, + warn_limit: bool = True, + warn_server_side_warnings: bool = True, + ): + """Print the API warnings. + + Args: + resource: Resource names, i.e. 'jobs', 'actors', 'nodes', + see `StateResource` for details. 
+ api_response: The dictionarified `ListApiResponse` or `SummaryApiResponse`. + warn_data_source_not_available: Warn when some data sources + are not available. + warn_data_truncation: Warn when results were truncated at + the data source. + warn_limit: Warn when results were limited. + warn_server_side_warnings: Warn when the server side generates warnings + (E.g., when callsites not enabled for listing objects) + """ + # Print warnings if anything was given. + if warn_data_source_not_available: + warning_msgs = api_response.get("partial_failure_warning", None) + if warning_msgs: + warnings.warn(warning_msgs) + + if warn_data_truncation: + # Print warnings if data is truncated at the data source. + num_after_truncation = api_response["num_after_truncation"] + total = api_response["total"] + if total > num_after_truncation: + # NOTE(rickyyx): For now, there's not much users + # could do (neither can we), with hard truncation. + # Unless we allow users to set a higher + # `RAY_MAX_LIMIT_FROM_DATA_SOURCE`, the data will + # always be truncated at the data source. + warnings.warn( + ( + "The returned data may contain incomplete result. " + f"{num_after_truncation} ({total} total from the cluster) " + f"{resource.value} are retrieved from the data source. " + f"{total - num_after_truncation} entries have been truncated. " + f"Max of {num_after_truncation} entries are retrieved " + "from data source to prevent over-sized payloads." + ), + ) + + if warn_limit: + # Print warnings if return data is limited at the API server due to + # limit enforced at the server side + num_filtered = api_response["num_filtered"] + data = api_response["result"] + if num_filtered > len(data): + warnings.warn( + ( + f"Limit last {len(data)} entries " + f"(Total {num_filtered}). Use `--filter` to reduce " + "the amount of data to return or " + "setting a higher limit with `--limit` to see all data. " + ), + ) + + if warn_server_side_warnings: + # Print the additional warnings. 
+ warnings_to_print = api_response.get("warnings", []) + if warnings_to_print: + for warning_to_print in warnings_to_print: + warnings.warn(warning_to_print) + + def _raise_on_missing_output(self, resource: StateResource, api_response: dict): + """Raise an exception when the API response contains a missing output. + + Output can be missing if (1) Failures on some of data source queries (e.g., + `ray list tasks` queries all raylets, and if some of queries fail, it will + contain missing output. If all queries fail, it will just fail). (2) Data + is truncated because the output is too large. + + Args: + resource: Resource names, i.e. 'jobs', 'actors', 'nodes', + see `StateResource` for details. + api_response: The dictionarified `ListApiResponse` or `SummaryApiResponse`. + """ + # Raise an exception if there are partial failures that cause missing output. + warning_msgs = api_response.get("partial_failure_warning", None) + if warning_msgs: + raise RayStateApiException( + f"Failed to retrieve all {resource.value} from the cluster because " + "they are not reachable due to query failures to the data sources. " + "To avoid raising an exception and allow having missing output, " + "set `raise_on_missing_output=False`. " + ) + # Raise an exception if there is data truncation that causes missing output. + total = api_response["total"] + num_after_truncation = api_response["num_after_truncation"] + + if total != num_after_truncation: + raise RayStateApiException( + f"Failed to retrieve all {resource.value} from the cluster because " + "they are not reachable due to data truncation. It happens " + "when the returned data is too large " + # When the data is truncated, the truncation + # threshold == num_after_truncation. We cannot set this to env + # var because the CLI side might not have the correct env var. + f"(> {num_after_truncation}) " + "To avoid raising an exception and allow having missing output, " + "set `raise_on_missing_output=False`. 
" + ) + + def list( + self, + resource: StateResource, + options: ListApiOptions, + raise_on_missing_output: bool, + _explain: bool = False, + ) -> List[ + Union[ + ActorState, + JobState, + NodeState, + TaskState, + ObjectState, + PlacementGroupState, + RuntimeEnvState, + WorkerState, + ClusterEventState, + ] + ]: + """List resources states + + Args: + resource: Resource names, i.e. 'jobs', 'actors', 'nodes', + see `StateResource` for details. + options: List options. See `ListApiOptions` for details. + raise_on_missing_output: When True, raise an exception if the output + is incomplete. Output can be incomplete if + (1) there's a partial network failure when the source is distributed. + (2) data is truncated because it is too large. + Set it to False to avoid throwing an exception on missing data. + _explain: Print the API information such as API + latency or failed query information. + + Returns: + A list of queried result from `ListApiResponse`, + + Raises: + This doesn't catch any exceptions raised when the underlying request + call raises exceptions. For example, it could raise `requests.Timeout` + when timeout occurs. + + """ + + endpoint = f"/api/v0/{resource.value}" + params = self._make_param(options) + list_api_response = self._make_http_get_request( + endpoint=endpoint, + params=params, + timeout=options.timeout, + _explain=_explain, + ) + if raise_on_missing_output: + self._raise_on_missing_output(resource, list_api_response) + if _explain: + self._print_api_warning(resource, list_api_response) + return [dict_to_state(d, resource) for d in list_api_response["result"]] + + def summary( + self, + resource: SummaryResource, + *, + options: SummaryApiOptions, + raise_on_missing_output: bool, + _explain: bool = False, + ) -> Dict: + """Summarize resources states + + Args: + resource_name: Resource names, + see `SummaryResource` for details. + options: summary options. See `SummaryApiOptions` for details. 
+ raise_on_missing_output: Raise an exception if the output has missing data. + Output can have missing data if (1) there's a partial network failure + when the source is distributed. (2) data is truncated + because it is too large. + _explain: Print the API information such as API + latency or failed query information. + + Returns: + A dictionary of queried result from `SummaryApiResponse`. + + Raises: + This doesn't catch any exceptions raised when the underlying request + call raises exceptions. For example, it could raise `requests.Timeout` + when timeout occurs. + """ + params = {"timeout": options.timeout} + endpoint = f"/api/v0/{resource.value}/summarize" + summary_api_response = self._make_http_get_request( + endpoint=endpoint, + params=params, + timeout=options.timeout, + _explain=_explain, + ) + if raise_on_missing_output: + self._raise_on_missing_output(resource, summary_api_response) + if _explain: + # There's no limit applied to summary, so we shouldn't warn. + self._print_api_warning(resource, summary_api_response, warn_limit=False) + return summary_api_response["result"]["node_id_to_summary"] + + +@DeveloperAPI +def get_actor( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[Dict]: + """Get an actor by id. + + Args: + id: Id of the actor + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state API requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if actor not found, or + :class:`ActorState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.ACTORS, id, GetApiOptions(timeout=timeout), _explain=_explain + ) + + +@DeveloperAPI +def get_job( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[JobState]: + """Get a submission job detail by id. + + Args: + id: Submission ID obtained from job API. + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state API requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if job not found, or + :class:`JobState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.JOBS, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def get_placement_group( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[PlacementGroupState]: + """Get a placement group by id. + + Args: + id: Id of the placement group + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state APIs requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if actor not found, or + :class:`~ray.util.state.common.PlacementGroupState`. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.PLACEMENT_GROUPS, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def get_node( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[NodeState]: + """Get a node by id. + + Args: + id: Id of the node. + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state APIs requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if actor not found, or + :class:`NodeState `. + + Raises: + Exceptions: :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.NODES, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def get_worker( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[WorkerState]: + """Get a worker by id. + + Args: + id: Id of the worker + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state APIs requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if actor not found, or + :class:`WorkerState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.WORKERS, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def get_task( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> Optional[TaskState]: + """Get task attempts of a task by id. + + Args: + id: Id of the task + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state APIs requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + None if task not found, or a list of + :class:`~ray.util.state.common.TaskState` + from the task attempts. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.TASKS, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def get_objects( + id: str, + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + _explain: bool = False, +) -> List[ObjectState]: + """Get objects by id. + + There could be more than 1 entry returned since an object could be + referenced at different places. + + Args: + id: Id of the object + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout value for the state APIs requests made. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`~ray.util.state.common.ObjectState`. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).get( + StateResource.OBJECTS, + id, + GetApiOptions(timeout=timeout), + _explain=_explain, + ) + + +@DeveloperAPI +def list_actors( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[ActorState]: + """List actors in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("id", "=", "abcd")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `ActorState`) + will be queried and returned. See + :class:`ActorState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`ActorState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.ACTORS, + options=ListApiOptions( + limit=limit, + timeout=timeout, + filters=filters, + detail=detail, + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_placement_groups( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[PlacementGroupState]: + """List placement groups in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("state", "=", "abcd")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `PlacementGroupState`) + will be queried and returned. See + :class:`~ray.util.state.common.PlacementGroupState`. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of :class:`~ray.util.state.common.PlacementGroupState`. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.PLACEMENT_GROUPS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_nodes( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[NodeState]: + """List nodes in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("node_name", "=", "abcd")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `NodeState`) + will be queried and returned. See + :class:`NodeState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of dictionarified + :class:`NodeState `. + + Raises: + Exceptions: :class:`RayStateApiException ` + if the CLI failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.NODES, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_jobs( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[JobState]: + """List jobs submitted to the cluster by :ref: `ray job submission `. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("status", "=", "abcd")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `JobState`) + will be queried and returned. See + :class:`JobState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of dictionarified + :class:`JobState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.JOBS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_workers( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[WorkerState]: + """List workers in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("is_alive", "=", "True")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `WorkerState`) + will be queried and returned. See + :class:`WorkerState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`WorkerState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.WORKERS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_tasks( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[TaskState]: + """List tasks in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("is_alive", "=", "True")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `WorkerState`) + will be queried and returned. See + :class:`WorkerState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`TaskState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.TASKS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_objects( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[ObjectState]: + """List objects in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("ip", "=", "0.0.0.0")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `ObjectState`) + will be queried and returned. See + :class:`ObjectState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`ObjectState `. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.OBJECTS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_runtime_envs( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[RuntimeEnvState]: + """List runtime environments in the cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + filters: List of tuples of filter key, predicate (=, or !=), and + the filter value. E.g., `("node_id", "=", "abcdef")` + limit: Max number of entries returned by the state backend. + timeout: Max timeout value for the state APIs requests made. + detail: When True, more details info (specified in `RuntimeEnvState`) + will be queried and returned. See + :class:`RuntimeEnvState `. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Returns: + List of + :class:`RuntimeEnvState `. + + Raises: + Exceptions: :class:`RayStateApiException ` + if the CLI failed to query the data. 
+ """ # noqa: E501 + return StateApiClient(address=address).list( + StateResource.RUNTIME_ENVS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def list_cluster_events( + address: Optional[str] = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + limit: int = DEFAULT_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + detail: bool = False, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> List[Dict]: + return StateApiClient(address=address).list( + StateResource.CLUSTER_EVENTS, + options=ListApiOptions( + limit=limit, timeout=timeout, filters=filters, detail=detail + ), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +""" +Log APIs +""" + + +@DeveloperAPI +def get_log( + address: Optional[str] = None, + node_id: Optional[str] = None, + node_ip: Optional[str] = None, + filename: Optional[str] = None, + actor_id: Optional[str] = None, + task_id: Optional[str] = None, + pid: Optional[int] = None, + follow: bool = False, + tail: int = -1, + timeout: int = DEFAULT_RPC_TIMEOUT, + suffix: str = "out", + encoding: Optional[str] = "utf-8", + errors: Optional[str] = "strict", + submission_id: Optional[str] = None, + attempt_number: int = 0, + _interval: Optional[float] = None, +) -> Generator[str, None, None]: + """Retrieve log file based on file name or some entities ids (pid, actor id, task id). 
+ + Examples: + >>> import ray + >>> from ray.util.state import get_log # doctest: +SKIP + # To connect to an existing ray instance if there is + >>> ray.init("auto") # doctest: +SKIP + # Node IP could be retrieved from list_nodes() or ray.nodes() + >>> node_ip = "172.31.47.143" # doctest: +SKIP + >>> filename = "gcs_server.out" # doctest: +SKIP + >>> for l in get_log(filename=filename, node_ip=node_ip): # doctest: +SKIP + >>> print(l) # doctest: +SKIP + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If not specified, it will be retrieved from the initialized ray cluster. + node_id: Id of the node containing the logs . + node_ip: Ip of the node containing the logs. (At least one of the node_id and + node_ip have to be supplied when identifying a node). + filename: Name of the file (relative to the ray log directory) to be retrieved. + actor_id: Id of the actor if getting logs from an actor. + task_id: Id of the task if getting logs generated by a task. + pid: PID of the worker if getting logs generated by a worker. When querying + with pid, either node_id or node_ip must be supplied. + follow: When set to True, logs will be streamed and followed. + tail: Number of lines to get from the end of the log file. Set to -1 for getting + the entire log. + timeout: Max timeout for requests made when getting the logs. + suffix: The suffix of the log file if query by id of tasks/workers/actors. Default to "out". + encoding: The encoding used to decode the content of the log file. Default is + "utf-8". Use None to get binary data directly. + errors: The error handling scheme to use for decoding errors. Default is + "strict". See https://docs.python.org/3/library/codecs.html#error-handlers + submission_id: Job submission ID if getting log from a submission job. + attempt_number: The attempt number of the task if getting logs generated by a task. + _interval: The interval in secs to print new logs when `follow=True`. 
+ + Return: + A Generator of log line, None for SendType and ReturnType. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. + """ # noqa: E501 + + api_server_url = ray_address_to_api_server_url(address) + media_type = "stream" if follow else "file" + + options = GetLogOptions( + node_id=node_id, + node_ip=node_ip, + filename=filename, + actor_id=actor_id, + task_id=task_id, + pid=pid, + lines=tail, + interval=_interval, + media_type=media_type, + timeout=timeout, + suffix=suffix, + submission_id=submission_id, + attempt_number=attempt_number, + ) + options_dict = {} + for field in fields(options): + option_val = getattr(options, field.name) + if option_val is not None: + options_dict[field.name] = option_val + + with requests.get( + f"{api_server_url}/api/v0/logs/{media_type}?" + f"{urllib.parse.urlencode(options_dict)}", + stream=True, + ) as r: + if r.status_code != 200: + raise RayStateApiException(r.text) + for bytes in r.iter_content(chunk_size=None): + bytes = bytearray(bytes) + # First byte 1 means success. + if bytes.startswith(b"1"): + bytes.pop(0) + logs = bytes + if encoding is not None: + logs = bytes.decode(encoding=encoding, errors=errors) + else: + assert bytes.startswith(b"0") + error_msg = bytes.decode("utf-8") + raise RayStateApiException(error_msg) + yield logs + + +@DeveloperAPI +def list_logs( + address: Optional[str] = None, + node_id: Optional[str] = None, + node_ip: Optional[str] = None, + glob_filter: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, +) -> Dict[str, List[str]]: + """Listing log files available. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If not specified, it will be retrieved from the initialized ray cluster. + node_id: Id of the node containing the logs. + node_ip: Ip of the node containing the logs. + glob_filter: Name of the file (relative to the ray log directory) to be + retrieved. E.g. 
`glob_filter="*worker*"` for all worker logs. + actor_id: Id of the actor if getting logs from an actor. + timeout: Max timeout for requests made when getting the logs. + _interval: The interval in secs to print new logs when `follow=True`. + + Return: + A dictionary where the keys are log groups (e.g. gcs, raylet, worker), and + values are list of log filenames. + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data, or ConnectionError if failed to resolve the + ray address. + """ # noqa: E501 + assert ( + node_ip is not None or node_id is not None + ), "At least one of node ip and node id is required" + + api_server_url = ray_address_to_api_server_url(address) + + if not glob_filter: + glob_filter = "*" + + options_dict = {} + if node_ip: + options_dict["node_ip"] = node_ip + if node_id: + options_dict["node_id"] = node_id + if glob_filter: + options_dict["glob"] = glob_filter + options_dict["timeout"] = timeout + + r = requests.get( + f"{api_server_url}/api/v0/logs?{urllib.parse.urlencode(options_dict)}" + ) + r.raise_for_status() + + response = r.json() + if response["result"] is False: + raise RayStateApiException( + "API server internal error. See dashboard.log file for more details. " + f"Error: {response['msg']}" + ) + return response["data"]["result"] + + +""" +Summary APIs +""" + + +@DeveloperAPI +def summarize_tasks( + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> Dict: + """Summarize the tasks in cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout for requests made when getting the states. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. 
+ _explain: Print the API information such as API latency or + failed query information. + + Return: + Dictionarified + :class:`~ray.util.state.common.TaskSummaries` + + Raises: + Exceptions: :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).summary( + SummaryResource.TASKS, + options=SummaryApiOptions(timeout=timeout), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def summarize_actors( + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> Dict: + """Summarize the actors in cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout for requests made when getting the states. + raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Return: + Dictionarified + :class:`~ray.util.state.common.ActorSummaries` + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).summary( + SummaryResource.ACTORS, + options=SummaryApiOptions(timeout=timeout), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) + + +@DeveloperAPI +def summarize_objects( + address: Optional[str] = None, + timeout: int = DEFAULT_RPC_TIMEOUT, + raise_on_missing_output: bool = True, + _explain: bool = False, +) -> Dict: + """Summarize the objects in cluster. + + Args: + address: Ray bootstrap address, could be `auto`, `localhost:6379`. + If None, it will be resolved automatically from an initialized ray. + timeout: Max timeout for requests made when getting the states. 
+ raise_on_missing_output: When True, exceptions will be raised if + there is missing data due to truncation/data source unavailable. + _explain: Print the API information such as API latency or + failed query information. + + Return: + Dictionarified :class:`~ray.util.state.common.ObjectSummaries` + + Raises: + Exceptions: :class:`RayStateApiException ` if the CLI + failed to query the data. + """ # noqa: E501 + return StateApiClient(address=address).summary( + SummaryResource.OBJECTS, + options=SummaryApiOptions(timeout=timeout), + raise_on_missing_output=raise_on_missing_output, + _explain=_explain, + ) diff --git a/python/ray/util/state/common.py b/python/ray/util/state/common.py new file mode 100644 index 000000000000..6a00d0cafb75 --- /dev/null +++ b/python/ray/util/state/common.py @@ -0,0 +1,1605 @@ +import datetime +import json +import logging +import sys +from abc import ABC +from dataclasses import asdict, field, fields +from enum import Enum, unique +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +import ray.dashboard.utils as dashboard_utils +from ray._private.ray_constants import env_integer +from ray.core.generated.common_pb2 import TaskStatus, TaskType +from ray.core.generated.gcs_pb2 import TaskEvents +from ray.util.state.custom_types import ( + TypeActorStatus, + TypeNodeStatus, + TypePlacementGroupStatus, + TypeReferenceType, + TypeTaskStatus, + TypeTaskType, + TypeWorkerExitType, + TypeWorkerType, +) +from ray.util.state.exception import RayStateApiException + +try: + from pydantic.dataclasses import dataclass + + from ray.dashboard.modules.job.pydantic_models import JobDetails + +except ImportError: + # pydantic is not available in the dashboard. + # We will use the dataclass from the standard library. 
+ from dataclasses import dataclass + + JobDetails = object + + +logger = logging.getLogger(__name__) + +DEFAULT_RPC_TIMEOUT = 30 +DEFAULT_LIMIT = 100 +DEFAULT_LOG_LIMIT = 1000 + +# Max number of entries from API server to the client +RAY_MAX_LIMIT_FROM_API_SERVER = env_integer( + "RAY_MAX_LIMIT_FROM_API_SERVER", 10 * 1000 +) # 10k + +# Max number of entries from data sources (rest will be truncated at the +# data source, e.g. raylet) +RAY_MAX_LIMIT_FROM_DATA_SOURCE = env_integer( + "RAY_MAX_LIMIT_FROM_DATA_SOURCE", 10 * 1000 +) # 10k + + +@unique +class StateResource(Enum): + ACTORS = "actors" + JOBS = "jobs" + PLACEMENT_GROUPS = "placement_groups" + NODES = "nodes" + WORKERS = "workers" + TASKS = "tasks" + OBJECTS = "objects" + RUNTIME_ENVS = "runtime_envs" + CLUSTER_EVENTS = "cluster_events" + + +@unique +class SummaryResource(Enum): + ACTORS = "actors" + TASKS = "tasks" + OBJECTS = "objects" + + +SupportedFilterType = Union[str, bool, int, float] + + +PredicateType = str # Literal["=", "!="] + + +class Humanify: + """A class containing default methods to + convert units into a human readable string.""" + + def timestamp(x: float): + """Converts miliseconds to a datetime object.""" + return str(datetime.datetime.fromtimestamp(x / 1000)) + + def memory(x: int): + """Converts raw bytes to a human readable memory size.""" + if x >= 2**30: + return str(format(x / (2**30), ".3f")) + " GiB" + elif x >= 2**20: + return str(format(x / (2**20), ".3f")) + " MiB" + elif x >= 2**10: + return str(format(x / (2**10), ".3f")) + " KiB" + return str(format(x, ".3f")) + " B" + + def duration(x: int): + """Converts miliseconds to a human readable duration.""" + return str(datetime.timedelta(milliseconds=x)) + + def events(events: List[dict]): + """Converts a list of task events into a human readable format.""" + for event in events: + if "created_ms" in event: + event["created_ms"] = Humanify.timestamp(event["created_ms"]) + return events + + def node_resources(resources: dict): + 
"""Converts a node's resources into a human readable format.""" + for resource in resources: + if "memory" in resource: + resources[resource] = Humanify.memory(resources[resource]) + return resources + + +@dataclass(init=True) +class ListApiOptions: + # Maximum number of entries to return + limit: int = DEFAULT_LIMIT + # The timeout for the API call. + timeout: int = DEFAULT_RPC_TIMEOUT + # If True, more detailed output will be printed. + # The API could query more sources than detail == False + # to get more data in detail. + detail: bool = False + # Filters. Each tuple pair (key, predicate, value) means key predicate value. + # If there's more than 1 filter, it means AND. + # E.g., [(key, "=", val), (key2, "!=" val2)] means (key=val) AND (key2!=val2) + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = field( + default_factory=list + ) + # [only tasks] If driver tasks should be excluded. + exclude_driver: bool = True + # When the request is processed on the server side, + # we should apply multiplier so that server side can finish + # processing a request within timeout. Otherwise, + # timeout will always lead Http timeout. + server_timeout_multiplier: float = 0.8 + + def __post_init__(self): + # To return the data to users, when there's a partial failure + # we need to have a timeout that's smaller than the users' timeout. + # 80% is configured arbitrarily. + self.timeout = int(self.timeout * self.server_timeout_multiplier) + assert self.timeout != 0, "0 second timeout is not supported." + if self.filters is None: + self.filters = [] + + for filter in self.filters: + _, filter_predicate, _ = filter + if filter_predicate != "=" and filter_predicate != "!=": + raise ValueError( + f"Unsupported filter predicate {filter_predicate} is given. " + "Available predicates: =, !=." 
+ ) + + +@dataclass(init=True) +class GetApiOptions: + # Timeout for the HTTP request + timeout: int = DEFAULT_RPC_TIMEOUT + + +@dataclass(init=True) +class SummaryApiOptions: + # Timeout for the HTTP request + timeout: int = DEFAULT_RPC_TIMEOUT + + # Filters. Each tuple pair (key, predicate, value) means key predicate value. + # If there's more than 1 filter, it means AND. + # E.g., [(key, "=", val), (key2, "!=" val2)] means (key=val) AND (key2!=val2) + # For summary endpoints that call list under the hood, we'll pass + # these filters directly into the list call. + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = field( + default_factory=list + ) + + # Change out to summarize the output. There is a summary_by value for each entity. + # Tasks: by func_name + # Actors: by class + # Objects: by callsite + summary_by: Optional[str] = None + + +def state_column(*, filterable: bool, detail: bool = False, format_fn=None, **kwargs): + """A wrapper around dataclass.field to add additional metadata. + + The metadata is used to define detail / filterable option of + each column. + + Args: + detail: If True, the column is used when detail == True + filterable: If True, the column can be used for filtering. + kwargs: The same kwargs for the `dataclasses.field` function. + """ + m = {"detail": detail, "filterable": filterable, "format_fn": format_fn} + # Default for detail field is None since it could be missing. + if detail and "default" not in kwargs: + kwargs["default"] = None + + if "metadata" in kwargs: + # Metadata explicitly specified, so add detail and filterable if missing. + kwargs["metadata"].update(m) + else: + # Metadata not explicitly specified, so add it. + kwargs["metadata"] = m + return field(**kwargs) + + +class StateSchema(ABC): + """Schema class for Ray resource abstraction. + + The child class must be dataclass. All child classes + - perform runtime type checking upon initialization. 
+ - are supposed to use `state_column` instead of `field`. + It will allow the class to return filterable/detail columns. + If `state_column` is not specified, that column is not filterable + and for non-detail output. + + For example, + ``` + @dataclass + class State(StateSchema): + column_a: str + column_b: int = state_column(detail=True, filterable=True) + + s = State(column_a="abc", b=1) + # Returns {"column_b"} + s.filterable_columns() + # Returns {"column_a"} + s.base_columns() + # Returns {"column_a", "column_b"} + s.columns() + ``` + + In addition, the schema also provides a humanify abstract method to + convert the state object into something human readable, ready for printing. + + Subclasses should override this method, providing logic to convert its own fields + to something human readable, packaged and returned in a dict. + + Each field that wants to be humanified should include a 'format_fn' key in its + metadata dictionary. + """ + + @classmethod + def humanify(cls, state: dict) -> dict: + """Convert the given state object into something human readable.""" + for f in fields(cls): + if ( + f.metadata.get("format_fn") is not None + and f.name in state + and state[f.name] is not None + ): + try: + state[f.name] = f.metadata["format_fn"](state[f.name]) + except Exception as e: + logger.error(f"Failed to format {f.name}:{state[f.name]} with {e}") + return state + + @classmethod + def list_columns(cls, detail: bool = True) -> List[str]: + """Return a list of columns.""" + cols = [] + for f in fields(cls): + if detail: + cols.append(f.name) + elif not f.metadata.get("detail", False): + cols.append(f.name) + + return cols + + @classmethod + def columns(cls) -> Set[str]: + """Return a set of all columns.""" + return set(cls.list_columns()) + + @classmethod + def filterable_columns(cls) -> Set[str]: + """Return a list of filterable columns""" + filterable = set() + for f in fields(cls): + if f.metadata.get("filterable", False): + filterable.add(f.name) + return 
filterable + + @classmethod + def base_columns(cls) -> Set[str]: + """Return a list of base columns. + + Base columns mean columns to return when detail == False. + """ + return set(cls.list_columns(detail=False)) + + @classmethod + def detail_columns(cls) -> Set[str]: + """Return a list of detail columns. + + Detail columns mean columns to return when detail == True. + """ + return set(cls.list_columns(detail=True)) + + def asdict(self): + return asdict(self) + + # Allow dict like access on the class directly for backward compatibility. + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + setattr(self, key, value) + + def get(self, key, default=None): + return getattr(self, key, default) + + +def filter_fields(data: dict, state_dataclass: StateSchema, detail: bool) -> dict: + """Filter the given data's columns based on the given schema. + + Args: + data: A single data entry to filter columns. + state_dataclass: The schema to filter data. + detail: Whether or not it should include columns for detail output. + """ + filtered_data = {} + columns = state_dataclass.columns() if detail else state_dataclass.base_columns() + for col in columns: + if col in data: + filtered_data[col] = data[col] + else: + filtered_data[col] = None + return filtered_data + + +@dataclass(init=True) +class GetLogOptions: + timeout: int + node_id: Optional[str] = None + node_ip: Optional[str] = None + # One of {file, stream}. File means it will return the whole log. + # stream means it will keep the connection and streaming the log. + media_type: str = "file" + # The file name of the log. + filename: Optional[str] = None + # The actor id of the log. It is used only for worker logs. + actor_id: Optional[str] = None + # The task id of the log. + task_id: Optional[str] = None + # The attempt number of the task. + attempt_number: int = 0 + # The pid of the log. It is used only for worker logs. + pid: Optional[int] = None + # Total log lines to return. 
+ lines: int = 1000 + # The interval where new logs are streamed to. + # Should be used only when media_type == stream. + interval: Optional[float] = None + # The suffix of the log file if file resolution not through filename directly. + # Default to "out". + suffix: str = "out" + # The job submission id for submission job. This doesn't work for driver job + # since Ray doesn't log driver logs to file in the ray logs directory. + submission_id: Optional[str] = None + + def __post_init__(self): + if self.pid: + self.pid = int(self.pid) + if self.interval: + self.interval = float(self.interval) + self.lines = int(self.lines) + + if self.media_type == "file": + assert self.interval is None + if self.media_type not in ["file", "stream"]: + raise ValueError(f"Invalid media type: {self.media_type}") + if not (self.node_id or self.node_ip) and not (self.actor_id or self.task_id): + raise ValueError( + "node_id or node_ip must be provided as constructor arguments when no " + "actor or task_id is supplied as arguments." + ) + if self.node_id and self.node_ip: + raise ValueError( + "Both node_id and node_ip are given. Only one of them can be provided. " + f"Given node id: {self.node_id}, given node ip: {self.node_ip}" + ) + if not ( + self.actor_id + or self.task_id + or self.pid + or self.filename + or self.submission_id + ): + raise ValueError( + "None of actor_id, task_id, pid, submission_id or filename " + "is provided. At least one of them is required to fetch logs." + ) + + if self.suffix not in ["out", "err"]: + raise ValueError( + f"Invalid suffix: {self.suffix}. Must be one of 'out' or 'err'." + ) + + +# See the ActorTableData message in gcs.proto for all potential options that +# can be included in this class. +@dataclass(init=True) +class ActorState(StateSchema): + """Actor State""" + + #: The id of the actor. + actor_id: str = state_column(filterable=True) + #: The class name of the actor. 
+ class_name: str = state_column(filterable=True) + #: The state of the actor. + #: + #: - DEPENDENCIES_UNREADY: Actor is waiting for dependency to be ready. + #: E.g., a new actor is waiting for object ref that's created from + #: other remote task. + #: - PENDING_CREATION: Actor's dependency is ready, but it is not created yet. + #: It could be because there are not enough resources, too many actor + #: entries in the scheduler queue, or the actor creation is slow + #: (e.g., slow runtime environment creation, + #: slow worker startup, or etc.). + #: - ALIVE: The actor is created, and it is alive. + #: - RESTARTING: The actor is dead, and it is restarting. + #: It is equivalent to `PENDING_CREATION`, + #: but means the actor was dead more than once. + #: - DEAD: The actor is permanatly dead. + state: TypeActorStatus = state_column(filterable=True) + #: The job id of this actor. + job_id: str = state_column(filterable=True) + #: The name of the actor given by the `name` argument. + name: Optional[str] = state_column(filterable=True) + #: The node id of this actor. + #: If the actor is restarting, it could be the node id + #: of the dead actor (and it will be re-updated when + #: the actor is successfully restarted). + node_id: Optional[str] = state_column(filterable=True) + #: The pid of the actor. 0 if it is not created yet. + pid: Optional[int] = state_column(filterable=True) + #: The namespace of the actor. + ray_namespace: Optional[str] = state_column(filterable=True) + #: The runtime environment information of the actor. + serialized_runtime_env: Optional[str] = state_column(filterable=False, detail=True) + #: The resource requirement of the actor. + required_resources: Optional[dict] = state_column(filterable=False, detail=True) + #: Actor's death information in detail. None if the actor is not dead yet. + death_cause: Optional[dict] = state_column(filterable=False, detail=True) + #: True if the actor is detached. False otherwise. 
+ is_detached: Optional[bool] = state_column(filterable=False, detail=True) + #: The placement group id that's associated with this actor. + placement_group_id: Optional[str] = state_column(detail=True, filterable=True) + #: Actor's repr name if a customized __repr__ method exists, else empty string. + repr_name: Optional[str] = state_column(detail=True, filterable=True) + + +@dataclass(init=True) +class PlacementGroupState(StateSchema): + """PlacementGroup State""" + + #: The id of the placement group. + placement_group_id: str = state_column(filterable=True) + #: The name of the placement group if it is given by the name argument. + name: str = state_column(filterable=True) + #: The job id of the placement group. + creator_job_id: str = state_column(filterable=True) + #: The state of the placement group. + #: + #: - PENDING: The placement group creation is pending scheduling. + #: It could be because there's not enough resources, some of creation + #: stage has failed (e.g., failed to commit placement gropus because + #: the node is dead). + #: - CREATED: The placement group is created. + #: - REMOVED: The placement group is removed. + #: - RESCHEDULING: The placement group is rescheduling because some of + #: bundles are dead because they were on dead nodes. + state: TypePlacementGroupStatus = state_column(filterable=True) + #: The bundle specification of the placement group. + bundles: Optional[List[dict]] = state_column(filterable=False, detail=True) + #: True if the placement group is detached. False otherwise. + is_detached: Optional[bool] = state_column(filterable=True, detail=True) + #: The scheduling stats of the placement group. + stats: Optional[dict] = state_column(filterable=False, detail=True) + + +@dataclass(init=True) +class NodeState(StateSchema): + """Node State""" + + #: The id of the node. + node_id: str = state_column(filterable=True) + #: The ip address of the node. + node_ip: str = state_column(filterable=True) + #: If this is a head node. 
+ is_head_node: bool = state_column(filterable=True) + #: The state of the node. + #: + #: ALIVE: The node is alive. + #: DEAD: The node is dead. + state: TypeNodeStatus = state_column(filterable=True) + #: The name of the node if it is given by the name argument. + node_name: str = state_column(filterable=True) + #: The total resources of the node. + resources_total: dict = state_column( + filterable=False, format_fn=Humanify.node_resources + ) + #: The time when the node (raylet) starts. + start_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + #: The time when the node exits. The timestamp could be delayed + #: if the node is dead unexpectedly (could be delayed + # up to 30 seconds). + end_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + + +# NOTE: +# Declaring this as dataclass would make __init__ not being called properly. +class JobState(StateSchema, JobDetails): + """The state of the job that's submitted by Ray's Job APIs or driver jobs""" + + def __init__(self, **kwargs): + JobDetails.__init__(self, **kwargs) + + @classmethod + def filterable_columns(cls) -> Set[str]: + # We are not doing any filtering since filtering is currently done + # at the backend. + return {"job_id", "type", "status", "submission_id"} + + @classmethod + def humanify(cls, state: dict) -> dict: + return state + + @classmethod + def list_columns(cls, detail: bool = False) -> List[str]: + if not detail: + return [ + "job_id", + "submission_id", + "entrypoint", + "type", + "status", + "message", + "error_type", + "driver_info", + ] + if isinstance(JobDetails, object): + # We don't have pydantic in the dashboard. This is because + # we call this method at module import time, so we need to + # check if the class is a pydantic model. 
+ return [] + + return JobDetails.__fields__ + + def asdict(self): + return JobDetails.dict(self) + + @classmethod + def schema_dict(cls) -> Dict[str, Any]: + schema_types = cls.schema()["properties"] + # Get type name to actual type mapping. + return { + k: v["type"] for k, v in schema_types.items() if v.get("type") is not None + } + + +@dataclass(init=True) +class WorkerState(StateSchema): + """Worker State""" + + #: The id of the worker. + worker_id: str = state_column(filterable=True) + #: Whether or not if the worker is alive. + is_alive: bool = state_column(filterable=True) + #: The type of the worker. + #: + #: - WORKER: The regular Ray worker process that executes tasks or + # instantiates an actor. + #: - DRIVER: The driver (Python script that calls `ray.init`). + #: - SPILL_WORKER: The worker that spills objects. + #: - RESTORE_WORKER: The worker that restores objects. + worker_type: TypeWorkerType = state_column(filterable=True) + #: The exit type of the worker if the worker is dead. + #: + #: - SYSTEM_ERROR: Worker exit due to system level failures (i.e. worker crash). + #: - INTENDED_SYSTEM_EXIT: System-level exit that is intended. E.g., + #: Workers are killed because they are idle for a long time. + #: - USER_ERROR: Worker exits because of user error. + #: E.g., execptions from the actor initialization. + #: - INTENDED_USER_EXIT: Intended exit from users (e.g., users exit + #: workers with exit code 0 or exit initated by Ray API such as ray.kill). + exit_type: Optional[TypeWorkerExitType] = state_column(filterable=True) + #: The node id of the worker. + node_id: str = state_column(filterable=True) + #: The ip address of the worker. + ip: str = state_column(filterable=True) + #: The pid of the worker. + pid: int = state_column(filterable=True) + #: The exit detail of the worker if the worker is dead. + exit_detail: Optional[str] = state_column(detail=True, filterable=False) + #: The time worker is first launched. + #: -1 if the value doesn't exist. 
+ #: The lifecycle of worker is as follow. + #: worker_launch_time_ms (process startup requested). + #: -> worker_launched_time_ms (process started). + #: -> start_time_ms (worker is ready to be used). + #: -> end_time_ms (worker is destroyed). + worker_launch_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + #: The time worker is succesfully launched + #: -1 if the value doesn't exist. + worker_launched_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + #: The time when the worker is started and initialized. + #: 0 if the value doesn't exist. + start_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + #: The time when the worker exits. The timestamp could be delayed + #: if the worker is dead unexpectedly. + #: 0 if the value doesn't exist. + end_time_ms: Optional[int] = state_column( + filterable=False, detail=True, format_fn=Humanify.timestamp + ) + + +@dataclass(init=True) +class ClusterEventState(StateSchema): + severity: str = state_column(filterable=True) + time: str = state_column(filterable=False) + source_type: str = state_column(filterable=True) + message: str = state_column(filterable=False) + event_id: str = state_column(filterable=True) + custom_fields: Optional[dict] = state_column(filterable=False, detail=True) + + +@dataclass(init=True) +class TaskState(StateSchema): + """Task State""" + + #: The id of the task. + task_id: str = state_column(filterable=True) + #: The attempt (retry) number of the task. + attempt_number: int = state_column(filterable=True) + #: The name of the task if it is given by the name argument. + name: str = state_column(filterable=True) + #: The state of the task. + #: + #: Refer to src/ray/protobuf/common.proto for a detailed explanation of the state + #: breakdowns and typical state transition flow. 
+ #: + state: TypeTaskStatus = state_column(filterable=True) + #: The job id of this task. + job_id: str = state_column(filterable=True) + #: The actor id that's associated with this task. + #: It is empty if there's no relevant actors. + actor_id: Optional[str] = state_column(filterable=True) + #: The type of the task. + #: + #: - NORMAL_TASK: Tasks created by `func.remote()`` + #: - ACTOR_CREATION_TASK: Actors created by `class.remote()` + #: - ACTOR_TASK: Actor tasks submitted by `actor.method.remote()` + #: - DRIVER_TASK: Driver (A script that calls `ray.init`). + type: TypeTaskType = state_column(filterable=True) + #: The name of the task. If is the name of the function + #: if the type is a task or an actor task. + #: It is the name of the class if it is a actor scheduling task. + func_or_class_name: str = state_column(filterable=True) + #: The parent task id. If the parent is a normal task, it will be the task's id. + #: If the parent runs in a concurrent actor (async actor or threaded actor), + #: it will be the actor's creation task id. + parent_task_id: str = state_column(filterable=True) + #: Id of the node that runs the task. If the task is retried, it could + #: contain the node id of the previous executed task. + #: If empty, it means the task hasn't been scheduled yet. + node_id: Optional[str] = state_column(filterable=True) + #: The worker id that's associated with this task. + worker_id: Optional[str] = state_column(filterable=True) + #: Task error type. + error_type: Optional[str] = state_column(filterable=True) + #: The language of the task. E.g., Python, Java, or Cpp. + language: Optional[str] = state_column(detail=True, filterable=True) + #: The required resources to execute the task. + required_resources: Optional[dict] = state_column(detail=True, filterable=False) + #: The runtime environment information for the task. 
+ runtime_env_info: Optional[dict] = state_column(detail=True, filterable=False) + #: The placement group id that's associated with this task. + placement_group_id: Optional[str] = state_column(detail=True, filterable=True) + #: The list of events of the given task. + #: Refer to src/ray/protobuf/common.proto for a detailed explanation of the state + #: breakdowns and typical state transition flow. + events: Optional[List[dict]] = state_column( + detail=True, filterable=False, format_fn=Humanify.events + ) + #: The list of profile events of the given task. + profiling_data: Optional[dict] = state_column(detail=True, filterable=False) + #: The time when the task is created. A Unix timestamp in ms. + creation_time_ms: Optional[int] = state_column( + detail=True, + filterable=False, + format_fn=Humanify.timestamp, + ) + #: The time when the task starts to run. A Unix timestamp in ms. + start_time_ms: Optional[int] = state_column( + detail=True, + filterable=False, + format_fn=Humanify.timestamp, + ) + #: The time when the task is finished or failed. A Unix timestamp in ms. + end_time_ms: Optional[int] = state_column( + detail=True, filterable=False, format_fn=Humanify.timestamp + ) + #: The task logs info, e.g. offset into the worker log file when the task + #: starts/finishes. + task_log_info: Optional[dict] = state_column(detail=True, filterable=False) + #: Task error detail info. + error_message: Optional[str] = state_column(detail=True, filterable=False) + + +@dataclass(init=True) +class ObjectState(StateSchema): + """Object State""" + + #: The id of the object. + object_id: str = state_column(filterable=True) + #: The size of the object in mb. + object_size: int = state_column(filterable=True, format_fn=Humanify.memory) + #: The status of the task that creates the object. + #: + #: - NIL: We don't have a status for this task because we are not the owner or the + #: task metadata has already been deleted. 
+ #: - WAITING_FOR_DEPENDENCIES: The task is waiting for its dependencies + #: to be created. + #: - SCHEDULED: All dependencies have been created and the task is + #: scheduled to execute. + #: It could be because the task is waiting for resources, + #: runtime environmenet creation, fetching dependencies to the + #: local node, and etc.. + #: - FINISHED: The task finished successfully. + #: - WAITING_FOR_EXECUTION: The task is scheduled properly and + #: waiting for execution. It includes time to deliver the task + #: to the remote worker + queueing time from the execution side. + #: - RUNNING: The task that is running. + task_status: TypeTaskStatus = state_column(filterable=True) + #: The reference type of the object. + #: See :ref:`Debugging with Ray Memory ` for more details. + #: + #: - ACTOR_HANDLE: The reference is an actor handle. + #: - PINNED_IN_MEMORY: The object is pinned in memory, meaning there's + #: in-flight `ray.get` on this reference. + #: - LOCAL_REFERENCE: There's a local reference (e.g., Python reference) + #: to this object reference. The object won't be GC'ed until all of them is gone. + #: - USED_BY_PENDING_TASK: The object reference is passed to other tasks. E.g., + #: `a = ray.put()` -> `task.remote(a)`. In this case, a is used by a + #: pending task `task`. + #: - CAPTURED_IN_OBJECT: The object is serialized by other objects. E.g., + #: `a = ray.put(1)` -> `b = ray.put([a])`. a is serialized within a list. + #: - UNKNOWN_STATUS: The object ref status is unkonwn. + reference_type: TypeReferenceType = state_column(filterable=True) + #: The callsite of the object. + call_site: str = state_column(filterable=True) + #: The worker type that creates the object. + #: + #: - WORKER: The regular Ray worker process that executes tasks or + #: instantiates an actor. + #: - DRIVER: The driver (Python script that calls `ray.init`). + #: - SPILL_WORKER: The worker that spills objects. + #: - RESTORE_WORKER: The worker that restores objects. 
+ type: TypeWorkerType = state_column(filterable=True) + #: The pid of the owner. + pid: int = state_column(filterable=True) + #: The ip address of the owner. + ip: str = state_column(filterable=True) + + +@dataclass(init=True) +class RuntimeEnvState(StateSchema): + """Runtime Environment State""" + + #: The runtime environment spec. + runtime_env: dict = state_column(filterable=True) + #: Whether or not the runtime env creation has succeeded. + success: bool = state_column(filterable=True) + #: The latency of creating the runtime environment. + #: Available if the runtime env is successfully created. + creation_time_ms: Optional[float] = state_column( + filterable=False, format_fn=Humanify.timestamp + ) + #: The node id of this runtime environment. + node_id: str = state_column(filterable=True) + #: The number of actors and tasks that use this runtime environment. + ref_cnt: Optional[int] = state_column(detail=True, filterable=False) + #: The error message if the runtime environment creation has failed. + #: Available if the runtime env is failed to be created. + error: Optional[str] = state_column(detail=True, filterable=True) + + +AVAILABLE_STATES = [ + ActorState, + PlacementGroupState, + NodeState, + WorkerState, + JobState, + TaskState, + ObjectState, + RuntimeEnvState, +] + + +for state in AVAILABLE_STATES: + if len(state.filterable_columns()) > 0: + filterable_cols = "\n\n ".join(state.filterable_columns()) + state.__doc__ += f""" +\nBelow columns can be used for the `--filter` option. +\n + {filterable_cols} +\n +""" + + if len(state.detail_columns()) > 0: + detail_cols = "\n\n ".join(state.detail_columns()) + state.__doc__ += f""" +\nBelow columns are available only when `get` API is used, +\n`--detail` is specified through CLI, or `detail=True` is given to Python APIs. 
+\n +\n + {detail_cols} +\n +""" + + +@dataclass(init=True) +class ListApiResponse: + # NOTE(rickyyx): We currently perform hard truncation when querying + # resources which could have a large number (e.g. asking raylets for + # the number of all objects). + # The returned of resources seen by the user will go through from the + # below funnel: + # - total + # | With truncation at the data source if the number of returned + # | resource exceeds `RAY_MAX_LIMIT_FROM_DATA_SOURCE` + # v + # - num_after_truncation + # | With filtering at the state API server + # v + # - num_filtered + # | With limiting, + # | set by min(`RAY_MAX_LIMIT_FROM_API_SERER`, ) + # v + # - len(result) + + # Total number of the available resource from the cluster. + total: int + # Number of resources returned by data sources after truncation + num_after_truncation: int + # Number of resources after filtering + num_filtered: int + # Returned data. None if no data is returned. + result: List[Dict] + # List API can have a partial failure if queries to + # all sources fail. For example, getting object states + # require to ping all raylets, and it is possible some of + # them fails. Note that it is impossible to guarantee high + # availability of data because ray's state information is + # not replicated. + partial_failure_warning: Optional[str] = "" + # A list of warnings to print. + warnings: Optional[List[str]] = None + + +""" +Summary API schema +""" + +DRIVER_TASK_ID_PREFIX = "ffffffffffffffffffffffffffffffffffffffff" + + +@dataclass(init=True) +class TaskSummaryPerFuncOrClassName: + #: The function or class name of this task. + func_or_class_name: str + #: The type of the class. Equivalent to protobuf TaskType. + type: str + #: State name to the count dict. State name is equivalent to + #: the protobuf TaskStatus. 
+ state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) + + +@dataclass +class Link: + #: The type of entity to link to + type: str + #: The id of the entity to link to + id: str + + +@dataclass(init=True) +class NestedTaskSummary: + #: The name of this task group + name: str + #: A unique identifier for this group + key: str + #: The type of the class. Equivalent to protobuf TaskType, + #: "ACTOR" if it represents an Actor, or "GROUP" if it's a grouping of tasks. + type: str + #: Unix timestamp to use to sort the task group. + timestamp: Optional[int] = None + #: State name to the count dict. State name is equivalent to + #: the protobuf TaskStatus. + state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) + #: The child + children: List["NestedTaskSummary"] = field(default_factory=list) + #: A link to more details about this summary. + link: Optional[Link] = None + + +@dataclass +class TaskSummaries: + #: Group key -> summary. + #: Right now, we only have func_class_name as a key. + # TODO(sang): Support the task group abstraction. + summary: Union[Dict[str, TaskSummaryPerFuncOrClassName], List[NestedTaskSummary]] + #: Total Ray tasks. + total_tasks: int + #: Total actor tasks. + total_actor_tasks: int + #: Total scheduled actors. + total_actor_scheduled: int + summary_by: str = "func_name" + + @classmethod + def to_summary_by_func_name(cls, *, tasks: List[Dict]) -> "TaskSummaries": + # NOTE: The argument tasks contains a list of dictionary + # that have the same k/v as TaskState. 
+ summary = {} + total_tasks = 0 + total_actor_tasks = 0 + total_actor_scheduled = 0 + + for task in tasks: + key = task["func_or_class_name"] + if key not in summary: + summary[key] = TaskSummaryPerFuncOrClassName( + func_or_class_name=task["func_or_class_name"], + type=task["type"], + ) + task_summary = summary[key] + + state = task["state"] + if state not in task_summary.state_counts: + task_summary.state_counts[state] = 0 + task_summary.state_counts[state] += 1 + + type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number + if type_enum == TaskType.NORMAL_TASK: + total_tasks += 1 + elif type_enum == TaskType.ACTOR_CREATION_TASK: + total_actor_scheduled += 1 + elif type_enum == TaskType.ACTOR_TASK: + total_actor_tasks += 1 + + return TaskSummaries( + summary=summary, + total_tasks=total_tasks, + total_actor_tasks=total_actor_tasks, + total_actor_scheduled=total_actor_scheduled, + summary_by="func_name", + ) + + @classmethod + def to_summary_by_lineage( + cls, *, tasks: List[Dict], actors: List[Dict] + ) -> "TaskSummaries": + """ + This summarizes tasks by lineage. + i.e. A task will be grouped with another task if they have the + same parent. + + This does things in 4 steps. + Step 1: Iterate through all tasks and keep track of them by id and ownership + Step 2: Put the tasks in a tree structure based on ownership + Step 3: Merge together siblings in the tree if there are more + than one with the same name. + Step 4: Total the children + + This can probably be more efficient if we merge together some steps to + reduce the amount of iterations but this algorithm produces very easy to + understand code. We can optimize in the future. + """ + # NOTE: The argument tasks contains a list of dictionary + # that have the same k/v as TaskState. 
+ + tasks_by_id = {} + task_group_by_id = {} + actor_creation_task_id_for_actor_id = {} + summary = [] + total_tasks = 0 + total_actor_tasks = 0 + total_actor_scheduled = 0 + + # Step 1 + # We cannot assume that a parent task always comes before the child task + # So we need to keep track of all tasks by ids so we can quickly find the + # parent. + # We also track the actor creation tasks so we can quickly figure out the + # ownership of actors. + for task in tasks: + tasks_by_id[task["task_id"]] = task + type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number + if type_enum == TaskType.ACTOR_CREATION_TASK: + actor_creation_task_id_for_actor_id[task["actor_id"]] = task["task_id"] + + actor_dict = {actor["actor_id"]: actor for actor in actors} + + def get_or_create_task_group(task_id: str) -> Optional[NestedTaskSummary]: + """ + Gets an already created task_group + OR + Creates a task group and puts it in the right place under its parent. + For actor tasks, the parent is the Actor that owns it. For all other + tasks, the owner is the driver or task that created it. + + Returns None if there is missing data about the task or one of its parents. + + For task groups that represents actors, the id is in the + format actor:{actor_id} + """ + if task_id in task_group_by_id: + return task_group_by_id[task_id] + + task = tasks_by_id.get(task_id) + if not task: + logger.debug(f"We're missing data about {task_id}") + # We're missing data about this parent. So we're dropping the whole + # tree at that node. + return None + + # Use name first which allows users to customize the name of + # their remote function call using the name option. 
+ func_name = task["name"] or task["func_or_class_name"] + task_id = task["task_id"] + type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number + + task_group_by_id[task_id] = NestedTaskSummary( + name=func_name, + key=task_id, + type=task["type"], + timestamp=task["creation_time_ms"], + link=Link(type="task", id=task_id), + ) + + # Set summary in right place under parent + if ( + type_enum == TaskType.ACTOR_TASK + or type_enum == TaskType.ACTOR_CREATION_TASK + ): + # For actor tasks, the parent is the actor and not the parent task. + parent_task_group = get_or_create_actor_task_group(task["actor_id"]) + if parent_task_group: + parent_task_group.children.append(task_group_by_id[task_id]) + else: + parent_task_id = task["parent_task_id"] + if not parent_task_id or parent_task_id.startswith( + DRIVER_TASK_ID_PREFIX + ): + summary.append(task_group_by_id[task_id]) + else: + parent_task_group = get_or_create_task_group(parent_task_id) + if parent_task_group: + parent_task_group.children.append(task_group_by_id[task_id]) + + return task_group_by_id[task_id] + + def get_or_create_actor_task_group( + actor_id: str, + ) -> Optional[NestedTaskSummary]: + """ + Gets an existing task group that represents an actor. + OR + Creates a task group that represents an actor. The owner of the actor is + the parent of the creation_task that created that actor. + + Returns None if there is missing data about the actor or one of its parents. + """ + key = f"actor:{actor_id}" + actor = actor_dict.get(actor_id) + if key not in task_group_by_id: + creation_task_id = actor_creation_task_id_for_actor_id.get(actor_id) + creation_task = tasks_by_id.get(creation_task_id) + + if not creation_task: + logger.debug(f"We're missing data about actor {actor_id}") + # We're missing data about the parent. So we're dropping the whole + # tree at that node. + return None + + # TODO(rickyx) + # We are using repr name for grouping actors if exists, + # else use class name. 
We should be using some group_name in the future. + if actor is None: + logger.debug( + f"We are missing actor info for actor {actor_id}, " + f"even though creation task exists: {creation_task}" + ) + [actor_name, *rest] = creation_task["func_or_class_name"].split(".") + else: + actor_name = ( + actor["repr_name"] + if actor["repr_name"] + else actor["class_name"] + ) + + task_group_by_id[key] = NestedTaskSummary( + name=actor_name, + key=key, + type="ACTOR", + timestamp=task["creation_time_ms"], + link=Link(type="actor", id=actor_id), + ) + + parent_task_id = creation_task["parent_task_id"] + if not parent_task_id or parent_task_id.startswith( + DRIVER_TASK_ID_PREFIX + ): + summary.append(task_group_by_id[key]) + else: + parent_task_group = get_or_create_task_group(parent_task_id) + if parent_task_group: + parent_task_group.children.append(task_group_by_id[key]) + + return task_group_by_id[key] + + # Step 2: Create the tree structure based on ownership + for task in tasks: + task_id = task["task_id"] + + task_group = get_or_create_task_group(task_id) + + if not task_group: + # We are probably missing data about this task or one of its parents. + continue + + state = task["state"] + if state not in task_group.state_counts: + task_group.state_counts[state] = 0 + task_group.state_counts[state] += 1 + + type_enum = TaskType.DESCRIPTOR.values_by_name[task["type"]].number + if type_enum == TaskType.NORMAL_TASK: + total_tasks += 1 + elif type_enum == TaskType.ACTOR_CREATION_TASK: + total_actor_scheduled += 1 + elif type_enum == TaskType.ACTOR_TASK: + total_actor_tasks += 1 + + def merge_sibings_for_task_group( + siblings: List[NestedTaskSummary], + ) -> Tuple[List[NestedTaskSummary], Optional[int]]: + """ + Merges task summaries with the same name into a group if there are more than + one child with that name. 
+ + Args: + siblings: A list of NestedTaskSummary's to merge together + + Returns + Index 0: A list of NestedTaskSummary's which have been merged + Index 1: The smallest timestamp amongst the siblings + """ + if not len(siblings): + return siblings, None + + # Group by name + groups = {} + min_timestamp = None + + for child in siblings: + child.children, child_min_timestamp = merge_sibings_for_task_group( + child.children + ) + if child_min_timestamp and child_min_timestamp < ( + child.timestamp or sys.maxsize + ): + child.timestamp = child_min_timestamp + + if child.name not in groups: + groups[child.name] = NestedTaskSummary( + name=child.name, + key=child.name, + type="GROUP", + ) + groups[child.name].children.append(child) + if child.timestamp and child.timestamp < ( + groups[child.name].timestamp or sys.maxsize + ): + groups[child.name].timestamp = child.timestamp + if child.timestamp < (min_timestamp or sys.maxsize): + min_timestamp = child.timestamp + + # Take the groups that have more than one children and return it. + # For groups with just one child, return the child itself instead of + # creating a group. + return [ + group if len(group.children) > 1 else group.children[0] + for group in groups.values() + ], min_timestamp + + # Step 3 + summary, _ = merge_sibings_for_task_group(summary) + + def sort_task_groups(task_groups: List[NestedTaskSummary]) -> None: + # Sort by timestamp + # Put actor creation tasks above other tasks with the same timestamp + task_groups.sort(key=lambda x: 0 if x.type == "ACTOR_CREATION_TASK" else 1) + task_groups.sort(key=lambda x: x.timestamp or sys.maxsize) + + def calc_total_for_task_group( + task_group: NestedTaskSummary, + ) -> NestedTaskSummary: + """ + Calculates the total of a group as the sum of all children. 
+ Sorts children by timestamp + """ + if not len(task_group.children): + return task_group + + for child in task_group.children: + totaled = calc_total_for_task_group(child) + + for state, count in totaled.state_counts.items(): + task_group.state_counts[state] = ( + task_group.state_counts.get(state, 0) + count + ) + + sort_task_groups(task_group.children) + + return task_group + + # Step 4 + summary = [calc_total_for_task_group(task_group) for task_group in summary] + sort_task_groups(summary) + + return TaskSummaries( + summary=summary, + total_tasks=total_tasks, + total_actor_tasks=total_actor_tasks, + total_actor_scheduled=total_actor_scheduled, + summary_by="lineage", + ) + + +@dataclass(init=True) +class ActorSummaryPerClass: + #: The class name of the actor. + class_name: str + #: State name to the count dict. State name is equivalent to + #: the protobuf ActorState. + state_counts: Dict[TypeActorStatus, int] = field(default_factory=dict) + + +@dataclass +class ActorSummaries: + #: Group key (actor class name) -> summary + summary: Dict[str, ActorSummaryPerClass] + #: Total number of actors + total_actors: int + summary_by: str = "class" + + @classmethod + def to_summary(cls, *, actors: List[Dict]): + # NOTE: The argument tasks contains a list of dictionary + # that have the same k/v as ActorState. + summary = {} + total_actors = 0 + + for actor in actors: + key = actor["class_name"] + if key not in summary: + summary[key] = ActorSummaryPerClass( + class_name=actor["class_name"], + ) + actor_summary = summary[key] + + state = actor["state"] + if state not in actor_summary.state_counts: + actor_summary.state_counts[state] = 0 + actor_summary.state_counts[state] += 1 + + total_actors += 1 + + return ActorSummaries( + summary=summary, + total_actors=total_actors, + ) + + +@dataclass(init=True) +class ObjectSummaryPerKey: + #: Total number of objects of the type. + total_objects: int + #: Total size in mb. 
+ total_size_mb: float + #: Total number of workers that reference the type of objects. + total_num_workers: int + #: Total number of nodes that reference the type of objects. + total_num_nodes: int + #: State name to the count dict. State name is equivalent to + #: ObjectState. + task_state_counts: Dict[TypeTaskStatus, int] = field(default_factory=dict) + #: Ref count type to the count dict. State name is equivalent to + #: ObjectState. + ref_type_counts: Dict[TypeReferenceType, int] = field(default_factory=dict) + + +@dataclass +class ObjectSummaries: + #: Group key (actor class name) -> summary + summary: Dict[str, ObjectSummaryPerKey] + #: Total number of referenced objects in the cluster. + total_objects: int + #: Total size of referenced objects in the cluster in MB. + total_size_mb: float + #: Whether or not the callsite collection is enabled. + callsite_enabled: bool + summary_by: str = "callsite" + + @classmethod + def to_summary(cls, *, objects: List[Dict]): + # NOTE: The argument tasks contains a list of dictionary + # that have the same k/v as ObjectState. 
+ summary = {} + total_objects = 0 + total_size_mb = 0 + key_to_workers = {} + key_to_nodes = {} + callsite_enabled = True + + for object in objects: + key = object["call_site"] + if key == "disabled": + callsite_enabled = False + if key not in summary: + summary[key] = ObjectSummaryPerKey( + total_objects=0, + total_size_mb=0, + total_num_workers=0, + total_num_nodes=0, + ) + key_to_workers[key] = set() + key_to_nodes[key] = set() + + object_summary = summary[key] + + task_state = object["task_status"] + if task_state not in object_summary.task_state_counts: + object_summary.task_state_counts[task_state] = 0 + object_summary.task_state_counts[task_state] += 1 + + ref_type = object["reference_type"] + if ref_type not in object_summary.ref_type_counts: + object_summary.ref_type_counts[ref_type] = 0 + object_summary.ref_type_counts[ref_type] += 1 + object_summary.total_objects += 1 + total_objects += 1 + + size_bytes = object["object_size"] + # object_size's unit is byte by default. It is -1, if the size is + # unknown. + if size_bytes != -1: + object_summary.total_size_mb += size_bytes / 1024**2 + total_size_mb += size_bytes / 1024**2 + + key_to_workers[key].add(object["pid"]) + key_to_nodes[key].add(object["ip"]) + + # Convert set of pid & node ips to length. + for key, workers in key_to_workers.items(): + summary[key].total_num_workers = len(workers) + for key, nodes in key_to_nodes.items(): + summary[key].total_num_nodes = len(nodes) + + return ObjectSummaries( + summary=summary, + total_objects=total_objects, + total_size_mb=total_size_mb, + callsite_enabled=callsite_enabled, + ) + + +@dataclass(init=True) +class StateSummary: + #: Node ID -> summary per node + #: If the data is not required to be orgnized per node, it will contain + #: a single key, "cluster". 
+ node_id_to_summary: Dict[str, Union[TaskSummaries, ActorSummaries, ObjectSummaries]] + + +@dataclass(init=True) +class SummaryApiResponse: + # Carried over from ListApiResponse + # We currently use list API for listing the resources + total: int + # Carried over from ListApiResponse + # Number of resources returned by data sources after truncation + num_after_truncation: int + # Number of resources after filtering + num_filtered: int + result: StateSummary = None + partial_failure_warning: Optional[str] = "" + # A list of warnings to print. + warnings: Optional[List[str]] = None + + +def resource_to_schema(resource: StateResource) -> StateSchema: + if resource == StateResource.ACTORS: + return ActorState + elif resource == StateResource.JOBS: + return JobState + elif resource == StateResource.NODES: + return NodeState + elif resource == StateResource.OBJECTS: + return ObjectState + elif resource == StateResource.PLACEMENT_GROUPS: + return PlacementGroupState + elif resource == StateResource.RUNTIME_ENVS: + return RuntimeEnvState + elif resource == StateResource.TASKS: + return TaskState + elif resource == StateResource.WORKERS: + return WorkerState + elif resource == StateResource.CLUSTER_EVENTS: + return ClusterEventState + else: + assert False, "Unreachable" + + +def protobuf_message_to_dict( + message, + fields_to_decode: List[str], + preserving_proto_field_name: bool = True, +) -> dict: + """Convert a protobuf message to dict + + Args: + fields_to_decode: field names which will be decoded from binary to hex. + preserving_proto_field_name: a pass-through option for protobuf message + method. See google.protobuf MessageToDict + + Return: + Dictionary of the converted rpc protobuf. 
+ """ + return dashboard_utils.message_to_dict( + message, + fields_to_decode, + including_default_value_fields=True, + preserving_proto_field_name=preserving_proto_field_name, + ) + + +def protobuf_to_task_state_dict(message: TaskEvents) -> dict: + """ + Convert a TaskEvents to a dic repr of `TaskState` + """ + task_attempt = protobuf_message_to_dict( + message=message, + fields_to_decode=[ + "task_id", + "job_id", + "node_id", + "actor_id", + "parent_task_id", + "worker_id", + "placement_group_id", + "component_id", + ], + ) + + task_state = {} + task_info = task_attempt.get("task_info", {}) + state_updates = task_attempt.get("state_updates", {}) + profiling_data = task_attempt.get("profile_events", {}) + if profiling_data: + for event in profiling_data["events"]: + # End/start times are recorded in ns. We convert them to ms. + event["end_time"] = int(event["end_time"]) / 1e6 + event["start_time"] = int(event["start_time"]) / 1e6 + event["extra_data"] = json.loads(event["extra_data"]) + task_state["profiling_data"] = profiling_data + + # Convert those settable fields + mappings = [ + ( + task_info, + [ + "task_id", + "name", + "actor_id", + "type", + "func_or_class_name", + "language", + "required_resources", + "runtime_env_info", + "parent_task_id", + "placement_group_id", + ], + ), + (task_attempt, ["task_id", "attempt_number", "job_id"]), + ( + state_updates, + ["node_id", "worker_id", "task_log_info", "actor_repr_name"], + ), + ] + for src, keys in mappings: + for key in keys: + task_state[key] = src.get(key) + + task_state["creation_time_ms"] = None + task_state["start_time_ms"] = None + task_state["end_time_ms"] = None + events = [] + + for state in TaskStatus.keys(): + key = f"{state.lower()}_ts" + if key in state_updates: + # timestamp is recorded as nanosecond from the backend. + # We need to convert it to the second. 
+ ts_ms = int(state_updates[key]) // 1e6 + events.append( + { + "state": state, + "created_ms": ts_ms, + } + ) + if state == "PENDING_ARGS_AVAIL": + task_state["creation_time_ms"] = ts_ms + if state == "RUNNING": + task_state["start_time_ms"] = ts_ms + if state == "FINISHED" or state == "FAILED": + task_state["end_time_ms"] = ts_ms + + task_state["events"] = events + if len(events) > 0: + latest_state = events[-1]["state"] + else: + latest_state = "NIL" + task_state["state"] = latest_state + + # Parse error info + if latest_state == "FAILED": + error_info = state_updates.get("error_info", None) + if error_info: + # We captured colored error message printed to console, e.g. + # "\x1b[31mTraceback (most recent call last):\x1b[0m", + # this is to remove the ANSI escape codes. + task_state["error_message"] = remove_ansi_escape_codes( + error_info.get("error_message", "") + ) + task_state["error_type"] = error_info.get("error_type", "") + + # Parse actor task name for actor with repr name. + if ( + state_updates.get("actor_repr_name") + and task_state["type"] == "ACTOR_TASK" + and task_state["name"] + == task_state["func_or_class_name"] # no name option provided. + ): + # If it's an actor task with no name override, and has repr name defined + # for the actor, we override the name. + method_name = task_state["name"].split(".")[-1] + actor_repr_task_name = f"{state_updates['actor_repr_name']}.{method_name}" + task_state["name"] = actor_repr_task_name + + return task_state + + +def remove_ansi_escape_codes(text: str) -> str: + """Remove ANSI escape codes from a string.""" + import re + + return re.sub(r"\x1b[^m]*m", "", text) + + +def dict_to_state(d: Dict, state_schema: StateSchema) -> StateSchema: + """Convert a dict to a state schema. + + Args: + d: a dict to convert. + state_schema: a schema to convert to. + + Returns: + A state schema. 
+ """ + try: + return resource_to_schema(state_schema)(**d) + except Exception as e: + raise RayStateApiException(f"Failed to convert {d} to StateSchema: {e}") from e diff --git a/python/ray/util/state/custom_types.py b/python/ray/util/state/custom_types.py new file mode 100644 index 000000000000..5f3535a27446 --- /dev/null +++ b/python/ray/util/state/custom_types.py @@ -0,0 +1,100 @@ +import sys + +from ray.core.generated.common_pb2 import ( + TaskStatus, + TaskType, + WorkerExitType, + WorkerType, +) +from ray.core.generated.gcs_pb2 import ( + ActorTableData, + GcsNodeInfo, + PlacementGroupTableData, +) +from ray.dashboard.memory_utils import ReferenceType + +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + + +ACTOR_STATUS = [ + "DEPENDENCIES_UNREADY", + "PENDING_CREATION", + "ALIVE", + "RESTARTING", + "DEAD", +] +TypeActorStatus = Literal[tuple(ACTOR_STATUS)] +PLACEMENT_GROUP_STATUS = [ + "PENDING", + "CREATED", + "REMOVED", + "RESCHEDULING", +] +TypePlacementGroupStatus = Literal[tuple(PLACEMENT_GROUP_STATUS)] +TASK_STATUS = [ + "NIL", + "PENDING_ARGS_AVAIL", + "PENDING_NODE_ASSIGNMENT", + "PENDING_OBJ_STORE_MEM_AVAIL", + "PENDING_ARGS_FETCH", + "SUBMITTED_TO_WORKER", + "RUNNING", + "RUNNING_IN_RAY_GET", + "RUNNING_IN_RAY_WAIT", + "FINISHED", + "FAILED", +] +TypeTaskStatus = Literal[tuple(TASK_STATUS)] +NODE_STATUS = ["ALIVE", "DEAD"] +TypeNodeStatus = Literal[tuple(NODE_STATUS)] +WORKER_TYPE = [ + "WORKER", + "DRIVER", + "SPILL_WORKER", + "RESTORE_WORKER", +] +TypeWorkerType = Literal[tuple(WORKER_TYPE)] +WORKER_EXIT_TYPE = [ + "SYSTEM_ERROR", + "INTENDED_SYSTEM_EXIT", + "USER_ERROR", + "INTENDED_USER_EXIT", + "NODE_OUT_OF_MEMORY", +] +TypeWorkerExitType = Literal[tuple(WORKER_EXIT_TYPE)] +TASK_TYPE = [ + "NORMAL_TASK", + "ACTOR_CREATION_TASK", + "ACTOR_TASK", + "DRIVER_TASK", +] +TypeTaskType = Literal[tuple(TASK_TYPE)] +TypeReferenceType = Literal[ + tuple(reference_type.value for reference_type in 
ReferenceType) +] + + +def validate_protobuf_enum(grpc_enum, custom_enum): + """Validate the literal contains the correct enum values from protobuf""" + enum_vals = set(grpc_enum.DESCRIPTOR.values_by_name) + # Sometimes, the grpc enum is mocked, and it + # doesn't include any values in that case. + if len(enum_vals) > 0: + assert enum_vals == set(custom_enum) + + +# Do the enum validation here. +# It is necessary to avoid regression. Alternatively, we can auto generate this +# directly by protobuf. +validate_protobuf_enum(ActorTableData.ActorState, ACTOR_STATUS) +validate_protobuf_enum( + PlacementGroupTableData.PlacementGroupState, PLACEMENT_GROUP_STATUS +) +validate_protobuf_enum(TaskStatus, TASK_STATUS) +validate_protobuf_enum(GcsNodeInfo.GcsNodeState, NODE_STATUS) +validate_protobuf_enum(WorkerType, WORKER_TYPE) +validate_protobuf_enum(WorkerExitType, WORKER_EXIT_TYPE) +validate_protobuf_enum(TaskType, TASK_TYPE) diff --git a/python/ray/util/state/exception.py b/python/ray/util/state/exception.py new file mode 100644 index 000000000000..8d8a180c2c32 --- /dev/null +++ b/python/ray/util/state/exception.py @@ -0,0 +1,18 @@ +"""Internal Error""" + + +class DataSourceUnavailable(Exception): + pass + + +"""User-facing Error""" + + +class RayStateApiException(Exception): + pass + + +class ServerUnavailable(RayStateApiException): + """Thrown when failing to connect to dashboard server""" + + pass diff --git a/python/ray/util/state/state_cli.py b/python/ray/util/state/state_cli.py new file mode 100644 index 000000000000..f8c992d58b2d --- /dev/null +++ b/python/ray/util/state/state_cli.py @@ -0,0 +1,1308 @@ +import json +import logging +from datetime import datetime +from enum import Enum, unique +from typing import Dict, List, Optional, Tuple + +import click +import yaml + +import ray._private.services as services +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray.util.state import ( + StateApiClient, + get_log, + list_logs, + summarize_actors, + 
    summarize_objects,
+    summarize_tasks,
+)
+from ray.util.state.common import (
+    DEFAULT_LIMIT,
+    DEFAULT_LOG_LIMIT,
+    DEFAULT_RPC_TIMEOUT,
+    GetApiOptions,
+    ListApiOptions,
+    PredicateType,
+    StateResource,
+    StateSchema,
+    SupportedFilterType,
+    resource_to_schema,
+)
+from ray.util.state.exception import RayStateApiException
+from ray.util.annotations import PublicAPI
+
+logger = logging.getLogger(__name__)
+
+
+@unique
+class AvailableFormat(Enum):
+    DEFAULT = "default"
+    JSON = "json"
+    YAML = "yaml"
+    TABLE = "table"
+
+
+def _parse_filter(filter: str) -> Tuple[str, PredicateType, SupportedFilterType]:
+    """Parse the filter string to a tuple of key, predicate, and value."""
+    # The function assumes there's going to be no key that includes "=" or "!=".
+    # Since key is controlled by us, it should be trivial to keep the invariant.
+    predicate = None
+    # Tuple of [predicate_start, predicate_end).
+    predicate_index = None
+
+    # Find the first predicate match. This logic works because we assume the
+    # key doesn't contain = or !=.
+    for i in range(len(filter)):
+        char = filter[i]
+        if char == "=":
+            predicate = "="
+            predicate_index = (i, i + 1)
+            break
+        elif char == "!":
+            if len(filter) <= i + 1:
+                continue
+
+            next_char = filter[i + 1]
+            if next_char == "=":
+                predicate = "!="
+                predicate_index = (i, i + 2)
+                break
+
+    if not predicate or not predicate_index:
+        raise ValueError(
+            f"The format of a given filter {filter} is invalid: "
+            "Cannot find the predicate. "
+            "Please provide key=val or key!=val format string."
+        )
+
+    key, predicate, value = (
+        filter[: predicate_index[0]],
+        filter[predicate_index[0] : predicate_index[1]],
+        filter[predicate_index[1] :],
+    )
+
+    assert predicate == "=" or predicate == "!="
+    if len(key) == 0 or len(value) == 0:
+        raise ValueError(
+            f"The format of a given filter {filter} is invalid: "
+            f"Cannot identify key {key} or value, {value}. "
+            "Please provide key=val or key!=val format string."
+ ) + + return (key, predicate, value) + + +def _get_available_formats() -> List[str]: + """Return the available formats in a list of string""" + return [format_enum.value for format_enum in AvailableFormat] + + +def _get_available_resources( + excluded: Optional[List[StateResource]] = None, +) -> List[str]: + """Return the available resources in a list of string + + Args: + excluded: List of resources that should be excluded + """ + # All resource names use '_' rather than '-'. But users options have '-' + return [ + e.value.replace("_", "-") + for e in StateResource + if excluded is None or e not in excluded + ] + + +def get_table_output(state_data: List, schema: StateSchema, detail: bool) -> str: + """Display the table output. + + The table headers are ordered as the order defined in the dataclass of + `StateSchema`. For example, + + @dataclass + class A(StateSchema): + a: str + b: str + c: str + + will create headers + A B C + ----- + + Args: + state_data: A list of state data. + schema: The schema for the corresponding resource. + + Returns: + The table formatted string. 
+ """ + time = datetime.now() + header = "=" * 8 + f" List: {time} " + "=" * 8 + headers = [] + table = [] + cols = schema.list_columns(detail=detail) + for data in state_data: + for key, val in data.items(): + if isinstance(val, dict): + data[key] = yaml.dump(val, indent=2) + keys = set(data.keys()) + headers = [] + for col in cols: + if col in keys: + headers.append(col.upper()) + table.append([data[header.lower()] for header in headers]) + return f""" +{header} +Stats: +------------------------------ +Total: {len(state_data)} + +Table: +------------------------------ +{tabulate(table, headers=headers, showindex=True, tablefmt="plain", floatfmt=".3f")} +""" + + +def output_with_format( + state_data: List[Dict], + *, + schema: Optional[StateSchema], + format: AvailableFormat = AvailableFormat.DEFAULT, + detail: bool = False, +) -> str: + # humanify all input state data + if schema: + state_data = [schema.humanify(state) for state in state_data] + if format == AvailableFormat.DEFAULT: + return get_table_output(state_data, schema, detail) + if format == AvailableFormat.YAML: + return yaml.dump( + state_data, + indent=4, + explicit_start=True, + # We want to keep the defined ordering of the states, thus sort_keys=False + sort_keys=False, + ) + elif format == AvailableFormat.JSON: + return json.dumps(state_data) + elif format == AvailableFormat.TABLE: + return get_table_output(state_data, schema, detail) + else: + raise ValueError( + f"Unexpected format: {format}. " + f"Supported formatting: {_get_available_formats()}" + ) + + +def format_summary_output(state_data: Dict, *, resource: StateResource) -> str: + if len(state_data) == 0: + return "No resource in the cluster" + + # Parse the data. + cluster_data = state_data["cluster"] + summaries = cluster_data["summary"] + summary_by = cluster_data["summary_by"] + del cluster_data["summary_by"] + del cluster_data["summary"] + + cluster_info_table = yaml.dump(cluster_data, indent=2) + + # Create a table. 
+ table = [] + headers = [] + for summary in summaries.values(): + # Convert dict to yaml for better formatting. + for key, val in summary.items(): + if isinstance(val, dict): + summary[key] = yaml.dump(val, indent=2) + + headers = sorted([key.upper() for key in summary.keys()]) + table.append([summary[header.lower()] for header in headers]) + + summary_table = tabulate( + table, headers=headers, showindex=True, tablefmt="plain", numalign="left" + ) + + time = datetime.now() + header = "=" * 8 + f" {resource.value.capitalize()} Summary: {time} " + "=" * 8 + return f""" +{header} +Stats: +------------------------------------ +{cluster_info_table} + +Table (group by {summary_by}): +------------------------------------ +{summary_table} +""" + + +def format_object_summary_output(state_data: Dict) -> str: + if len(state_data) == 0: + return "No resource in the cluster" + + # Parse the data. + cluster_data = state_data["cluster"] + summaries = cluster_data["summary"] + summary_by = cluster_data["summary_by"] + del cluster_data["summary_by"] + del cluster_data["summary"] + + cluster_info_table = yaml.dump(cluster_data, indent=2) + + # Create a table per callsite. + tables = [] + for callsite, summary in summaries.items(): + # Convert dict to yaml for better formatting. + for key, val in summary.items(): + if isinstance(val, dict): + summary[key] = yaml.dump(val, indent=2) + + table = [] + headers = sorted([key.upper() for key in summary.keys()]) + table.append([summary[header.lower()] for header in headers]) + table_for_callsite = tabulate( + table, headers=headers, showindex=True, numalign="left" + ) + + # Format callsite. | is a separator for ray callsite. 
+ formatted_callsite = callsite.replace("|", "\n|") + tables.append(f"{formatted_callsite}\n{table_for_callsite}") + + time = datetime.now() + header = "=" * 8 + f" Object Summary: {time} " + "=" * 8 + table_string = "\n\n\n\n".join(tables) + return f""" +{header} +Stats: +------------------------------------ +{cluster_info_table} + +Table (group by {summary_by}) +------------------------------------ +{table_string} +""" + + +def format_get_api_output( + state_data: Optional[StateSchema], + id: str, + *, + schema: StateSchema, + format: AvailableFormat = AvailableFormat.YAML, +) -> str: + if not state_data or isinstance(state_data, list) and len(state_data) == 0: + return f"Resource with id={id} not found in the cluster." + + if not isinstance(state_data, list): + state_data = [state_data] + state_data = [state.asdict() for state in state_data] + + return output_with_format(state_data, schema=schema, format=format, detail=True) + + +def format_list_api_output( + state_data: List[StateSchema], + *, + schema: StateSchema, + format: AvailableFormat = AvailableFormat.DEFAULT, + detail: bool = False, +) -> str: + if len(state_data) == 0: + return "No resource in the cluster" + state_data = [state.asdict() for state in state_data] + return output_with_format(state_data, schema=schema, format=format, detail=detail) + + +def _should_explain(format: AvailableFormat) -> bool: + # If the format is json or yaml, it should not print stats because + # users don't want additional strings. + return format == AvailableFormat.DEFAULT or format == AvailableFormat.TABLE + + +""" +Common Options for State API commands +""" +timeout_option = click.option( + "--timeout", + default=DEFAULT_RPC_TIMEOUT, + help=f"Timeout in seconds for the API requests. Default is {DEFAULT_RPC_TIMEOUT}", +) +address_option = click.option( + "--address", + default=None, + help=( + "The address of Ray API server. If not provided, it will be configured " + "automatically from querying the GCS server." 
+ ), +) + + +@click.command() +@click.argument( + "resource", + # NOTE(rickyyx): We are not allowing query job with id, and runtime envs + type=click.Choice( + _get_available_resources( + excluded=[StateResource.JOBS, StateResource.RUNTIME_ENVS] + ) + ), +) +@click.argument( + "id", + type=str, +) +@address_option +@timeout_option +@PublicAPI(stability="stable") +def ray_get( + resource: str, + id: str, + address: Optional[str], + timeout: float, +): + """Get a state of a given resource by ID. + + We currently DO NOT support get by id for jobs and runtime-envs + + The output schema is defined at :ref:`State API Schema section. ` + + For example, the output schema of `ray get tasks ` is + :class:`~ray.util.state.common.TaskState`. + + Usage: + + Get an actor with actor id + + ``` + ray get actors + ``` + + Get a placement group information with + + ``` + ray get placement-groups + ``` + + The API queries one or more components from the cluster to obtain the data. + The returned state snapshot could be stale, and it is not guaranteed to return + the live data. + + Args: + resource: The type of the resource to query. + id: The id of the resource. + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + # All resource names use '_' rather than '-'. But users options have '-' + resource = StateResource(resource.replace("-", "_")) + + # Create the State API server and put it into context + logger.debug(f"Create StateApiClient to ray instance at: {address}...") + client = StateApiClient(address=address) + options = GetApiOptions(timeout=timeout) + + # If errors occur, exceptions will be thrown. + try: + data = client.get( + resource=resource, + id=id, + options=options, + _explain=_should_explain(AvailableFormat.YAML), + ) + except RayStateApiException as e: + raise click.UsageError(str(e)) + + # Print data to console. 
+ print( + format_get_api_output( + state_data=data, + id=id, + schema=resource_to_schema(resource), + format=AvailableFormat.YAML, + ) + ) + + +@click.command() +@click.argument( + "resource", + type=click.Choice(_get_available_resources()), +) +@click.option( + "--format", default="default", type=click.Choice(_get_available_formats()) +) +@click.option( + "-f", + "--filter", + help=( + "A key, predicate, and value to filter the result. " + "E.g., --filter 'key=value' or --filter 'key!=value'. " + "You can specify multiple --filter options. In this case all predicates " + "are concatenated as AND. For example, --filter key=value --filter key2=value " + "means (key==val) AND (key2==val2)" + ), + multiple=True, +) +@click.option( + "--limit", + default=DEFAULT_LIMIT, + type=int, + help=("Maximum number of entries to return. 100 by default."), +) +@click.option( + "--detail", + help=( + "If the flag is set, the output will contain data in more details. " + "Note that the API could query more sources " + "to obtain information in a greater detail." + ), + is_flag=True, + default=False, +) +@timeout_option +@address_option +@PublicAPI(stability="stable") +def ray_list( + resource: str, + format: str, + filter: List[str], + limit: int, + detail: bool, + timeout: float, + address: str, +): + """List all states of a given resource. + + Normally, summary APIs are recommended before listing all resources. + + The output schema is defined at :ref:`State API Schema section. ` + + For example, the output schema of `ray list tasks` is + :class:`~ray.util.state.common.TaskState`. + + Usage: + + List all actor information from the cluster. + + ``` + ray list actors + ``` + + List 50 actors from the cluster. The sorting order cannot be controlled. + + ``` + ray list actors --limit 50 + ``` + + List 10 actors with state PENDING. + + ``` + ray list actors --limit 10 --filter "state=PENDING" + ``` + + List actors with yaml format. 
+ + ``` + ray list actors --format yaml + ``` + + List actors with details. When --detail is specified, it might query + more data sources to obtain data in details. + + ``` + ray list actors --detail + ``` + + The API queries one or more components from the cluster to obtain the data. + The returned state snapshot could be stale, and it is not guaranteed to return + the live data. + + The API can return partial or missing output upon the following scenarios. + + - When the API queries more than 1 component, if some of them fail, + the API will return the partial result (with a suppressible warning). + - When the API returns too many entries, the API + will truncate the output. Currently, truncated data cannot be + selected by users. + + Args: + resource: The type of the resource to query. + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + # All resource names use '_' rather than '-'. But users options have '-' + resource = StateResource(resource.replace("-", "_")) + format = AvailableFormat(format) + + # Create the State API server and put it into context + client = StateApiClient(address=address) + + filter = [_parse_filter(f) for f in filter] + + options = ListApiOptions( + limit=limit, + timeout=timeout, + filters=filter, + detail=detail, + ) + + # If errors occur, exceptions will be thrown. Empty data indicate successful query. + try: + data = client.list( + resource, + options=options, + raise_on_missing_output=False, + _explain=_should_explain(format), + ) + except RayStateApiException as e: + raise click.UsageError(str(e)) + + # If --detail is given, the default formatting is yaml. + if detail and format == AvailableFormat.DEFAULT: + format = AvailableFormat.YAML + + # Print data to console. 
+ print( + format_list_api_output( + state_data=data, + schema=resource_to_schema(resource), + format=format, + detail=detail, + ) + ) + + +@click.group("summary") +@click.pass_context +@PublicAPI(stability="stable") +def summary_state_cli_group(ctx): + """Return the summarized information of a given resource.""" + pass + + +@summary_state_cli_group.command(name="tasks") +@timeout_option +@address_option +@click.pass_context +@PublicAPI(stability="stable") +def task_summary(ctx, timeout: float, address: str): + """Summarize the task state of the cluster. + + By default, the output contains the information grouped by + task function names. + + The output schema is + :class:`~ray.util.state.common.TaskSummaries`. + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + print( + format_summary_output( + summarize_tasks( + address=address, + timeout=timeout, + raise_on_missing_output=False, + _explain=True, + ), + resource=StateResource.TASKS, + ) + ) + + +@summary_state_cli_group.command(name="actors") +@timeout_option +@address_option +@click.pass_context +@PublicAPI(stability="stable") +def actor_summary(ctx, timeout: float, address: str): + """Summarize the actor state of the cluster. + + By default, the output contains the information grouped by + actor class names. + + The output schema is + :class:`ray.util.state.common.ActorSummaries + `. + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + print( + format_summary_output( + summarize_actors( + address=address, + timeout=timeout, + raise_on_missing_output=False, + _explain=True, + ), + resource=StateResource.ACTORS, + ) + ) + + +@summary_state_cli_group.command(name="objects") +@timeout_option +@address_option +@click.pass_context +@PublicAPI(stability="stable") +def object_summary(ctx, timeout: float, address: str): + """Summarize the object state of the cluster. 
+ + The API is recommended when debugging memory leaks. + See :ref:`Debugging with Ray Memory ` for more details. + (Note that this command is almost equivalent to `ray memory`, but it returns + easier-to-understand output). + + By default, the output contains the information grouped by + object callsite. Note that the callsite is not collected and + all data will be aggregated as "disable" callsite if the env var + `RAY_record_ref_creation_sites` is not configured. To enable the + callsite collection, set the following environment variable when + starting Ray. + + Example: + + ``` + RAY_record_ref_creation_sites=1 ray start --head + ``` + + ``` + RAY_record_ref_creation_sites=1 ray_script.py + ``` + + The output schema is + :class:`ray.util.state.common.ObjectSummaries + `. + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + """ # noqa: E501 + print( + format_object_summary_output( + summarize_objects( + address=address, + timeout=timeout, + raise_on_missing_output=False, + _explain=True, + ), + ) + ) + + +log_follow_option = click.option( + "--follow", + "-f", + required=False, + type=bool, + is_flag=True, + help="Streams the log file as it is updated instead of just tailing.", +) + +log_tail_option = click.option( + "--tail", + required=False, + type=int, + default=DEFAULT_LOG_LIMIT, + help="Number of lines to tail from log. Use -1 to fetch the whole file.", +) + +log_interval_option = click.option( + "--interval", + required=False, + type=float, + default=None, + help="The interval in secs to print new logs when `--follow` is specified.", + hidden=True, +) + +log_timeout_option = click.option( + "--timeout", + default=DEFAULT_RPC_TIMEOUT, + help=( + "Timeout in seconds for the API requests. " + f"Default is {DEFAULT_RPC_TIMEOUT}. If --follow is specified, " + "this option will be ignored." 
+ ), +) + +log_node_ip_option = click.option( + "-ip", + "--node-ip", + required=False, + type=str, + default=None, + help="Filters the logs by this ip address", +) + +log_node_id_option = click.option( + "--node-id", + "-id", + required=False, + type=str, + default=None, + help="Filters the logs by this NodeID", +) + +log_suffix_option = click.option( + "--err", + is_flag=True, + default=False, + help=( + "If supplied, querying stderr files for workers/actors, " + "else defaults to stdout files." + ), +) + +log_encoding_option = click.option( + "--encoding", + required=False, + default="utf-8", + help=( + "The encoding use to decode the log file. Accepts any encoding " + "supported by Python's `codecs` module. Defaults to utf-8." + ), +) + +log_encoding_errors_option = click.option( + "--encoding-errors", + required=False, + default="strict", + help=( + "The error handling scheme to use for decoding errors. " + "Accepts any error handling scheme supported by Python's `codecs`" + "module. Defaults to strict." + ), +) + + +def _get_head_node_ip(address: Optional[str] = None): + """Get the head node ip from the ray address if possible + + Args: + address: ray cluster address, e.g. 
"auto", "localhost:6379" + + Raises: + click.UsageError if node ip could not be resolved + """ + try: + address = services.canonicalize_bootstrap_address_or_die(address) + return address.split(":")[0] + except (ConnectionError, ValueError) as e: + # Hide all the stack trace + raise click.UsageError(str(e)) + + +def _print_log( + address: Optional[str] = None, + node_id: Optional[str] = None, + node_ip: Optional[str] = None, + filename: Optional[str] = None, + actor_id: Optional[str] = None, + pid: Optional[int] = None, + follow: bool = False, + tail: int = DEFAULT_LOG_LIMIT, + timeout: int = DEFAULT_RPC_TIMEOUT, + interval: Optional[float] = None, + suffix: str = "out", + encoding: str = "utf-8", + encoding_errors: str = "strict", + task_id: Optional[str] = None, + attempt_number: int = 0, + submission_id: Optional[str] = None, +): + """Wrapper around `get_log()` that prints the preamble and the log lines""" + if tail > 0: + print( + f"--- Log has been truncated to last {tail} lines." + " Use `--tail` flag to toggle. Set to -1 for getting the entire file. ---\n" + ) + + if node_id is None and node_ip is None: + # Auto detect node ip from the ray address when address neither is given + node_ip = _get_head_node_ip(address) + + for chunk in get_log( + address=address, + node_id=node_id, + node_ip=node_ip, + filename=filename, + actor_id=actor_id, + tail=tail, + pid=pid, + follow=follow, + _interval=interval, + timeout=timeout, + suffix=suffix, + encoding=encoding, + errors=encoding_errors, + task_id=task_id, + attempt_number=attempt_number, + submission_id=submission_id, + ): + print(chunk, end="", flush=True) + + +LOG_CLI_HELP_MSG = """ +Get logs based on filename (cluster) or resource identifiers (actor) + +Example: + + Get all the log files available on a node (ray address could be + obtained from `ray start --head` or `ray.init()`). + + ``` + ray logs cluster + ``` + + [ray logs cluster] Print the last 500 lines of raylet.out on a head node. 
+ + ``` + ray logs cluster raylet.out --tail 500 + ``` + + Or simply, using `ray logs` as an alias for `ray logs cluster`: + + ``` + ray logs raylet.out --tail 500 + ``` + + Print the last 500 lines of raylet.out on a worker node id A. + + ``` + ray logs raylet.out --tail 500 —-node-id A + ``` + + [ray logs actor] Follow the log file with an actor id ABC. + + ``` + ray logs actor --id ABC --follow + ``` + + [ray logs task] Get the std err generated by a task. + + ``` + ray logs task --id --err + ``` +""" + + +class LogCommandGroup(click.Group): + def resolve_command(self, ctx, args): + """Try resolve the command line args assuming users omitted the subcommand. + + This overrides the default `resolve_command` for the parent class. + This will allow command alias of `ray ` to `ray cluster `. + """ + ctx.resilient_parsing = True + res = super().resolve_command(ctx, args) + cmd_name, cmd, parsed_args = res + if cmd is None: + # It could have been `ray logs ...`, forward to `ray logs cluster ...` + return super().resolve_command(ctx, ["cluster"] + args) + return cmd_name, cmd, parsed_args + + +logs_state_cli_group = LogCommandGroup(help=LOG_CLI_HELP_MSG) + + +@logs_state_cli_group.command(name="cluster") +@click.argument( + "glob_filter", + required=False, + default="*", +) +@address_option +@log_node_id_option +@log_node_ip_option +@log_follow_option +@log_tail_option +@log_interval_option +@log_timeout_option +@log_encoding_option +@log_encoding_errors_option +@click.pass_context +@PublicAPI(stability="stable") +def log_cluster( + ctx, + glob_filter: str, + address: Optional[str], + node_id: Optional[str], + node_ip: Optional[str], + follow: bool, + tail: int, + interval: float, + timeout: int, + encoding: str, + encoding_errors: str, +): + """Get/List logs that matches the GLOB_FILTER in the cluster. + By default, it prints a list of log files that match the filter. + By default, it prints the head node logs. + If there's only 1 match, it will print the log file. 
+ + Example: + + Print the last 500 lines of raylet.out on a head node. + + ``` + ray logs [cluster] raylet.out --tail 500 + ``` + + Print the last 500 lines of raylet.out on a worker node id A. + + ``` + ray logs [cluster] raylet.out --tail 500 —-node-id A + ``` + + Download the gcs_server.txt file to the local machine. + + ``` + ray logs [cluster] gcs_server.out --tail -1 > gcs_server.txt + ``` + + Follow the log files from the last 100 lines. + + ``` + ray logs [cluster] raylet.out --tail 100 -f + ``` + + Raises: + :class:`RayStateApiException ` if the CLI + is failed to query the data. + """ # noqa: E501 + + if node_id is None and node_ip is None: + node_ip = _get_head_node_ip(address) + + logs = list_logs( + address=address, + node_id=node_id, + node_ip=node_ip, + glob_filter=glob_filter, + timeout=timeout, + ) + + log_files_found = [] + for _, log_files in logs.items(): + for log_file in log_files: + log_files_found.append(log_file) + + if len(log_files_found) != 1: + # Print the list of log files found if no unique log found + if node_id: + print(f"Node ID: {node_id}") + elif node_ip: + print(f"Node IP: {node_ip}") + print(output_with_format(logs, schema=None, format=AvailableFormat.YAML)) + return + + # If there's only 1 file, that means there's a unique match. 
+ filename = log_files_found[0] + + _print_log( + address=address, + node_id=node_id, + node_ip=node_ip, + filename=filename, + tail=tail, + follow=follow, + interval=interval, + timeout=timeout, + encoding=encoding, + encoding_errors=encoding_errors, + ) + + +@logs_state_cli_group.command(name="actor") +@click.option( + "--id", + "-a", + required=False, + type=str, + default=None, + help="Retrieves the logs corresponding to this ActorID.", +) +@click.option( + "--pid", + "-pid", + required=False, + type=str, + default=None, + help="Retrieves the logs from the actor with this pid.", +) +@address_option +@log_node_id_option +@log_node_ip_option +@log_follow_option +@log_tail_option +@log_interval_option +@log_timeout_option +@log_suffix_option +@click.pass_context +@PublicAPI(stability="stable") +def log_actor( + ctx, + id: Optional[str], + pid: Optional[str], + address: Optional[str], + node_id: Optional[str], + node_ip: Optional[str], + follow: bool, + tail: int, + interval: float, + timeout: int, + err: bool, +): + """Get/List logs associated with an actor. + + Example: + + Follow the log file with an actor id ABCDEFG. + + ``` + ray logs actor --id ABCDEFG --follow + ``` + + Get the actor log from pid 123, ip x.x.x.x + Note that this goes well with the driver log of Ray which prints + (ip=x.x.x.x, pid=123, class_name) logs. + + ``` + ray logs actor --pid=123 —ip=x.x.x.x + ``` + + Get the actor err log file. + + ``` + ray logs actor --id ABCDEFG --err + ``` + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + MissingParameter if inputs are missing. 
+ """ # noqa: E501 + + if pid is None and id is None: + raise click.MissingParameter( + message="At least one of `--pid` and `--id` has to be set", + param_type="option", + ) + + _print_log( + address=address, + node_id=node_id, + node_ip=node_ip, + pid=pid, + actor_id=id, + tail=tail, + follow=follow, + interval=interval, + timeout=timeout, + suffix="err" if err else "out", + ) + + +@logs_state_cli_group.command(name="worker") +@click.option( + "--pid", + "-pid", + # The only identifier supported for now, TODO(rickyx): add worker id support + required=True, + type=str, + help="Retrieves the logs from the worker with this pid.", +) +@address_option +@log_node_id_option +@log_node_ip_option +@log_follow_option +@log_tail_option +@log_interval_option +@log_timeout_option +@log_suffix_option +@click.pass_context +@PublicAPI(stability="stable") +def log_worker( + ctx, + pid: Optional[str], + address: Optional[str], + node_id: Optional[str], + node_ip: Optional[str], + follow: bool, + tail: int, + interval: float, + timeout: int, + err: bool, +): + """Get logs associated with a worker process. + + Example: + + Follow the log file from a worker process with pid=123 + + ``` + ray logs worker --pid 123 --follow + ``` + + Get the stderr logs from a worker process. + + ``` + ray logs worker --pid 123 --err + ``` + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + MissingParameter if inputs are missing. + """ # noqa: E501 + + _print_log( + address=address, + node_id=node_id, + node_ip=node_ip, + pid=pid, + tail=tail, + follow=follow, + interval=interval, + timeout=timeout, + suffix="err" if err else "out", + ) + + +@logs_state_cli_group.command(name="job") +@click.option( + "--id", + "submission_id", + required=True, + type=str, + help=( + "Retrieves the logs from a submission job with submission id," + "i.e. 
raysubmit_XXX" + ), +) +@address_option +@log_follow_option +@log_tail_option +@log_interval_option +@log_timeout_option +@click.pass_context +@PublicAPI(stability="stable") +def log_job( + ctx, + submission_id: Optional[str], + address: Optional[str], + follow: bool, + tail: int, + interval: float, + timeout: int, +): + """Get logs associated with a submission job. + + Example: + + Follow the log file from a submission job with submission id raysumbit_xxx. + + ``` + ray logs job --id raysubmit_xxx + ``` + + Follow the submission job log. + + ``` + ray logs jobs --id raysubmit_xxx --follow + + ``` + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + MissingParameter if inputs are missing. + """ # noqa: E501 + + _print_log( + address=address, + tail=tail, + follow=follow, + interval=interval, + timeout=timeout, + submission_id=submission_id, + ) + + +@logs_state_cli_group.command(name="task") +@click.option( + "--id", + "task_id", + required=True, + type=str, + help="Retrieves the logs from the task with this task id.", +) +@click.option( + "--attempt-number", + "-a", + required=False, + type=int, + default=0, + help="Retrieves the logs from the attempt, default to 0", +) +@address_option +@log_follow_option +@log_interval_option +@log_tail_option +@log_timeout_option +@log_suffix_option +@click.pass_context +@PublicAPI(stability="stable") +def log_task( + ctx, + task_id: Optional[str], + attempt_number: int, + address: Optional[str], + follow: bool, + interval: float, + tail: int, + timeout: int, + err: bool, +): + """Get logs associated with a task. + + Example: + + Follow the log file from a task with task id = ABCDEFG + + ``` + ray logs tasks --id ABCDEFG --follow + ``` + + Get the log from a retry attempt 1 from a task. + + ``` + ray logs tasks --id ABCDEFG -a 1 + ``` + + Raises: + :class:`RayStateApiException ` + if the CLI is failed to query the data. + MissingParameter if inputs are missing. 
+ """ # noqa: E501 + + _print_log( + address=address, + task_id=task_id, + attempt_number=attempt_number, + follow=follow, + tail=tail, + interval=interval, + timeout=timeout, + suffix="err" if err else "out", + ) diff --git a/python/ray/util/state/state_manager.py b/python/ray/util/state/state_manager.py new file mode 100644 index 000000000000..5617e8ea9e14 --- /dev/null +++ b/python/ray/util/state/state_manager.py @@ -0,0 +1,457 @@ +import dataclasses +import inspect +import logging +from collections import defaultdict +from functools import wraps +from typing import List, Optional, Tuple + +import grpc +from grpc.aio._call import UnaryStreamCall + +import ray +import ray.dashboard.modules.log.log_consts as log_consts +from ray._private import ray_constants +from ray._private.gcs_utils import GcsAioClient +from ray._private.utils import hex_to_binary +from ray._raylet import ActorID, JobID, TaskID +from ray.core.generated import gcs_service_pb2_grpc +from ray.core.generated.gcs_pb2 import ActorTableData +from ray.core.generated.gcs_service_pb2 import ( + GetAllActorInfoReply, + GetAllActorInfoRequest, + GetAllNodeInfoReply, + GetAllNodeInfoRequest, + GetAllPlacementGroupReply, + GetAllPlacementGroupRequest, + GetAllWorkerInfoReply, + GetAllWorkerInfoRequest, + GetTaskEventsReply, + GetTaskEventsRequest, +) +from ray.core.generated.node_manager_pb2 import ( + GetObjectsInfoReply, + GetObjectsInfoRequest, + GetTasksInfoReply, + GetTasksInfoRequest, +) +from ray.core.generated.node_manager_pb2_grpc import NodeManagerServiceStub +from ray.core.generated.reporter_pb2 import ( + ListLogsReply, + ListLogsRequest, + StreamLogRequest, +) +from ray.core.generated.reporter_pb2_grpc import LogServiceStub +from ray.core.generated.runtime_env_agent_pb2 import ( + GetRuntimeEnvsInfoReply, + GetRuntimeEnvsInfoRequest, +) +from ray.core.generated.runtime_env_agent_pb2_grpc import RuntimeEnvServiceStub +from ray.dashboard.datacenter import DataSource +from 
ray.dashboard.modules.job.common import JobInfoStorageClient +from ray.dashboard.modules.job.pydantic_models import JobDetails, JobType +from ray.dashboard.modules.job.utils import get_driver_jobs +from ray.dashboard.utils import Dict as Dictionary +from ray.util.state.common import ( + RAY_MAX_LIMIT_FROM_DATA_SOURCE, + PredicateType, + SupportedFilterType, +) +from ray.util.state.exception import DataSourceUnavailable + +logger = logging.getLogger(__name__) + +_STATE_MANAGER_GRPC_OPTIONS = [ + *ray_constants.GLOBAL_GRPC_OPTIONS, + ("grpc.max_send_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE), + ("grpc.max_receive_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE), +] + + +def handle_grpc_network_errors(func): + """Decorator to add a network handling logic. + + It is a helper method for `StateDataSourceClient`. + The method can only be used for async methods. + """ + assert inspect.iscoroutinefunction(func) + + @wraps(func) + async def api_with_network_error_handler(*args, **kwargs): + """Apply the network error handling logic to each APIs, + such as retry or exception policies. + + Returns: + If RPC succeeds, it returns what the original function returns. + If RPC fails, it raises exceptions. + Exceptions: + DataSourceUnavailable: if the source is unavailable because it is down + or there's a slow network issue causing timeout. + Otherwise, the raw network exceptions (e.g., gRPC) will be raised. + """ + try: + return await func(*args, **kwargs) + except grpc.aio.AioRpcError as e: + if ( + e.code() == grpc.StatusCode.DEADLINE_EXCEEDED + or e.code() == grpc.StatusCode.UNAVAILABLE + ): + raise DataSourceUnavailable( + "Failed to query the data source. " + "It is either there's a network issue, or the source is down." + ) + else: + logger.exception(e) + raise e + + return api_with_network_error_handler + + +class IdToIpMap: + def __init__(self): + # Node IP to node ID mapping. + self._ip_to_node_id = defaultdict(str) + # Node ID to node IP mapping. 
+ self._node_id_to_ip = defaultdict(str) + + def put(self, node_id: str, address: str): + self._ip_to_node_id[address] = node_id + self._node_id_to_ip[node_id] = address + + def get_ip(self, node_id: str): + return self._node_id_to_ip.get(node_id) + + def get_node_id(self, address: str): + return self._ip_to_node_id.get(address) + + def pop(self, node_id: str): + """Pop the given node id. + + Returns: + False if the corresponding node id doesn't exist. + True if it pops correctly. + """ + ip = self._node_id_to_ip.get(node_id) + if not ip: + return None + assert ip in self._ip_to_node_id + self._node_id_to_ip.pop(node_id) + self._ip_to_node_id.pop(ip) + return True + + +class StateDataSourceClient: + """The client to query states from various data sources such as Raylet, GCS, Agents. + + Note that it doesn't directly query core workers. They are proxied through raylets. + + The module is not in charge of service discovery. The caller is responsible for + finding services and register stubs through `register*` APIs. + + Non `register*` APIs + - Return the protobuf directly if it succeeds to query the source. + - Raises an exception if there's any network issue. + - throw a ValueError if it cannot find the source. 
+ """ + + def __init__(self, gcs_channel: grpc.aio.Channel, gcs_aio_client: GcsAioClient): + self.register_gcs_client(gcs_channel) + self._raylet_stubs = {} + self._runtime_env_agent_stub = {} + self._log_agent_stub = {} + self._job_client = JobInfoStorageClient(gcs_aio_client) + self._id_id_map = IdToIpMap() + self._gcs_aio_client = gcs_aio_client + + def register_gcs_client(self, gcs_channel: grpc.aio.Channel): + self._gcs_actor_info_stub = gcs_service_pb2_grpc.ActorInfoGcsServiceStub( + gcs_channel + ) + self._gcs_pg_info_stub = gcs_service_pb2_grpc.PlacementGroupInfoGcsServiceStub( + gcs_channel + ) + self._gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub( + gcs_channel + ) + self._gcs_worker_info_stub = gcs_service_pb2_grpc.WorkerInfoGcsServiceStub( + gcs_channel + ) + self._gcs_task_info_stub = gcs_service_pb2_grpc.TaskInfoGcsServiceStub( + gcs_channel + ) + + def register_raylet_client(self, node_id: str, address: str, port: int): + full_addr = f"{address}:{port}" + options = _STATE_MANAGER_GRPC_OPTIONS + channel = ray._private.utils.init_grpc_channel( + full_addr, options, asynchronous=True + ) + self._raylet_stubs[node_id] = NodeManagerServiceStub(channel) + self._id_id_map.put(node_id, address) + + def unregister_raylet_client(self, node_id: str): + self._raylet_stubs.pop(node_id) + self._id_id_map.pop(node_id) + + def register_agent_client(self, node_id, address: str, port: int): + options = _STATE_MANAGER_GRPC_OPTIONS + channel = ray._private.utils.init_grpc_channel( + f"{address}:{port}", options=options, asynchronous=True + ) + self._runtime_env_agent_stub[node_id] = RuntimeEnvServiceStub(channel) + self._log_agent_stub[node_id] = LogServiceStub(channel) + self._id_id_map.put(node_id, address) + + def unregister_agent_client(self, node_id: str): + self._runtime_env_agent_stub.pop(node_id) + self._log_agent_stub.pop(node_id) + self._id_id_map.pop(node_id) + + def get_all_registered_raylet_ids(self) -> List[str]: + return 
self._raylet_stubs.keys() + + def get_all_registered_agent_ids(self) -> List[str]: + assert len(self._log_agent_stub) == len(self._runtime_env_agent_stub) + return self._runtime_env_agent_stub.keys() + + def ip_to_node_id(self, ip: Optional[str]) -> Optional[str]: + """Return the node id that corresponds to the given ip. + + Args: + ip: The ip address. + + Returns: + None if the corresponding id doesn't exist. + Node id otherwise. If None node_ip is given, + it will also return None. + """ + if not ip: + return None + return self._id_id_map.get_node_id(ip) + + @handle_grpc_network_errors + async def get_all_actor_info( + self, + timeout: int = None, + limit: int = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + ) -> Optional[GetAllActorInfoReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + if filters is None: + filters = [] + + req_filters = GetAllActorInfoRequest.Filters() + for filter in filters: + key, predicate, value = filter + if predicate != "=": + # We only support EQUAL predicate for source side filtering. 
+ continue + if key == "actor_id": + req_filters.actor_id = ActorID(hex_to_binary(value)).binary() + elif key == "state": + if value not in ActorTableData.ActorState.keys(): + raise ValueError(f"Invalid actor state for filtering: {value}") + req_filters.state = ActorTableData.ActorState.Value(value) + elif key == "job_id": + req_filters.job_id = JobID(hex_to_binary(value)).binary() + + request = GetAllActorInfoRequest(limit=limit, filters=req_filters) + reply = await self._gcs_actor_info_stub.GetAllActorInfo( + request, timeout=timeout + ) + return reply + + @handle_grpc_network_errors + async def get_all_task_info( + self, + timeout: int = None, + limit: int = None, + filters: Optional[List[Tuple[str, PredicateType, SupportedFilterType]]] = None, + exclude_driver: bool = False, + ) -> Optional[GetTaskEventsReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + + if filters is None: + filters = [] + + req_filters = GetTaskEventsRequest.Filters() + for filter in filters: + key, predicate, value = filter + if predicate != "=": + # We only support EQUAL predicate for source side filtering. + continue + + if key == "actor_id": + req_filters.actor_id = ActorID(hex_to_binary(value)).binary() + elif key == "job_id": + req_filters.job_id = JobID(hex_to_binary(value)).binary() + elif key == "name": + req_filters.name = value + elif key == "task_id": + req_filters.task_ids.append(TaskID(hex_to_binary(value)).binary()) + else: + continue + + # Remove the filter from the list so that we don't have to + # filter it again later. 
+ filters.remove(filter) + + req_filters.exclude_driver = exclude_driver + + request = GetTaskEventsRequest(limit=limit, filters=req_filters) + reply = await self._gcs_task_info_stub.GetTaskEvents(request, timeout=timeout) + return reply + + @handle_grpc_network_errors + async def get_all_placement_group_info( + self, timeout: int = None, limit: int = None + ) -> Optional[GetAllPlacementGroupReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + + request = GetAllPlacementGroupRequest(limit=limit) + reply = await self._gcs_pg_info_stub.GetAllPlacementGroup( + request, timeout=timeout + ) + return reply + + @handle_grpc_network_errors + async def get_all_node_info( + self, timeout: int = None + ) -> Optional[GetAllNodeInfoReply]: + request = GetAllNodeInfoRequest() + reply = await self._gcs_node_info_stub.GetAllNodeInfo(request, timeout=timeout) + return reply + + @handle_grpc_network_errors + async def get_all_worker_info( + self, timeout: int = None, limit: int = None + ) -> Optional[GetAllWorkerInfoReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + + request = GetAllWorkerInfoRequest(limit=limit) + reply = await self._gcs_worker_info_stub.GetAllWorkerInfo( + request, timeout=timeout + ) + return reply + + # TODO(rickyx): + # This is currently mirroring dashboard/modules/job/job_head.py::list_jobs + # We should eventually unify the logic. + async def get_job_info(self, timeout: int = None) -> List[JobDetails]: + # Cannot use @handle_grpc_network_errors because async def is not supported yet. 
+ + driver_jobs, submission_job_drivers = await get_driver_jobs( + self._gcs_aio_client, timeout=timeout + ) + submission_jobs = await self._job_client.get_all_jobs(timeout=timeout) + submission_jobs = [ + JobDetails( + **dataclasses.asdict(job), + submission_id=submission_id, + job_id=submission_job_drivers.get(submission_id).id + if submission_id in submission_job_drivers + else None, + driver_info=submission_job_drivers.get(submission_id), + type=JobType.SUBMISSION, + ) + for submission_id, job in submission_jobs.items() + ] + + return list(driver_jobs.values()) + submission_jobs + + async def get_all_cluster_events(self) -> Dictionary: + return DataSource.events + + @handle_grpc_network_errors + async def get_task_info( + self, node_id: str, timeout: int = None, limit: int = None + ) -> Optional[GetTasksInfoReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + stub = self._raylet_stubs.get(node_id) + if not stub: + raise ValueError(f"Raylet for a node id, {node_id} doesn't exist.") + + reply = await stub.GetTasksInfo( + GetTasksInfoRequest(limit=limit), timeout=timeout + ) + return reply + + @handle_grpc_network_errors + async def get_object_info( + self, node_id: str, timeout: int = None, limit: int = None + ) -> Optional[GetObjectsInfoReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + + stub = self._raylet_stubs.get(node_id) + if not stub: + raise ValueError(f"Raylet for a node id, {node_id} doesn't exist.") + + reply = await stub.GetObjectsInfo( + GetObjectsInfoRequest(limit=limit), + timeout=timeout, + ) + return reply + + @handle_grpc_network_errors + async def get_runtime_envs_info( + self, node_id: str, timeout: int = None, limit: int = None + ) -> Optional[GetRuntimeEnvsInfoReply]: + if not limit: + limit = RAY_MAX_LIMIT_FROM_DATA_SOURCE + + stub = self._runtime_env_agent_stub.get(node_id) + if not stub: + raise ValueError(f"Agent for a node id, {node_id} doesn't exist.") + + reply = await stub.GetRuntimeEnvsInfo( + 
GetRuntimeEnvsInfoRequest(limit=limit), + timeout=timeout, + ) + return reply + + @handle_grpc_network_errors + async def list_logs( + self, node_id: str, glob_filter: str, timeout: int = None + ) -> ListLogsReply: + stub = self._log_agent_stub.get(node_id) + if not stub: + raise ValueError(f"Agent for node id: {node_id} doesn't exist.") + return await stub.ListLogs( + ListLogsRequest(glob_filter=glob_filter), timeout=timeout + ) + + @handle_grpc_network_errors + async def stream_log( + self, + node_id: str, + log_file_name: str, + keep_alive: bool, + lines: int, + interval: Optional[float], + timeout: int, + task_id: Optional[str] = None, + attempt_number: Optional[int] = None, + ) -> UnaryStreamCall: + stub = self._log_agent_stub.get(node_id) + if not stub: + raise ValueError(f"Agent for node id: {node_id} doesn't exist.") + stream = stub.StreamLog( + StreamLogRequest( + keep_alive=keep_alive, + log_file_name=log_file_name, + lines=lines, + interval=interval, + task_id=task_id, + attempt_number=attempt_number, + ), + timeout=timeout, + ) + metadata = await stream.initial_metadata() + if metadata.get(log_consts.LOG_GRPC_ERROR) == log_consts.FILE_NOT_FOUND: + raise ValueError(f'File "{log_file_name}" not found on node {node_id}') + return stream diff --git a/python/ray/util/state/util.py b/python/ray/util/state/util.py new file mode 100644 index 000000000000..16a5221e458f --- /dev/null +++ b/python/ray/util/state/util.py @@ -0,0 +1,61 @@ +from typing import Optional, Union + + +def convert_string_to_type( + val: Optional[Union[str, int, float, bool]], convert_type: Union[int, float, bool] +) -> Union[int, float, bool]: + """Convert the given value to a convert type. + + If the given val is None, it will just return None without the conversion. 
+ + It supports, + str -> int/float/bool + int -> int + bool -> bool + float -> float + """ + if val is None: + return None + elif type(val) is convert_type: + return val + elif convert_type is int: + try: + val = int(val) + except ValueError: + raise ValueError( + f"Failed to convert a value {val} of type {type(val)} to {convert_type}" + ) + elif convert_type is float: + try: + val = float(val) + except ValueError: + raise ValueError( + f"Failed to convert a value {val} of type {type(val)} to {convert_type}" + ) + elif convert_type is bool: + # Without this, "False" will become True. + if val == "False" or val == "false" or val == "0": + val = False + elif val == "True" or val == "true" or val == "1": + val = True + else: + raise ValueError( + f"Failed to convert a value {val} of type {type(val)} to {convert_type}" + ) + else: + assert False, f"Unsupported convert type {convert_type}" + return val + + +def record_deprecated_state_api_import(): + import warnings + from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag + + warnings.warn( + "Ray state API is no longer experimental. Please import from `ray.util.state`. " + "instead. Importing from `ray.experimental` will be deprecated in " + "future releases. 
", + DeprecationWarning, + ) + + record_extra_usage_tag(TagKey.EXPERIMENTAL_STATE_API_IMPORT, "1") diff --git a/python/ray/widgets/util.py b/python/ray/widgets/util.py index 056d5ca0d385..6991384779f2 100644 --- a/python/ray/widgets/util.py +++ b/python/ray/widgets/util.py @@ -5,15 +5,12 @@ from functools import wraps from typing import Any, Callable, Iterable, Optional, TypeVar, Union +from packaging.version import Version + +from ray._private.thirdparty.tabulate.tabulate import tabulate from ray.util.annotations import DeveloperAPI from ray.widgets import Template -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version - - logger = logging.getLogger(__name__) F = TypeVar("F", bound=Callable[..., Any]) @@ -34,14 +31,6 @@ def make_table_html_repr( Returns: HTML representation of the object """ - try: - from tabulate import tabulate - except ImportError: - return ( - "Tabulate isn't installed. Run " - "`pip install tabulate` for rich notebook output." - ) - data = {} for k, v in vars(obj).items(): if isinstance(v, (str, bool, int, float)): @@ -84,10 +73,10 @@ def ensure_notebook_deps( ) -> Callable[[F], F]: """Generate a decorator which checks for soft dependencies. - This decorator is meant to wrap _ipython_display_. If the dependency is not found, + This decorator is meant to wrap repr methods. If the dependency is not found, or a version is specified here and the version of the package is older than the - specified version, the wrapped function is not executed and None is returned. If - the dependency is missing or the version is old, a log message is displayed. + specified version, the original repr is used. + If the dependency is missing or the version is old, a log message is displayed. 
Args: *deps: Iterable of (dependency name, min version (optional)) @@ -101,18 +90,56 @@ def ensure_notebook_deps( def wrapper(func: F) -> F: @wraps(func) - def wrapped(*args, **kwargs): + def wrapped(self, *args, **kwargs): if _has_missing(*deps, message=missing_message) or _has_outdated( *deps, message=outdated_message ): - return None - return func(*args, **kwargs) + # Fallback to plaintext repr if dependencies are missing. + return {"text/plain": repr(self)} + return func(self, *args, **kwargs) return wrapped return wrapper +@DeveloperAPI +def ensure_ipywidgets_dep(version: str) -> Callable[[F], F]: + """Generate a decorator which checks for a soft ipywidgets dependency. + + This is a convencience function separate from `ensure_notebook_deps` because + of its custom missing and outdated messages, which suggest the user restart the + notebook server after installation/upgrade. + + Args: + version: Version of ipywidgets required. + + Returns: + Wrapped function. Guaranteed to be safe against the specified ipywidgets + version. + """ + text = ( + "Run `pip install {}ipywidgets`, then restart " + "the notebook server for rich notebook output." + ) + + if in_notebook(): + return ensure_notebook_deps( + ["ipywidgets", version], + missing_message=text.format(""), + outdated_message=text.format("-U "), + ) + else: + # If not in a notebook, then immediately short-circuit. + # We do not log has_missing or has_outdated messages if not in a notebook + # setting. + def dummy_decorator(func): + # Return the original function without any changes. + return func + + return dummy_decorator + + def _has_missing( *deps: Iterable[Union[str, Optional[str]]], message: Optional[str] = None ): @@ -162,28 +189,48 @@ def _has_outdated( if not message: message = f"Run `pip install -U {install_str}` for rich notebook output." - # stacklevel=3: First level is this function, then ensure_notebook_deps, then - # the actual function affected. 
- logger.warning(f"Outdated packages:\n{outdated_str}\n{message}", stacklevel=3) + if sys.version_info < (3, 8): + logger.warning(f"Outdated packages:\n{outdated_str}\n{message}") + else: + # stacklevel=3: First level is this function, then ensure_notebook_deps, + # then the actual function affected. + logger.warning( + f"Outdated packages:\n{outdated_str}\n{message}", stacklevel=3 + ) return outdated @DeveloperAPI -def fallback_if_colab(func: F) -> Callable[[F], F]: +def repr_fallback_if_colab(func: F) -> Callable[[F], F]: + """Decorator which strips rich notebook output from mimebundles if run in colab. + + See https://github.com/googlecolab/colabtools/issues/60 for more information about + the status of this issue. + + Args: + func: Function to wrap; must be a _repr_mimebundle_ method. + + Returns: + A function that returns the usual _repr_mimebundle_ unless it is run in + google colab, in which case it returns a mimebundle that only contains a + single text/plain mimetype, preventing rich notebook integration in colab. 
+ """ try: - ipython = get_ipython() - except NameError: + import IPython + + ipython = IPython.get_ipython() + except (ModuleNotFoundError, ValueError): ipython = None @wraps(func) - def wrapped(self, *args, **kwargs): - if ipython and "google.colab" not in str(ipython): - return func(self, *args, **kwargs) - elif hasattr(self, "__repr__"): - return print(self.__repr__(*args, **kwargs)) - else: - return None + def wrapped(*args, **kwargs): + result = func(*args, **kwargs) + if ipython and "google.colab" in str(ipython): + if isinstance(result, dict) and "text/plain" in result: + return {"text/plain": result["text/plain"]} + + return result return wrapped @@ -192,8 +239,14 @@ def wrapped(self, *args, **kwargs): def in_notebook() -> bool: """Return whether we are in a Jupyter notebook.""" try: - class_name = get_ipython().__class__.__name__ - is_notebook = True if "Terminal" not in class_name else False - except NameError: - is_notebook = False - return is_notebook + import IPython + + shell = IPython.get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type + except (ModuleNotFoundError, NameError, ValueError): + return False diff --git a/python/ray/workflow/api.py b/python/ray/workflow/api.py index 2e3e1d20cdee..6a651dbb0fd2 100644 --- a/python/ray/workflow/api.py +++ b/python/ray/workflow/api.py @@ -107,7 +107,7 @@ def wrapper(*args, **kwargs): # `is_client_mode_enabled_by_default` is used for testing with # `RAY_CLIENT_MODE=1`. This flag means all tests run with client mode. 
- if client_mode_should_convert(auto_init=False): + if client_mode_should_convert(): f = ray.remote(num_cpus=0)(func) ref = f.remote(*args, **kwargs) return ray.get(ref) diff --git a/python/ray/workflow/tests/test_dataset.py b/python/ray/workflow/tests/test_dataset.py index 862371993ba7..acaeaf20db6d 100644 --- a/python/ray/workflow/tests/test_dataset.py +++ b/python/ray/workflow/tests/test_dataset.py @@ -18,17 +18,17 @@ def gen_dataset_1(): @ray.remote def gen_dataset_2(): - return ray.data.range_table(1000) + return ray.data.range(1000) @ray.remote def transform_dataset(in_data): - return in_data.map(lambda x: x * 2) + return in_data.map(lambda x: {"id": x["id"] * 2}) @ray.remote def transform_dataset_1(in_data): - return in_data.map(lambda r: {"v2": r["value"] * 2}) + return in_data.map(lambda r: {"v2": r["id"] * 2}) @ray.remote diff --git a/python/ray/workflow/tests/test_error_handling.py b/python/ray/workflow/tests/test_error_handling.py index 3c0a02f03106..3fbf9a8e2c46 100644 --- a/python/ray/workflow/tests/test_error_handling.py +++ b/python/ray/workflow/tests/test_error_handling.py @@ -165,7 +165,7 @@ def test_disable_auto_lineage_reconstruction(ray_start_cluster, tmp_path): @ray.remote def get_node_id(): - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() lock_path = str(tmp_path / "lock") diff --git a/python/ray/workflow/tests/utils.py b/python/ray/workflow/tests/utils.py index c558d5f4ae18..b96bc7a68c39 100644 --- a/python/ray/workflow/tests/utils.py +++ b/python/ray/workflow/tests/utils.py @@ -55,5 +55,5 @@ def skip_client_mode_test(): import pytest from ray._private.client_mode_hook import client_mode_should_convert - if client_mode_should_convert(auto_init=False): + if client_mode_should_convert(): pytest.skip("Not for Ray client test") diff --git a/python/ray/workflow/workflow_state_from_dag.py b/python/ray/workflow/workflow_state_from_dag.py index 9fd44a9448e3..b7f39ad6dc9d 100644 --- 
a/python/ray/workflow/workflow_state_from_dag.py +++ b/python/ray/workflow/workflow_state_from_dag.py @@ -156,7 +156,7 @@ def _node_visitor(node: Any) -> Any: # so it won't be mutated later. This guarantees correct # semantics. See "tests/test_variable_mutable.py" as # an example. - if client_mode_should_convert(auto_init=False): + if client_mode_should_convert(): # Handle client mode. The Ray client would serialize and # then deserialize objects in the Ray client server. When # the object is being deserialized, the serialization context diff --git a/python/requirements.txt b/python/requirements.txt index fb0ea7d1698d..e6bfd5a7e466 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -18,7 +18,7 @@ pyyaml aiosignal frozenlist requests -virtualenv>=20.0.24 +packaging # Python version-specific requirements dataclasses; python_version < '3.7' @@ -30,7 +30,6 @@ grpcio >= 1.32.0, <= 1.51.3; python_version < '3.10' and sys_platform != 'darwin grpcio >= 1.42.0, <= 1.51.3; python_version >= '3.10' and sys_platform != 'darwin' numpy>=1.16; python_version < '3.9' numpy>=1.19.3; python_version >= '3.9' -packaging; python_version >= '3.10' typing_extensions; python_version < '3.8' # ray[all] @@ -38,7 +37,6 @@ uvicorn py-spy>=0.2.0 rich urllib3 -tabulate scikit-image scipy aiohttp>=3.7 @@ -54,6 +52,7 @@ requests kubernetes colorful lz4 +virtualenv>=20.0.24, < 20.21.1 # Manually parse pandas requirement pandas>=1.0.5; python_version < '3.7' pandas>=1.3.0; python_version >= '3.7' diff --git a/python/requirements/data_processing/requirements.txt b/python/requirements/data_processing/requirements.txt index e33537d03db4..325a383c2cc1 100644 --- a/python/requirements/data_processing/requirements.txt +++ b/python/requirements/data_processing/requirements.txt @@ -3,12 +3,12 @@ dask[complete]==2022.2.0; python_version < '3.8' dask[complete]==2022.10.1; python_version >= '3.8' -aioboto3==8.3.0 +aioboto3==11.2.0 crc32c==2.3 flask_cors -s3fs -modin>=0.8.3; python_version < 
'3.7' -modin>=0.11.0; python_version >= '3.7' +s3fs==2023.1.0 +modin==0.12.1; python_version < '3.8' +modin==0.18.1; python_version >= '3.8' pytest-repeat raydp>=0.0.dev0 responses==0.13.4 diff --git a/python/requirements/ml/requirements_dl.txt b/python/requirements/ml/requirements_dl.txt index 5549414a9f69..baab31deaaf9 100644 --- a/python/requirements/ml/requirements_dl.txt +++ b/python/requirements/ml/requirements_dl.txt @@ -5,7 +5,8 @@ tensorflow==2.11.0; sys_platform != 'darwin' or platform_machine != 'arm64' tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64' tensorflow-probability==0.19.0 -# If you make changes below this line, please also make the corresponding changes to `requirements_ml_docker.txt`! +# If you make changes below this line, please also make the corresponding changes to `requirements_ml_docker.txt` +# and to `install-dependencies.sh`! --extra-index-url https://download.pytorch.org/whl/cpu # for CPU versions of torch, torchvision --find-links https://data.pyg.org/whl/torch-1.13.0+cpu.html # for CPU versions of torch-scatter, torch-sparse, torch-cluster, torch-spline-conv diff --git a/python/requirements/ml/requirements_ml_docker.txt b/python/requirements/ml/requirements_ml_docker.txt index 6dd8d1532ed1..3a027b9b4869 100644 --- a/python/requirements/ml/requirements_ml_docker.txt +++ b/python/requirements/ml/requirements_ml_docker.txt @@ -1,5 +1,8 @@ ipython +# Needed for rich visualization for Ray Train and Ray Data. +ipywidgets>=8 + # Needed for Ray Client error message serialization/deserialization. tblib diff --git a/python/requirements/ml/requirements_no_deps.txt b/python/requirements/ml/requirements_no_deps.txt new file mode 100644 index 000000000000..6d2754211cd2 --- /dev/null +++ b/python/requirements/ml/requirements_no_deps.txt @@ -0,0 +1,6 @@ +# These requirements have outdated or incompatible downstream dependencies. 
+# Thus we install them on a best effort basis before any other packages +# (without constraints), but their dependencies may be overwritten afterwards. + +# mosaicml requires importlib-metadata>5, but flake8 is not compatible with it +mosaicml==0.12.1 diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index c976b3f0889a..2d1b6da4b695 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -43,7 +43,6 @@ imageio-ffmpeg==0.4.5 onnx==1.12.0; sys_platform != 'darwin' or platform_machine != 'arm64' onnxruntime==1.14.1; sys_platform != 'darwin' or platform_machine != 'arm64' tf2onnx==1.13.0; sys_platform != 'darwin' or platform_machine != 'arm64' -typer==0.6.1 rich==12.0.1 # Msgpack checkpoint stuff. msgpack diff --git a/python/requirements/ml/requirements_train.txt b/python/requirements/ml/requirements_train.txt index cbec6de14581..5f999b3648f4 100644 --- a/python/requirements/ml/requirements_train.txt +++ b/python/requirements/ml/requirements_train.txt @@ -2,8 +2,8 @@ -r requirements_dl.txt -mosaicml==0.12.1 -mlflow==1.30.0 +mlflow==1.30.0; python_version <= '3.7' +mlflow==2.2.2; python_version > '3.7' tensorboardX==2.4.1 # Dependencies for Hugging Face examples & tests: diff --git a/python/requirements/ml/requirements_tune.txt b/python/requirements/ml/requirements_tune.txt index a121cb2b2d60..f85c4d9299f5 100644 --- a/python/requirements/ml/requirements_tune.txt +++ b/python/requirements/ml/requirements_tune.txt @@ -11,7 +11,7 @@ dragonfly-opt==0.1.6 flaml==1.1.1 freezegun==1.1.0 # Requires decord which is unavailable for arm64 -gluoncv==0.10.1.post0; platform_machine != "arm64" +gluoncv==0.10.5.post0; platform_machine != "arm64" gpy==1.10.0 # Requires libtorrent which is unavailable for arm64 autorom[accept-rom-license]; platform_machine != "arm64" @@ -22,7 +22,8 @@ hyperopt==0.2.5 jupyterlab==3.6.1 lightgbm==3.3.5 matplotlib!=3.4.3 -mlflow==1.30.0 
+mlflow==1.30.0; python_version <= '3.7' +mlflow==2.2.2; python_version > '3.7' # Unavailable for arm64 in more recent versions mxnet==1.8.0.post0; platform_machine != "arm64" nevergrad==0.4.3.post7 @@ -31,9 +32,9 @@ optuna==2.10.0 pymoo==0.5.0 pytest-remotedata==0.3.2 lightning-bolts==0.4.0 -protobuf==3.19.6 pytorch-lightning==1.6.5 fairscale==0.4.6 +s3fs==2023.1.0 shortuuid==1.0.1 scikit-optimize==0.9.0 sigopt==7.5.0 diff --git a/python/requirements/ml/requirements_upstream.txt b/python/requirements/ml/requirements_upstream.txt index a5da3cce16ed..7c0b3abbeaee 100644 --- a/python/requirements/ml/requirements_upstream.txt +++ b/python/requirements/ml/requirements_upstream.txt @@ -6,5 +6,5 @@ ray_lightning==0.3.0 tune-sklearn==0.4.4 xgboost_ray==0.1.15 lightgbm_ray==0.1.8 -modin==0.18.1; python_version >= '3.8' modin==0.12.1; python_version < '3.8' +modin==0.18.1; python_version >= '3.8' diff --git a/python/requirements_test.txt b/python/requirements_test.txt index be87fb41babb..96b6fc6e98c7 100644 --- a/python/requirements_test.txt +++ b/python/requirements_test.txt @@ -12,7 +12,7 @@ azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 beautifulsoup4==4.11.1 -boto3==1.23.10 +boto3==1.24.59 # Todo: investigate if we can get rid of this and exchange for ray.cloudpickle cloudpickle==2.2.0 # Keep in sync with `ci/build/upload_build_info.sh` @@ -41,7 +41,7 @@ openpyxl==3.0.10 opentelemetry-api==1.1.0 opentelemetry-sdk==1.1.0 opentelemetry-exporter-otlp==1.1.0 -opentelemetry-exporter-opencensus +opentelemetry-exporter-opencensus==0.20b0 pexpect==4.8.0 Pillow==9.2.0; platform_system != "Windows" proxy.py==2.4.3 @@ -54,10 +54,12 @@ PyOpenSSL==22.1.0 pygame==2.1.2; python_version < '3.11' Pygments==2.13.0 pymongo==4.3.2 -# TODO: Replace this with pyspark==3.4 once it is released. 
-https://ml-team-public-read.s3.us-west-2.amazonaws.com/spark-pkgs/pyspark-3.4.0.dev0-0cb0fa313979e1b82ddd711a05d8c4e78cf6c9f5.tar.gz +# TODO: Upgrade to pyspark 3.4.0 once raydp supports it +# https://ml-team-public-read.s3.us-west-2.amazonaws.com/spark-pkgs/pyspark-3.4.0.dev0-0cb0fa313979e1b82ddd711a05d8c4e78cf6c9f5.tar.gz +pyspark==3.3.1 pytest==7.0.1 pytest-asyncio==0.16.0 +pytest-httpserver==1.0.6 pytest-rerunfailures==10.2 pytest-sugar==0.9.5 pytest-lazy-fixture==0.6.3 @@ -67,6 +69,7 @@ redis==4.4.2 scikit-learn==1.0.2; python_version < '3.11' smart_open[s3]==6.2.0 tqdm==4.64.1 +trustme==0.9.0 testfixtures==7.0.0 werkzeug==2.1.2 xlrd==2.0.1 @@ -75,13 +78,20 @@ memray; platform_system != "Windows" and sys_platform != "darwin" # For doc tests myst-parser==0.15.2 myst-nb==0.13.1 +sphinx==4.3.2 jupytext==1.13.6 +jinja2==3.0.3 pytest-docker-tools==3.1.3 pytest-forked==1.4.0 # For dataset tests polars==0.14.21 +# ale-py requires 4.10.0. It's also compatible with flake8. +# It's not compatible with mosaicml though. Try to remove once +# we are at Python 3.8 +importlib-metadata==4.10.0 + # Some packages have downstream dependencies that we have to specify here to resolve conflicts. # Feel free to add (or remove!) packages here liberally. tensorboardX==2.4.1 @@ -89,7 +99,16 @@ starlette==0.17.1 h11==0.12.0 markdown-it-py==1.1.0 attrs==21.4.0 -importlib-metadata==4.13.0 +pytz==2022.7.1 +# Compatibility with spacy 3.5 (model en_core_web_sm) +typing-extensions==4.5.0 +networkx==2.6.3; python_version <= '3.7' +# Aim requires segment-analytics-python, which requires backoff~=2.10, +# which conflicts with the opentelemetry-api 1.1.0. 
+segment-analytics-python==2.2.0 +httpcore==0.15.0 +backoff==1.10 +sympy==1.10.1; python_version <= '3.7' # For test_basic.py::test_omp_threads_set threadpoolctl==3.1.0 diff --git a/python/setup.py b/python/setup.py index 6cb8a41be1d5..11b725ed85a6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -252,9 +252,10 @@ def get_packages(self): "pydantic", "prometheus_client >= 0.7.1", "smart_open", + "virtualenv >=20.0.24, < 20.21.1", # For pip runtime env. ], "serve": ["uvicorn", "requests", "starlette", "fastapi", "aiorwlock"], - "tune": ["pandas", "tabulate", "tensorboardX>=1.9", "requests", pyarrow_dep], + "tune": ["pandas", "tensorboardX>=1.9", "requests", pyarrow_dep], "k8s": ["kubernetes", "urllib3"], "observability": [ "opentelemetry-api", @@ -321,7 +322,7 @@ def get_packages(self): "msgpack >= 1.0.0, < 2.0.0", "numpy >= 1.16; python_version < '3.9'", "numpy >= 1.19.3; python_version >= '3.9'", - "packaging; python_version >= '3.10'", + "packaging", "protobuf >= 3.15.3, != 3.19.5", "pyyaml", "aiosignal", @@ -330,7 +331,6 @@ def get_packages(self): # Light weight requirement, can be replaced with "typing" once # we deprecate Python 3.7 (this will take a while). "typing_extensions; python_version < '3.8'", - "virtualenv>=20.0.24", # For pip runtime env. 
] diff --git a/release/BUILD b/release/BUILD index f246db06bcdf..9caa3ae5879c 100644 --- a/release/BUILD +++ b/release/BUILD @@ -1,4 +1,17 @@ -load("@rules_python//python:defs.bzl", "py_test") +load("@rules_python//python:defs.bzl", "py_library", "py_runtime", "py_runtime_pair", "py_test") +load("@rules_python//python:pip.bzl", "compile_pip_requirements") +load("@py_deps_buildkite//:requirements.bzl", bk_require = "requirement") +load("@python3_9//:defs.bzl", bk_python = "interpreter") + +compile_pip_requirements( + name = "requirements_buildkite", + requirements_in = "requirements_buildkite.in", + requirements_txt = "requirements_buildkite.txt", + tags = [ + "team:ci", + ], + visibility = ["//visibility:private"], +) test_srcs = glob(["**/*.py"]) @@ -186,16 +199,15 @@ py_test( ], ) - #### # AIR smoke tests #### - py_test( name = "air_benchmark_xgboost_smoke_test", size = "small", srcs = test_srcs, + args = ["--smoke-test"], main = "air_tests/air_benchmarks/workloads/xgboost_benchmark.py", tags = [ "exclusive", @@ -205,13 +217,16 @@ py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["--smoke-test"] ) py_test( name = "air_benchmark_data_smoke_test", size = "small", srcs = test_srcs, + args = [ + "--dataset-size-gb=1", + "--num-workers=1", + ], main = "air_tests/air_benchmarks/workloads/data_benchmark.py", tags = [ "exclusive", @@ -221,29 +236,35 @@ py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["--dataset-size-gb=1", "--num-workers=1"] ) py_test( - name = "air_benchmark_gpu_batch_prediction_smoke_test", - size = "small", - srcs = test_srcs, - main = "air_tests/air_benchmarks/workloads/gpu_batch_prediction.py", - tags = [ - "exclusive", - "team:ml", - ], - deps = [ - "//:ray_lib", - "//python/ray/air:ml_lib", - ], - args = ["--data-size-gb=1", "--smoke-test"] + name = "air_benchmark_gpu_batch_prediction_smoke_test", + size = "small", + srcs = test_srcs, + args = [ + "--data-size-gb=1", + "--smoke-test", + ], + main = 
"air_tests/air_benchmarks/workloads/gpu_batch_prediction.py", + tags = [ + "exclusive", + "team:ml", + ], + deps = [ + "//:ray_lib", + "//python/ray/air:ml_lib", + ], ) py_test( name = "air_benchmark_pytorch_training_e2e_smoke_test", size = "small", srcs = test_srcs, + args = [ + "--data-size-gb=1", + "--smoke-test", + ], main = "air_tests/air_benchmarks/workloads/pytorch_training_e2e.py", tags = [ "exclusive", @@ -253,13 +274,22 @@ py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["--data-size-gb=1", "--smoke-test"] ) py_test( name = "air_benchmark_tensorflow_smoke_test", size = "large", srcs = test_srcs, + args = [ + "run", + "--num-runs=1", + "--num-epochs=1", + "--num-workers=1", + "--cpus-per-worker=1", + "--batch-size=1", + "--smoke-test", + "--local", + ], main = "air_tests/air_benchmarks/workloads/tensorflow_benchmark.py", tags = [ "exclusive", @@ -269,13 +299,22 @@ py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["run", "--num-runs=1", "--num-epochs=1", "--num-workers=1", "--cpus-per-worker=1", "--batch-size=1", "--smoke-test", "--local"] ) py_test( name = "air_benchmark_torch_smoke_test", size = "large", srcs = test_srcs, + args = [ + "run", + "--num-runs=1", + "--num-epochs=1", + "--num-workers=1", + "--cpus-per-worker=1", + "--batch-size=1", + "--smoke-test", + "--local", + ], main = "air_tests/air_benchmarks/workloads/torch_benchmark.py", tags = [ "exclusive", @@ -285,7 +324,6 @@ py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["run", "--num-runs=1", "--num-epochs=1", "--num-workers=1", "--cpus-per-worker=1", "--batch-size=1", "--smoke-test", "--local"] ) py_test( @@ -294,6 +332,12 @@ py_test( # (runtime is shorter when air_benchmark_torch_smoke_test is executed first) size = "medium", srcs = test_srcs, + args = [ + "--num-runs=1", + "--num-trials=1", + "--num-workers=1", + "--smoke-test", + ], main = "air_tests/air_benchmarks/workloads/tune_torch_benchmark.py", tags = [ "exclusive", @@ -303,80 +347,305 @@ 
py_test( "//:ray_lib", "//python/ray/air:ml_lib", ], - args = ["--num-runs=1", "--num-trials=1", "--num-workers=1", "--smoke-test"] ) - #### # RELEASE TEST INFRA unit tests #### +py_runtime( + name = "python3_runtime", + interpreter = bk_python, + python_version = "PY3", + visibility = ["//visibility:private"], +) + +py_runtime_pair( + name = "python_runtime_pair", + py2_runtime = None, + py3_runtime = ":python3_runtime", + visibility = ["//visibility:private"], +) + +constraint_setting(name = "hermetic") + +constraint_value( + name = "hermetic_python", + constraint_setting = ":hermetic", + visibility = ["//visibility:private"], +) + +toolchain( + name = "python_toolchain", + exec_compatible_with = [":hermetic_python"], + toolchain = ":python_runtime_pair", + toolchain_type = "@bazel_tools//tools/python:toolchain_type", +) + +platform( + name = "hermetic_python_platform", + constraint_values = [":hermetic_python"], + parents = ["@local_config_platform//:host"], + visibility = ["//visibility:private"], +) + +py_library( + name = "ray_release", + srcs = glob( + ["ray_release/**/*.py"], + exclude = ["ray_release/tests/*.py"], + ), + data = glob(["ray_release/environments/*.env"]) + [ + "ray_release/buildkite/aws_instance_types.csv", + "ray_release/schema.json", + ], + imports = ["."], + visibility = ["//visibility:public"], + deps = [ + bk_require("anyscale"), + bk_require("bazel-runfiles"), + bk_require("boto3"), + bk_require("botocore"), + bk_require("click"), + bk_require("google-cloud-storage"), + bk_require("jinja2"), + bk_require("retry"), + ], +) + +py_library( + name = "test_utils", + srcs = ["ray_release/tests/utils.py"], + imports = ["."], + visibility = ["//visibility:private"], +) + py_test( name = "test_alerts", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_alerts.py"] + srcs = ["ray_release/tests/test_alerts.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps 
= [ + ":ray_release", + bk_require("pytest"), + ], +) + +py_test( + name = "test_anyscale_job_manager", + size = "small", + srcs = ["ray_release/tests/test_anyscale_job_manager.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], ) py_test( name = "test_anyscale_job_wrapper", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_anyscale_job_wrapper.py"] + srcs = ["ray_release/tests/test_anyscale_job_wrapper.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + +py_test( + name = "test_bisect", + size = "small", + srcs = ["ray_release/tests/test_bisect.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], ) py_test( name = "test_buildkite", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_buildkite.py"] + srcs = ["ray_release/tests/test_buildkite.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pyyaml"), + bk_require("pytest"), + ], ) py_test( name = "test_cluster_manager", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_cluster_manager.py"] + srcs = ["ray_release/tests/test_cluster_manager.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + ":test_utils", + bk_require("freezegun"), + bk_require("pytest"), + ], ) py_test( name = "test_config", - tags = ["team:ci", "release_unit"], size = "small", srcs = ["ray_release/tests/test_config.py"], - data = ["release_tests.yaml"], + data = glob( + ["**/*.yaml"], + exclude = ["ray_release/**/*.yaml"], + ) + [ + 
"//python/ray/autoscaler/aws:test_configs", + "//python/ray/autoscaler/gcp:test_configs", + ], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], ) py_test( name = "test_env", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_env.py"] + srcs = ["ray_release/tests/test_env.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], ) py_test( name = "test_glue", - tags = ["team:ci", "release_unit"], size = "small", - srcs = ["ray_release/tests/test_glue.py"] + srcs = ["ray_release/tests/test_glue.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + ":test_utils", + bk_require("pytest"), + ], +) + +py_test( + name = "test_log_aggregator", + size = "small", + srcs = ["ray_release/tests/test_log_aggregator.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + +py_test( + name = "test_result", + size = "small", + srcs = ["ray_release/tests/test_result.py"], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], ) py_test( name = "test_run_script", - tags = ["team:ci", "release_unit"], size = "small", srcs = ["ray_release/tests/test_run_script.py"], data = [ - "run_release_test.sh", "ray_release/tests/_test_catch_args.py", "ray_release/tests/_test_run_release_test_sh.py", + "run_release_test.sh", + ], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) + +py_test( + name = "test_template", + size = "small", + srcs = ["ray_release/tests/test_template.py"], + 
exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), ], ) py_test( name = "test_wheels", - tags = ["team:ci", "release_unit"], size = "small", srcs = ["ray_release/tests/test_wheels.py"], - deps = ["//:ray_lib"], + data = [ + "//:python_sources", + ], + exec_compatible_with = [":hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("freezegun"), + bk_require("pytest"), + ], ) diff --git a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml new file mode 100644 index 000000000000..7966578a31b1 --- /dev/null +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml @@ -0,0 +1,20 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: g4dn.8xlarge + +worker_node_types: + - name: worker_node + instance_type: g4dn.4xlarge + min_workers: 15 + max_workers: 15 + use_spot: false + +aws: + TagSpecifications: + - ResourceType: "instance" + Tags: + - Key: ttl-hours + Value: '24' diff --git a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_env.yaml b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_env.yaml new file mode 100644 index 000000000000..26017cde6ae0 --- /dev/null +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_env.yaml @@ -0,0 +1,21 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray:nightly-py38-cu118") }} +env_vars: {} +debian_packages: + - curl + +python: + pip_packages: + - "datasets" + - "evaluate" + - "scikit-learn" + - "boto3" + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 + conda_packages: [] + +post_build_cmds: + - pip uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | 
default("ray") }} + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} + - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + - pip3 install "pytorch_lightning>=2.0.0" "transformers>=4.28.0" "accelerate>=0.18.0" diff --git a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/lightning-llm-finetuning-7b.ipynb b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/lightning-llm-finetuning-7b.ipynb new file mode 120000 index 000000000000..5f90fb2ae158 --- /dev/null +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/lightning-llm-finetuning-7b.ipynb @@ -0,0 +1 @@ +../../../doc/source/ray-air/examples/dolly_lightning_fsdp_finetuning.ipynb \ No newline at end of file diff --git a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/test_myst_doc.py b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/test_myst_doc.py new file mode 120000 index 000000000000..c265ccc7b062 --- /dev/null +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/test_myst_doc.py @@ -0,0 +1 @@ +../../../doc/test_myst_doc.py \ No newline at end of file diff --git a/release/air_examples/dreambooth/dreambooth_compute.yaml b/release/air_examples/dreambooth/dreambooth_compute_aws.yaml similarity index 100% rename from release/air_examples/dreambooth/dreambooth_compute.yaml rename to release/air_examples/dreambooth/dreambooth_compute_aws.yaml diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml similarity index 100% rename from release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute.yaml rename to release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml new file mode 100644 
index 000000000000..036b337e92e8 --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml @@ -0,0 +1,22 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +head_node_type: + name: head_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 + min_workers: 15 + max_workers: 15 + use_spot: false + +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: ttl-hours +# Value: '24' diff --git a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml new file mode 100644 index 000000000000..1ac93d59eb91 --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml @@ -0,0 +1,15 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: p3.16xlarge + +worker_node_types: [] + +aws: + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + DeleteOnTermination: true + VolumeSize: 500 diff --git a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_env.yaml b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_env.yaml new file mode 100644 index 000000000000..20afed88b6e1 --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_env.yaml @@ -0,0 +1,18 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} +env_vars: {} +debian_packages: + - curl + +python: + pip_packages: + # Install boto3 to workaround a temporary issue with python and urllib3. + # TODO(jungong) : remove this. 
+ - boto3 + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 + conda_packages: [] + +post_build_cmds: + - pip uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb new file mode 120000 index 000000000000..1c219bcfcb46 --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/opt_deepspeed_batch_inference.ipynb @@ -0,0 +1 @@ +../../../doc/source/ray-air/examples/opt_deepspeed_batch_inference.ipynb \ No newline at end of file diff --git a/release/air_examples/opt_deepspeed_batch_inference/test_myst_doc.py b/release/air_examples/opt_deepspeed_batch_inference/test_myst_doc.py new file mode 120000 index 000000000000..c265ccc7b062 --- /dev/null +++ b/release/air_examples/opt_deepspeed_batch_inference/test_myst_doc.py @@ -0,0 +1 @@ +../../../doc/test_myst_doc.py \ No newline at end of file diff --git a/release/air_tests/air_benchmarks/compute_cpu_1.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/compute_cpu_1.yaml rename to release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml b/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml new file mode 100644 index 000000000000..90de98eb18e6 --- /dev/null +++ b/release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml @@ -0,0 +1,12 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: n1-standard-8 + +worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_cpu_4.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml 
similarity index 100% rename from release/air_tests/air_benchmarks/compute_cpu_4.yaml rename to release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_gpu_4_g4_12xl.yaml b/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml similarity index 63% rename from release/air_tests/air_benchmarks/compute_gpu_4_g4_12xl.yaml rename to release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml index 0bb94dc6c3dc..03f5772f88be 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4_g4_12xl.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml @@ -1,15 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 +region: us-west1 +allowed_azs: + - us-west1-b max_workers: 3 head_node_type: name: head_node - instance_type: g4dn.12xlarge + instance_type: n1-standard-8 worker_node_types: - name: worker_node - instance_type: g4dn.12xlarge + instance_type: n1-standard-8 max_workers: 3 min_workers: 3 use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_cpu_8.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/compute_cpu_8.yaml rename to release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml b/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml similarity index 63% rename from release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml rename to release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml index 630fe5690d39..b15168fcb260 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_8_g4_12xl.yaml +++ b/release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml @@ -1,15 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 +region: us-west1 +allowed_azs: + - us-west1-b max_workers: 7 head_node_type: name: head_node - instance_type: g4dn.12xlarge + instance_type: n1-standard-8 worker_node_types: - name: worker_node - 
instance_type: g4dn.12xlarge + instance_type: n1-standard-8 max_workers: 7 min_workers: 7 use_spot: false diff --git a/release/air_tests/air_benchmarks/data_20_nodes.yaml b/release/air_tests/air_benchmarks/compute_data_20_nodes_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/data_20_nodes.yaml rename to release/air_tests/air_benchmarks/compute_data_20_nodes_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_data_20_nodes_gce.yaml b/release/air_tests/air_benchmarks/compute_data_20_nodes_gce.yaml new file mode 100644 index 000000000000..1248701435d0 --- /dev/null +++ b/release/air_tests/air_benchmarks/compute_data_20_nodes_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 19 + +head_node_type: + name: head_node + instance_type: n1-standard-16 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16 + max_workers: 19 + min_workers: 19 + use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_gpu_1.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/compute_gpu_1.yaml rename to release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_g4_8xl.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_aws.yaml similarity index 80% rename from release/air_tests/air_benchmarks/compute_gpu_1_g4_8xl.yaml rename to release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_aws.yaml index b4de6db623ed..e38dc1a84d88 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_g4_8xl.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_aws.yaml @@ -5,6 +5,6 @@ max_workers: 0 head_node_type: name: head_node - instance_type: g4dn.8xlarge + instance_type: g4dn.4xlarge worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_gce_gpu_1_g4_8xl.yaml 
b/release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_gce.yaml similarity index 100% rename from release/air_tests/air_benchmarks/compute_gce_gpu_1_g4_8xl.yaml rename to release/air_tests/air_benchmarks/compute_gpu_1_cpu_16_gce.yaml diff --git a/release/air_tests/air_benchmarks/compute_gce_gpu_1.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml similarity index 82% rename from release/air_tests/air_benchmarks/compute_gce_gpu_1.yaml rename to release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml index a4e3746e2cbf..4776275bbc19 100644 --- a/release/air_tests/air_benchmarks/compute_gce_gpu_1.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml @@ -7,7 +7,7 @@ max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-32-nvidia-tesla-t4-2 # aws g3.8xlarge + instance_type: n1-standard-32-nvidia-tesla-t4-2 worker_node_types: [] diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/compute_gpu_2x2.yaml rename to release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml b/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml new file mode 100644 index 000000000000..3bf0b4eca9d0 --- /dev/null +++ b/release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 1 + +head_node_type: + name: head_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + max_workers: 1 + min_workers: 1 + use_spot: false diff --git a/release/air_tests/air_benchmarks/compute_gpu_16.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml similarity index 77% rename from release/air_tests/air_benchmarks/compute_gpu_16.yaml 
rename to release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml index e19da50cc89d..ee7d1436e7cf 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_16.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml @@ -5,11 +5,11 @@ max_workers: 3 head_node_type: name: head_node - instance_type: g3.16xlarge + instance_type: g4dn.12xlarge worker_node_types: - name: worker_node - instance_type: g3.16xlarge + instance_type: g4dn.12xlarge max_workers: 3 min_workers: 3 use_spot: false @@ -22,5 +22,4 @@ aws: VolumeSize: 800 Iops: 5000 Throughput: 1000 - VolumeSize: 1000 - VolumeType: gp3 + VolumeType: gp3 \ No newline at end of file diff --git a/release/air_tests/air_benchmarks/compute_gce_gpu_16.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml similarity index 72% rename from release/air_tests/air_benchmarks/compute_gce_gpu_16.yaml rename to release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml index 8f94d0691021..5702b44d240e 100644 --- a/release/air_tests/air_benchmarks/compute_gce_gpu_16.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml @@ -1,17 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-b max_workers: 3 head_node_type: name: head_node - instance_type: n1-standard-64-nvidia-tesla-t4-4 # aws g3.16xlarge + instance_type: n1-standard-64-nvidia-tesla-t4-4 worker_node_types: - name: worker_node - instance_type: n1-standard-64-nvidia-tesla-t4-4 # aws g3.16xlarge + instance_type: n1-standard-64-nvidia-tesla-t4-4 max_workers: 3 min_workers: 3 use_spot: false diff --git a/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml b/release/air_tests/air_benchmarks/compute_xgboost_aws.yaml similarity index 100% rename from release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml rename to release/air_tests/air_benchmarks/compute_xgboost_aws.yaml diff --git a/release/air_tests/air_benchmarks/compute_xgboost_gce.yaml 
b/release/air_tests/air_benchmarks/compute_xgboost_gce.yaml new file mode 100644 index 000000000000..13159b6cc420 --- /dev/null +++ b/release/air_tests/air_benchmarks/compute_xgboost_gce.yaml @@ -0,0 +1,28 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 10 + +head_node_type: + name: head_node + instance_type: n1-standard-8 + resources: + cpu: 0 + + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16 + max_workers: 10 + min_workers: 10 + use_spot: false + +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 1000 diff --git a/release/air_tests/air_benchmarks/mlperf-train/resnet50_ray_air.py b/release/air_tests/air_benchmarks/mlperf-train/resnet50_ray_air.py index c04b207b425a..da2e0b319c56 100644 --- a/release/air_tests/air_benchmarks/mlperf-train/resnet50_ray_air.py +++ b/release/air_tests/air_benchmarks/mlperf-train/resnet50_ray_air.py @@ -107,7 +107,9 @@ def ray_dataset_to_tf_dataset( if online_processing: # Apply online preprocessing on the decoded images, cropping and # flipping. 
- dataset = dataset.map_batches(crop_and_flip_image_batch) + dataset = dataset.map_batches( + crop_and_flip_image_batch, batch_format="pandas" + ) def to_tensor_iterator(): num_steps = 0 diff --git a/release/air_tests/frequent_pausing/compute_config.yaml b/release/air_tests/frequent_pausing/compute_config_aws.yaml similarity index 100% rename from release/air_tests/frequent_pausing/compute_config.yaml rename to release/air_tests/frequent_pausing/compute_config_aws.yaml diff --git a/release/air_tests/frequent_pausing/compute_config_gce.yaml b/release/air_tests/frequent_pausing/compute_config_gce.yaml new file mode 100644 index 000000000000..62833efbd455 --- /dev/null +++ b/release/air_tests/frequent_pausing/compute_config_gce.yaml @@ -0,0 +1,12 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: n1-standard-2 + +worker_node_types: [] diff --git a/release/alpa_tests/1_p3_16xlarge.yaml b/release/alpa_tests/gpu_1x8_v100_aws.yaml similarity index 100% rename from release/alpa_tests/1_p3_16xlarge.yaml rename to release/alpa_tests/gpu_1x8_v100_aws.yaml diff --git a/release/alpa_tests/gpu_1x8_v100_gce.yaml b/release/alpa_tests/gpu_1x8_v100_gce.yaml new file mode 100644 index 000000000000..4d5a6a692a70 --- /dev/null +++ b/release/alpa_tests/gpu_1x8_v100_gce.yaml @@ -0,0 +1,20 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 1 + +head_node_type: + name: head_node + instance_type: n1-highmem-64-nvidia-tesla-v100-8 + +worker_node_types: [] + +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 diff --git a/release/alpa_tests/2_g4dn_12xlarge.yaml b/release/alpa_tests/gpu_2x4_t4_aws.yaml similarity index 100% rename from release/alpa_tests/2_g4dn_12xlarge.yaml rename to release/alpa_tests/gpu_2x4_t4_aws.yaml diff --git 
a/release/alpa_tests/gpu_2x4_t4_gce.yaml b/release/alpa_tests/gpu_2x4_t4_gce.yaml new file mode 100644 index 000000000000..c831b68a400f --- /dev/null +++ b/release/alpa_tests/gpu_2x4_t4_gce.yaml @@ -0,0 +1,25 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 2 + +head_node_type: + name: head_node + instance_type: n1-standard-64-nvidia-tesla-t4-4 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-64-nvidia-tesla-t4-4 + min_workers: 1 + max_workers: 1 + use_spot: false + +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 diff --git a/release/alpa_tests/run_inference_opt_30b.sh b/release/alpa_tests/run_inference_opt_30b.sh index 24d5cc261450..2430fcf78da0 100755 --- a/release/alpa_tests/run_inference_opt_30b.sh +++ b/release/alpa_tests/run_inference_opt_30b.sh @@ -3,9 +3,35 @@ # Integration test for Alpa and Ray. # Exit if any of the test commands fail. -set -x -e pipeline +set -x -e -o pipefail + +# Parse command line args +STORAGE_PROVIDER="aws" + +while [[ $# -gt 0 ]] +do + key="$1" + case $key in + --storage) + STORAGE_PROVIDER="$2" + shift + shift + ;; + *) # Unknown option + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [ "$STORAGE_PROVIDER" != "aws" ] && [ "$STORAGE_PROVIDER" != "gcs" ]; then + echo "Invalid storage provider: $STORAGE_PROVIDER" + exit 1 +fi S3_MODEL_DIR=s3://air-example-data-2/alpa/opt/models/models--facebook--opt-30b/ +GS_MODEL_DIR=gs://air-example-data/alpa/opt/models/models--facebook--opt-30b/ + LOCAL_MODEL_DIR=/tmp/opt-30b/ mkdir -p $LOCAL_MODEL_DIR @@ -13,7 +39,11 @@ mkdir -p $LOCAL_MODEL_DIR # Download weights and tokenizer. Excluding the original # FLAX weights. We only need the alpa converted np weights # for this test. 
-aws s3 sync $S3_MODEL_DIR $LOCAL_MODEL_DIR --exclude="*.msgpack" +if [ "$STORAGE_PROVIDER" = "aws" ]; then + aws s3 sync $S3_MODEL_DIR $LOCAL_MODEL_DIR --exclude="*.msgpack" +else + gsutil rsync -r -x ".*\.msgpack" $GS_MODEL_DIR $LOCAL_MODEL_DIR +fi # Run training. python inference_opt_30b.py --model_dir $LOCAL_MODEL_DIR diff --git a/release/alpa_tests/run_train_opt_2_7b.sh b/release/alpa_tests/run_train_opt_2_7b.sh index 96885235bad0..90e43a94d2dc 100755 --- a/release/alpa_tests/run_train_opt_2_7b.sh +++ b/release/alpa_tests/run_train_opt_2_7b.sh @@ -3,19 +3,54 @@ # Integration test for Alpa and Ray. # Exit if any of the test commands fail. -set -x -e pipeline +set -x -e -o pipefail -TRAIN_FILE=https://air-example-data-2.s3.us-west-2.amazonaws.com/alpa/alllines.txt +# Parse command line args +STORAGE_PROVIDER="aws" + +while [[ $# -gt 0 ]] +do + key="$1" + case $key in + --storage) + STORAGE_PROVIDER="$2" + shift + shift + ;; + *) # Unknown option + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +if [ "$STORAGE_PROVIDER" != "aws" ] && [ "$STORAGE_PROVIDER" != "gcs" ]; then + echo "Invalid storage provider: $STORAGE_PROVIDER" + exit 1 +fi + +S3_TRAIN_FILE="https://air-example-data-2.s3.us-west-2.amazonaws.com/alpa/alllines.txt" S3_MODEL_DIR=s3://air-example-data-2/alpa/opt/models/models--facebook--opt-2.7b/ + +GS_TRAIN_FILE="https://storage.googleapis.com/air-example-data/alpa/alllines.txt" +GS_MODEL_DIR=gs://air-example-data/alpa/opt/models/models--facebook--opt-2.7b/ + LOCAL_MODEL_DIR=/tmp/opt-2.7b/ OUTPUT_DIR=/tmp/alpa_outputs/ mkdir -p $LOCAL_MODEL_DIR mkdir -p $OUTPUT_DIR + # Download weights and tokenizer. # We only need the FLAX weights to run this test. 
-aws s3 sync $S3_MODEL_DIR $LOCAL_MODEL_DIR --exclude="*.bin,*.h5" +if [ "$STORAGE_PROVIDER" = "aws" ]; then + aws s3 sync $S3_MODEL_DIR $LOCAL_MODEL_DIR --exclude="*.bin,*.h5" + TRAIN_FILE=$S3_TRAIN_FILE +else + gsutil rsync -r -x ".*\.bin|.*\.h5" $GS_MODEL_DIR $LOCAL_MODEL_DIR + TRAIN_FILE=$GS_TRAIN_FILE +fi # Run training. # 2 instances, 4 GPUs each. So set the pipeline parallelism to 2, diff --git a/release/benchmark-worker-startup/benchmark_worker_startup.py b/release/benchmark-worker-startup/benchmark_worker_startup.py index 8c0c1d2119ef..67ae1e3c05f3 100755 --- a/release/benchmark-worker-startup/benchmark_worker_startup.py +++ b/release/benchmark-worker-startup/benchmark_worker_startup.py @@ -172,7 +172,10 @@ def generate_test_matrix( for with_tasks in [True, False]: for with_gpu in [True, False]: - for with_runtime_env in [True, False]: + # Do not run without runtime env. TODO(cade) Infra team added cgroups to + # default runtime env, need to find some way around that if we want + # "pure" (non-runtime-env) measurements. 
+ for with_runtime_env in [True]: for import_to_try in imports_to_try: for num_jobs in num_jobs_per_type.values(): diff --git a/release/benchmarks/distributed/dashboard_test.py b/release/benchmarks/distributed/dashboard_test.py index 36fb815b9acb..4fdde6973829 100644 --- a/release/benchmarks/distributed/dashboard_test.py +++ b/release/benchmarks/distributed/dashboard_test.py @@ -8,7 +8,7 @@ import logging from collections import defaultdict -from ray.experimental.state.api import list_nodes +from ray.util.state import list_nodes from ray._private.test_utils import fetch_prometheus_metrics from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from pydantic import BaseModel diff --git a/release/benchmarks/distributed/test_many_tasks.py b/release/benchmarks/distributed/test_many_tasks.py index bb7639700534..8a6864a28a08 100644 --- a/release/benchmarks/distributed/test_many_tasks.py +++ b/release/benchmarks/distributed/test_many_tasks.py @@ -4,7 +4,7 @@ import time import tqdm -from ray.experimental.state.api import summarize_tasks +from ray.util.state import summarize_tasks from dashboard_test import DashboardTestAtScale from ray._private.state_api_test_utils import ( StateAPICallSpec, diff --git a/release/benchmarks/distributed/test_scheduling.py b/release/benchmarks/distributed/test_scheduling.py index 2e4dd138d7a7..619a6efea0ee 100644 --- a/release/benchmarks/distributed/test_scheduling.py +++ b/release/benchmarks/distributed/test_scheduling.py @@ -3,6 +3,7 @@ from time import time, sleep from math import floor from ray._private.test_utils import safe_write_to_results_json +import ray._private.test_utils as test_utils @ray.remote @@ -86,6 +87,7 @@ def start_actor(num_actors, num_actors_per_nodes, job): ) ray.init(address="auto") + monitor_actor = test_utils.monitor_memory_usage() total_cpus_per_node = [node["Resources"].get("CPU", 0) for node in ray.nodes()] num_nodes = len(total_cpus_per_node) @@ -104,6 +106,12 @@ def start_actor(num_actors, 
num_actors_per_nodes, job): args.total_num_actors, args.num_actors_per_nodes, job ) + ray.get(monitor_actor.stop_run.remote()) + used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote()) + print(f"Peak memory usage: {round(used_gb, 2)}GB") + print(f"Peak memory usage per processes:\n {usage}") + del monitor_actor + result = { "total_num_task": args.total_num_task, "num_cpu_per_task": args.num_cpu_per_task, @@ -115,6 +123,8 @@ def start_actor(num_actors, num_actors_per_nodes, job): "submission_cost": submission_cost, "ready_cost": ready_cost, "actor_job_cost": actor_job_cost, + "_peak_memory": round(used_gb, 2), + "_peak_process_memory": usage, "_runtime": submission_cost + ready_cost + actor_job_cost, } diff --git a/release/benchmarks/object_store/test_object_store.py b/release/benchmarks/object_store/test_object_store.py index 2403be078a07..ef8fb5534080 100644 --- a/release/benchmarks/object_store/test_object_store.py +++ b/release/benchmarks/object_store/test_object_store.py @@ -28,8 +28,8 @@ class Actor: def foo(self): pass - def sum(self, arr): - return np.sum(arr) + def data_len(self, arr): + return len(arr) actors = [Actor.remote() for _ in range(NUM_NODES)] @@ -39,25 +39,28 @@ def sum(self, arr): for actor in tqdm(actors, desc="Ensure all actors have started."): ray.get(actor.foo.remote()) + start = perf_counter() result_refs = [] for actor in tqdm(actors, desc="Broadcasting objects"): - result_refs.append(actor.sum.remote(ref)) + result_refs.append(actor.data_len.remote(ref)) results = ray.get(result_refs) + end = perf_counter() + for result in results: assert result == OBJECT_SIZE + return end - start + ray.init(address="auto") -start = perf_counter() -test_object_broadcast() -end = perf_counter() -print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") +duration = test_object_broadcast() +print(f"Broadcast time: {duration} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") if "TEST_OUTPUT_JSON" in os.environ: out_file = 
open(os.environ["TEST_OUTPUT_JSON"], "w") results = { - "broadcast_time": end - start, + "broadcast_time": duration, "object_size": OBJECT_SIZE, "num_nodes": NUM_NODES, "success": "1", @@ -66,7 +69,7 @@ def sum(self, arr): results["perf_metrics"] = [ { "perf_metric_name": perf_metric_name, - "perf_metric_value": end - start, + "perf_metric_value": duration, "perf_metric_type": "LATENCY", } ] diff --git a/release/golden_notebook_tests/gpu_tpl.yaml b/release/golden_notebook_tests/gpu_tpl_aws.yaml similarity index 100% rename from release/golden_notebook_tests/gpu_tpl.yaml rename to release/golden_notebook_tests/gpu_tpl_aws.yaml diff --git a/release/golden_notebook_tests/gpu_tpl_gce.yaml b/release/golden_notebook_tests/gpu_tpl_gce.yaml new file mode 100644 index 000000000000..ff65fb25f5fd --- /dev/null +++ b/release/golden_notebook_tests/gpu_tpl_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 2 + +head_node_type: + name: head_node + instance_type: n1-standard-4 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + min_workers: 2 + max_workers: 2 + use_spot: true diff --git a/release/jobs_tests/workloads/jobs_check_cuda_available.py b/release/jobs_tests/workloads/jobs_check_cuda_available.py index 7489cae88afe..06205c908759 100644 --- a/release/jobs_tests/workloads/jobs_check_cuda_available.py +++ b/release/jobs_tests/workloads/jobs_check_cuda_available.py @@ -37,7 +37,7 @@ def f(): @ray.remote(num_cpus=1, scheduling_strategy="SPREAD") def get_node_id(): - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() node_ids = set(ray.get([get_node_id.remote() for _ in range(100)])) diff --git a/release/jobs_tests/workloads/jobs_remote_multi_node.py b/release/jobs_tests/workloads/jobs_remote_multi_node.py index 0d5c6a2b2677..bf7169ee4d2d 100644 --- a/release/jobs_tests/workloads/jobs_remote_multi_node.py +++ 
b/release/jobs_tests/workloads/jobs_remote_multi_node.py @@ -23,7 +23,7 @@ @ray.remote(num_cpus=1) def get_node_id(): - return ray.get_runtime_context().node_id + return ray.get_runtime_context().get_node_id() # Allow one fewer node in case a node fails to come up. diff --git a/release/lightning_tests/compute_tpl.yaml b/release/lightning_tests/compute_tpl_aws.yaml similarity index 100% rename from release/lightning_tests/compute_tpl.yaml rename to release/lightning_tests/compute_tpl_aws.yaml diff --git a/release/lightning_tests/compute_tpl_gce.yaml b/release/lightning_tests/compute_tpl_gce.yaml new file mode 100644 index 000000000000..22697c374da9 --- /dev/null +++ b/release/lightning_tests/compute_tpl_gce.yaml @@ -0,0 +1,24 @@ +# 3 x n1-standard-16 (1 T4 GPU each) = 48 CPUs + 3 GPUs total + +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +head_node_type: + name: head_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 + min_workers: 2 + max_workers: 2 + use_spot: false + +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: ttl-hours +# Value: '24' diff --git a/release/lightning_tests/workloads/lightning_test_utils.py b/release/lightning_tests/workloads/lightning_test_utils.py index 885954b1e5b9..150e2bc3e23a 100644 --- a/release/lightning_tests/workloads/lightning_test_utils.py +++ b/release/lightning_tests/workloads/lightning_test_utils.py @@ -39,8 +39,11 @@ def validation_step(self, val_batch, batch_idx): def validation_epoch_end(self, outputs): avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() - self.log("ptl/val_loss", avg_loss, sync_dist=True) - self.log("ptl/val_accuracy", avg_acc, sync_dist=True) + + # TODO(yunxuanx): change this back to ptl/val_loss after + # we resolved the metric unpacking issue + self.log("val_loss", avg_loss,
sync_dist=True) + self.log("val_accuracy", avg_acc, sync_dist=True) def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) diff --git a/release/lightning_tests/workloads/test_trainer.py b/release/lightning_tests/workloads/test_trainer.py index 845ff2b4c5e8..117f6ee85ab9 100644 --- a/release/lightning_tests/workloads/test_trainer.py +++ b/release/lightning_tests/workloads/test_trainer.py @@ -23,7 +23,7 @@ logger=CSVLogger("logs", name="my_exp_name"), ) .fit_params(datamodule=MNISTDataModule(batch_size=128)) - .checkpointing(monitor="ptl/val_accuracy", mode="max", save_last=True) + .checkpointing(monitor="val_accuracy", mode="max", save_last=True) .build() ) @@ -41,7 +41,7 @@ taken = time.time() - start result = { "time_taken": taken, - "ptl/val_accuracy": result.metrics["ptl/val_accuracy"], + "val_accuracy": result.metrics["val_accuracy"], } test_output_json = os.environ.get( "TEST_OUTPUT_JSON", "/tmp/lightning_trainer_test.json" diff --git a/release/lightning_tests/workloads/test_tuner.py b/release/lightning_tests/workloads/test_tuner.py index cc6c6b79d0ab..36ff2b257dc4 100644 --- a/release/lightning_tests/workloads/test_tuner.py +++ b/release/lightning_tests/workloads/test_tuner.py @@ -29,7 +29,7 @@ logger=CSVLogger("logs", name="my_exp_name"), ) .fit_params(datamodule=MNISTDataModule(batch_size=200)) - .checkpointing(monitor="ptl/val_accuracy", mode="max") + .checkpointing(monitor="val_accuracy", mode="max") .build() ) @@ -57,12 +57,12 @@ verbose=2, checkpoint_config=CheckpointConfig( num_to_keep=2, - checkpoint_score_attribute="ptl/val_accuracy", + checkpoint_score_attribute="val_accuracy", checkpoint_score_order="max", ), ), tune_config=tune.TuneConfig( - metric="ptl/val_accuracy", + metric="val_accuracy", mode="max", num_samples=2, scheduler=PopulationBasedTraining( @@ -73,7 +73,7 @@ ), ) results = tuner.fit() - best_result = results.get_best_result(metric="ptl/val_accuracy", mode="max") + best_result = 
results.get_best_result(metric="val_accuracy", mode="max") best_result assert len(results.errors) == 0 @@ -83,7 +83,7 @@ # Report experiment results result = { "time_taken": taken, - "ptl/val_accuracy": best_result.metrics["ptl/val_accuracy"], + "val_accuracy": best_result.metrics["val_accuracy"], } test_output_json = os.environ.get( diff --git a/release/ml_user_tests/horovod/compute_tpl.yaml b/release/ml_user_tests/horovod/compute_tpl_aws.yaml similarity index 100% rename from release/ml_user_tests/horovod/compute_tpl.yaml rename to release/ml_user_tests/horovod/compute_tpl_aws.yaml diff --git a/release/ml_user_tests/horovod/compute_tpl_gce.yaml b/release/ml_user_tests/horovod/compute_tpl_gce.yaml new file mode 100644 index 000000000000..2cad8d220fba --- /dev/null +++ b/release/ml_user_tests/horovod/compute_tpl_gce.yaml @@ -0,0 +1,24 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 3 + +head_node_type: + name: head_node + instance_type: n1-standard-4 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + max_workers: 3 + min_workers: 3 + use_spot: false + +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: ttl-hours +# Value: '24' diff --git a/release/ml_user_tests/ray-lightning/compute_tpl.yaml b/release/ml_user_tests/ray-lightning/compute_tpl_aws.yaml similarity index 100% rename from release/ml_user_tests/ray-lightning/compute_tpl.yaml rename to release/ml_user_tests/ray-lightning/compute_tpl_aws.yaml diff --git a/release/ml_user_tests/ray-lightning/compute_tpl_gce.yaml b/release/ml_user_tests/ray-lightning/compute_tpl_gce.yaml new file mode 100644 index 000000000000..ffe5cfde17dc --- /dev/null +++ b/release/ml_user_tests/ray-lightning/compute_tpl_gce.yaml @@ -0,0 +1,24 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 2 + +head_node_type: + name: head_node + instance_type: 
n1-standard-32-nvidia-tesla-t4-2 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + min_workers: 2 + max_workers: 2 + use_spot: false + +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: ttl-hours +# Value: '24' diff --git a/release/ml_user_tests/ray-lightning/driver_requirements.txt b/release/ml_user_tests/ray-lightning/driver_requirements.txt old mode 100755 new mode 100644 index e7ab66c99970..082b6e3ef034 --- a/release/ml_user_tests/ray-lightning/driver_requirements.txt +++ b/release/ml_user_tests/ray-lightning/driver_requirements.txt @@ -1,3 +1,3 @@ -torch==1.11.0 -torchvision==0.12.0 +torch==1.13.1 +torchvision==0.14.1 pytorch-lightning \ No newline at end of file diff --git a/release/ml_user_tests/ray-lightning/ray_lightning_user_test.py b/release/ml_user_tests/ray-lightning/ray_lightning_user_test.py index 5357f9b5ba63..ee78efc9746e 100644 --- a/release/ml_user_tests/ray-lightning/ray_lightning_user_test.py +++ b/release/ml_user_tests/ray-lightning/ray_lightning_user_test.py @@ -8,23 +8,13 @@ if __name__ == "__main__": start = time.time() - addr = os.environ.get("RAY_ADDRESS") - job_name = os.environ.get("RAY_JOB_NAME", "ray_lightning_user_test") - - # Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on - # anyscale_default_cloud. - # See https://github.com/pytorch/pytorch/issues/68893 for more details. # Passing in runtime_env to ray.init() will also set it for all the # workers. 
runtime_env = { - "env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}, "working_dir": os.path.dirname(__file__), } - if addr.startswith("anyscale://"): - ray.init(address=addr, job_name=job_name, runtime_env=runtime_env) - else: - ray.init(address="auto", runtime_env=runtime_env) + ray.init(address="auto", runtime_env=runtime_env) main(num_workers=6, use_gpu=True, max_steps=50) diff --git a/release/ml_user_tests/train/compute_tpl.yaml b/release/ml_user_tests/train/compute_tpl_aws.yaml similarity index 100% rename from release/ml_user_tests/train/compute_tpl.yaml rename to release/ml_user_tests/train/compute_tpl_aws.yaml diff --git a/release/ml_user_tests/train/compute_tpl_gce.yaml b/release/ml_user_tests/train/compute_tpl_gce.yaml new file mode 100644 index 000000000000..57049d10efd4 --- /dev/null +++ b/release/ml_user_tests/train/compute_tpl_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 2 + +head_node_type: + name: head_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-32-nvidia-tesla-t4-2 + min_workers: 2 + max_workers: 2 + use_spot: false diff --git a/release/ml_user_tests/tune_rllib/compute_tpl.yaml b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml similarity index 100% rename from release/ml_user_tests/tune_rllib/compute_tpl.yaml rename to release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml diff --git a/release/ml_user_tests/tune_rllib/compute_tpl_gce.yaml b/release/ml_user_tests/tune_rllib/compute_tpl_gce.yaml new file mode 100644 index 000000000000..dbab3926c68d --- /dev/null +++ b/release/ml_user_tests/tune_rllib/compute_tpl_gce.yaml @@ -0,0 +1,31 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 8 + +head_node_type: + name: head_node + instance_type: n1-standard-4 + +# We should be good with 2 GPUs and 50 CPUs. 
+worker_node_types: + - name: worker_node_cpu + instance_type: n1-standard-4 # 4 CPU + min_workers: 6 + max_workers: 6 + use_spot: false + - name: worker_node_gpu + instance_type: n1-standard-16-nvidia-tesla-t4-1 # 1 GPU and 16 CPU + min_workers: 2 + max_workers: 2 + use_spot: false + +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 diff --git a/release/ml_user_tests/xgboost/tpl_gpu_small_scaling.yaml b/release/ml_user_tests/xgboost/tpl_gpu_small_scaling_aws.yaml similarity index 100% rename from release/ml_user_tests/xgboost/tpl_gpu_small_scaling.yaml rename to release/ml_user_tests/xgboost/tpl_gpu_small_scaling_aws.yaml diff --git a/release/ml_user_tests/xgboost/tpl_gpu_small_scaling_gce.yaml b/release/ml_user_tests/xgboost/tpl_gpu_small_scaling_gce.yaml new file mode 100644 index 000000000000..a08bbb742f72 --- /dev/null +++ b/release/ml_user_tests/xgboost/tpl_gpu_small_scaling_gce.yaml @@ -0,0 +1,18 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 4 + +head_node_type: + name: head_node + instance_type: n1-standard-4 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-16-nvidia-tesla-t4-1 + min_workers: 4 + max_workers: 4 + use_spot: false + diff --git a/release/ml_user_tests/xgboost/train_gpu_connect.py b/release/ml_user_tests/xgboost/train_gpu_connect.py index 4a2877a97b77..b3b6188636da 100644 --- a/release/ml_user_tests/xgboost/train_gpu_connect.py +++ b/release/ml_user_tests/xgboost/train_gpu_connect.py @@ -15,26 +15,16 @@ if __name__ == "__main__": os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "1200" - addr = os.environ.get("RAY_ADDRESS") - job_name = os.environ.get("RAY_JOB_NAME", "train_gpu_connect") - - # Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on - # anyscale_default_cloud. - # See https://github.com/pytorch/pytorch/issues/68893 for more details. 
# Passing in runtime_env to ray.init() will also set it for all the # workers. runtime_env = { "env_vars": { "RXGB_PLACEMENT_GROUP_TIMEOUT_S": "1200", - "NCCL_SOCKET_IFNAME": "ens3", }, "working_dir": os.path.dirname(__file__), } - if addr.startswith("anyscale://"): - ray.init(address=addr, job_name=job_name, runtime_env=runtime_env) - else: - ray.init(address="auto", runtime_env=runtime_env) + ray.init(address="auto", runtime_env=runtime_env) from xgboost_ray import RayParams from release_test_util import train_ray, get_parquet_files diff --git a/release/nightly_tests/chaos_test/compute_template.yaml b/release/nightly_tests/chaos_test/compute_template.yaml index c1319d3c5660..925f482cdf12 100644 --- a/release/nightly_tests/chaos_test/compute_template.yaml +++ b/release/nightly_tests/chaos_test/compute_template.yaml @@ -15,7 +15,7 @@ worker_node_types: instance_type: m5.4xlarge min_workers: 9 max_workers: 9 - use_spot: true + use_spot: false resources: custom_resources: worker: 1 diff --git a/release/nightly_tests/chaos_test/compute_template_gce.yaml b/release/nightly_tests/chaos_test/compute_template_gce.yaml index af329090a4d3..55d4fd840415 100644 --- a/release/nightly_tests/chaos_test/compute_template_gce.yaml +++ b/release/nightly_tests/chaos_test/compute_template_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: instance_type: n2-standard-16 min_workers: 9 max_workers: 9 - use_spot: true + use_spot: false resources: custom_resources: worker: 1 diff --git a/release/nightly_tests/chaos_test/test_chaos_basic.py b/release/nightly_tests/chaos_test/test_chaos_basic.py index 5c53e5959ba7..dd8213e2ec63 100644 --- a/release/nightly_tests/chaos_test/test_chaos_basic.py +++ b/release/nightly_tests/chaos_test/test_chaos_basic.py @@ -11,6 +11,7 @@ import ray from ray._private.test_utils import monitor_memory_usage, wait_for_condition from ray.data._internal.progress_bar import ProgressBar +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy def 
run_task_workload(total_num_cpus, smoke): @@ -87,9 +88,13 @@ def add(self, letter): if smoke: multiplier = 1 TOTAL_TASKS = int(300 * multiplier) - current_node_ip = ray._private.worker.global_worker.node_ip_address + head_node_id = ray.get_runtime_context().get_node_id() db_actors = [ - DBActor.options(resources={f"node:{current_node_ip}": 0.001}).remote() + DBActor.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=head_node_id, soft=False + ) + ).remote() for _ in range(NUM_CPUS) ] @@ -186,6 +191,7 @@ def main(): print("Warm up... Prestarting workers if necessary.") start = time.time() workload(total_num_cpus, args.smoke) + print(f"Runtime when warm up: {time.time() - start}") # Step 2 print("Running without failures") diff --git a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml index 1eb827083779..6c098b2f2167 100644 --- a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml +++ b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml @@ -10,13 +10,13 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge resources: cpu: 8 worker_node_types: - name: worker_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge min_workers: 32 max_workers: 32 use_spot: false diff --git a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml index 1368f33934ea..e249486f0377 100644 --- a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml @@ -10,11 +10,11 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge worker_node_types: - name: worker_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge min_workers: 20 max_workers: 20 - use_spot: true + use_spot: false diff --git 
a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml index 4495d182e64a..da67eec060c4 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -10,6 +10,6 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge worker_node_types: [] diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml index 988a2298a0fb..e249486f0377 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml @@ -10,11 +10,11 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge worker_node_types: - name: worker_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge min_workers: 20 max_workers: 20 use_spot: false diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_gce.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_gce.yaml index 7c0c9098a4b7..2302c7030951 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_gce.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_gce.yaml @@ -1,14 +1,15 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-1 allowed_azs: -- us-west1-c + - us-west1-c -aws: - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - DeleteOnTermination: true - VolumeSize: 500 +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 head_node_type: name: head_node diff --git a/release/nightly_tests/dataset/aggregate_benchmark.py b/release/nightly_tests/dataset/aggregate_benchmark.py index a2d2a929873e..af4f551e614e 100644 --- 
a/release/nightly_tests/dataset/aggregate_benchmark.py +++ b/release/nightly_tests/dataset/aggregate_benchmark.py @@ -2,8 +2,8 @@ import ray from ray.data.aggregate import _AggregateOnKeyBase, Max, Mean, Min, Sum -from ray.data.block import Block, KeyFn -from ray.data.datastream import Dataset +from ray.data.block import Block +from ray.data.dataset import Dataset import pyarrow.compute as pac from benchmark import Benchmark @@ -73,7 +73,10 @@ def h2oai_q6(ds: Dataset) -> Dataset: def h2oai_q7(ds: Dataset) -> Dataset: ds = ds.groupby("id3").aggregate(Max("v1"), Min("v2")) - ds = ds.map_batches(lambda df: df.assign(result=df["max(v1)"] - df["min(v2)"])) + ds = ds.map_batches( + lambda df: df.assign(result=df["max(v1)"] - df["min(v2)"]), + batch_format="pandas", + ) return ds @@ -99,7 +102,7 @@ def merge( return (value1, value2) class Top2(_AggregateOnKeyBase): - def __init__(self, on: KeyFn): + def __init__(self, on): self._set_key_fn(on) super().__init__( init=lambda _: (float("-inf"), float("-inf")), diff --git a/release/nightly_tests/dataset/app_config.yaml b/release/nightly_tests/dataset/app_config.yaml index 613fd9e44294..12e8cd86d84b 100644 --- a/release/nightly_tests/dataset/app_config.yaml +++ b/release/nightly_tests/dataset/app_config.yaml @@ -3,7 +3,6 @@ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightl python: pip_packages: - boto3 - - pyarrow<7.0.0 - tqdm conda_packages: [] diff --git a/release/nightly_tests/dataset/benchmark.py b/release/nightly_tests/dataset/benchmark.py index b40c8e599b26..0b650e72c5ec 100644 --- a/release/nightly_tests/dataset/benchmark.py +++ b/release/nightly_tests/dataset/benchmark.py @@ -4,7 +4,7 @@ import time from typing import Callable -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset class Benchmark: diff --git a/release/nightly_tests/dataset/data_ingest_benchmark.py b/release/nightly_tests/dataset/data_ingest_benchmark.py index db91cd7856a3..4cf3e6bc91bf 100644 --- 
a/release/nightly_tests/dataset/data_ingest_benchmark.py +++ b/release/nightly_tests/dataset/data_ingest_benchmark.py @@ -10,23 +10,39 @@ from ray.data import DatasetPipeline import pandas as pd +import torch GiB = 1024 * 1024 * 1024 -@ray.remote(num_cpus=0.5) +@ray.remote class ConsumingActor: def __init__(self, rank): self._rank = rank - def consume(self, split): - DoConsume(split, self._rank) + def consume( + self, + split, + use_gpu=False, + max_bytes_to_read=None, + ): + do_consume( + split, + self._rank, + use_gpu, + max_bytes_to_read, + ) def get_location(self): return ray.get_runtime_context().get_node_id() -def DoConsume(split, rank): +def do_consume( + split, + rank, + use_gpu=False, + max_bytes_to_read=None, +): prefetch_batches = 1 batch_size = 4096 num_epochs = 1 @@ -56,9 +72,16 @@ def generate_epochs(data, epochs: int): prefetch_blocks=prefetch_batches, batch_size=batch_size ) else: - batch_iterator = epoch_data.iter_batches( - prefetch_batches=prefetch_batches, batch_size=batch_size - ) + if not use_gpu: + batch_iterator = epoch_data.iter_batches( + prefetch_batches=prefetch_batches, batch_size=batch_size + ) + else: + batch_iterator = epoch_data.iter_torch_batches( + prefetch_batches=prefetch_batches, + batch_size=batch_size, + device="cuda", + ) for batch in batch_iterator: batch_delay = time.perf_counter() - batch_start @@ -68,11 +91,19 @@ def generate_epochs(data, epochs: int): bytes_read += int(batch.memory_usage(index=True, deep=True).sum()) elif isinstance(batch, np.ndarray): bytes_read += batch.nbytes + elif isinstance(batch, dict) and isinstance( + batch.get("data"), torch.Tensor + ): + tensor = batch["data"] + bytes_read += tensor.element_size() * tensor.nelement() else: # NOTE: This isn't recursive and will just return the size of # the object pointers if list of non-primitive types. 
bytes_read += sys.getsizeof(batch) batch_start = time.perf_counter() + if max_bytes_to_read is not None: + if bytes_read >= max_bytes_to_read: + break delta = time.perf_counter() - start print("Time to read all data", delta, "seconds") @@ -94,7 +125,7 @@ def generate_epochs(data, epochs: int): def make_ds(size_gb: int, parallelism: int = -1): # Dataset of 10KiB tensor records. total_size = 1024 * 1024 * 1024 * size_gb - record_dim = 1280 + record_dim = 1024 record_size = record_dim * 8 num_records = int(total_size / record_size) dataset = ray.data.range_tensor( @@ -104,26 +135,40 @@ def make_ds(size_gb: int, parallelism: int = -1): return dataset -def run_ingest_streaming(dataset_size_gb, num_workers): +def run_ingest_streaming(dataset_size_gb, num_workers, use_gpu, early_stop): ds = make_ds(dataset_size_gb) + resources = {"num_cpus": 0.5} + if use_gpu: + resources["num_gpus"] = 0.5 consumers = [ - ConsumingActor.options(scheduling_strategy="SPREAD").remote(i) + ConsumingActor.options(scheduling_strategy="SPREAD", **resources).remote(i) for i in range(num_workers) ] locality_hints = ray.get([actor.get_location.remote() for actor in consumers]) - ds = ds.map_batches(lambda df: df * 2) + ds = ds.map_batches(lambda df: df * 2, batch_format="pandas") splits = ds.streaming_split(num_workers, equal=True, locality_hints=locality_hints) - future = [consumers[i].consume.remote(s) for i, s in enumerate(splits)] + max_bytes_to_read = None + if early_stop: + max_bytes_to_read = dataset_size_gb * GiB // num_workers // 2 + # Early stop when we've read half the dataset. 
+ future = [ + consumers[i].consume.remote( + s, + use_gpu, + max_bytes_to_read, + ) + for i, s in enumerate(splits) + ] ray.get(future) def run_ingest_bulk(dataset_size_gb, num_workers): ds = make_ds(dataset_size_gb, parallelism=200) consumers = [ - ConsumingActor.options(scheduling_strategy="SPREAD").remote(i) + ConsumingActor.options(scheduling_strategy="SPREAD", num_cpus=0.5).remote(i) for i in range(num_workers) ] - ds = ds.map_batches(lambda df: df * 2) + ds = ds.map_batches(lambda df: df * 2, batch_format="pandas") splits = ds.split(num_workers, equal=True, locality_hints=consumers) future = [consumers[i].consume.remote(s) for i, s in enumerate(splits)] ray.get(future) @@ -146,10 +191,14 @@ def run_ingest_bulk(dataset_size_gb, num_workers): def run_ingest_dataset_pipeline(dataset_size_gb, num_workers): ds = make_ds(dataset_size_gb) consumers = [ - ConsumingActor.options(scheduling_strategy="SPREAD").remote(i) + ConsumingActor.options(scheduling_strategy="SPREAD", num_cpus=0.5).remote(i) for i in range(num_workers) ] - p = ds.window(bytes_per_window=40 * GiB).repeat().map_batches(lambda df: df * 2) + p = ( + ds.window(bytes_per_window=40 * GiB) + .repeat() + .map_batches(lambda df: df * 2, batch_format="pandas") + ) splits = p.split(num_workers, equal=True, locality_hints=consumers) future = [consumers[i].consume.remote(s) for i, s in enumerate(splits)] ray.get(future) @@ -182,11 +231,15 @@ def run_ingest_dataset_pipeline(dataset_size_gb, num_workers): parser.add_argument("--dataset-size-gb", type=int, default=200) parser.add_argument("--streaming", action="store_true", default=False) parser.add_argument("--new_streaming", action="store_true", default=False) + parser.add_argument("--use-gpu", action="store_true", default=False) + parser.add_argument("--early-stop", action="store_true", default=False) args = parser.parse_args() start = time.time() if args.new_streaming: - run_ingest_streaming(args.dataset_size_gb, args.num_workers) + run_ingest_streaming( + 
args.dataset_size_gb, args.num_workers, args.use_gpu, args.early_stop + ) elif args.streaming: run_ingest_dataset_pipeline(args.dataset_size_gb, args.num_workers) else: diff --git a/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu.yaml b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu.yaml new file mode 100644 index 000000000000..5ab624706d30 --- /dev/null +++ b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu.yaml @@ -0,0 +1,15 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 3 + +head_node_type: + name: head_node + instance_type: m5.2xlarge + +worker_node_types: + - name: worker_node + instance_type: g4dn.4xlarge + max_workers: 2 + min_workers: 2 + use_spot: false diff --git a/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu_gce.yaml b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu_gce.yaml new file mode 100644 index 000000000000..58b28b6980ea --- /dev/null +++ b/release/nightly_tests/dataset/data_ingest_benchmark_compute_gpu_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 3 + +head_node_type: + name: head_node + instance_type: n2-standard-8 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-4-nvidia-tesla-t4-1 + max_workers: 2 + min_workers: 2 + use_spot: false diff --git a/release/nightly_tests/dataset/dataset_random_access.py b/release/nightly_tests/dataset/dataset_random_access.py index e67c6cc24864..dfe955520496 100644 --- a/release/nightly_tests/dataset/dataset_random_access.py +++ b/release/nightly_tests/dataset/dataset_random_access.py @@ -26,8 +26,8 @@ def main(): num_workers = 400 run_time = 15 - ds = ray.data.range_table(nrow, parallelism=parallelism) - rmap = ds.to_random_access_dataset("value", num_workers=num_workers) + ds = ray.data.range(nrow, parallelism=parallelism) + rmap = ds.to_random_access_dataset("id", num_workers=num_workers) 
print("Multiget throughput: ", end="") start = time.time() diff --git a/release/nightly_tests/dataset/dataset_shuffle_data_loader.py b/release/nightly_tests/dataset/dataset_shuffle_data_loader.py index 5c2aba492c7e..d2c1c1f4d062 100644 --- a/release/nightly_tests/dataset/dataset_shuffle_data_loader.py +++ b/release/nightly_tests/dataset/dataset_shuffle_data_loader.py @@ -5,13 +5,20 @@ import ray +from pyarrow import fs import numpy as np import torch -PATH = [ - f"s3://shuffling-data-loader-benchmarks/data/input_data_{i}.parquet.snappy" - for i in range(0, 25) -] +PATHS = { + "aws": [ + f"s3://shuffling-data-loader-benchmarks/data/input_data_{i}.parquet.snappy" + for i in range(0, 25) + ], + "gcp": [ + f"gcs://shuffling-data-loader-benchmarks/data/input_data_{i}.parquet.snappy" + for i in range(0, 25) + ], +} def create_parser(): @@ -26,6 +33,7 @@ def create_parser(): ) parser.add_argument("--num-workers", type=int, default=4) parser.add_argument("--repeat-times", type=int, default=16) + parser.add_argument("--cloud", type=str, choices=["aws", "gcp"]) return parser @@ -83,9 +91,14 @@ def create_torch_iterator(split, batch_size, rank=None): return torch_iterator -def create_dataset(filenames, repeat_times): +def create_dataset(filenames, repeat_times, cloud): + if cloud == "gcp": + filesystem = fs.GcsFileSystem() + else: + filesystem = None + pipeline = ( - ray.data.read_parquet(list(filenames)) + ray.data.read_parquet(list(filenames), filesystem=filesystem) .repeat(times=repeat_times) .random_shuffle_each_window() ) @@ -100,7 +113,7 @@ def create_dataset(filenames, repeat_times): start = time.time() - pipeline = create_dataset(PATH, args.repeat_times) + pipeline = create_dataset(PATHS[args.cloud], args.repeat_times, args.cloud) splits = pipeline.split(args.num_workers) @ray.remote(num_gpus=1) diff --git a/release/nightly_tests/dataset/inference.py b/release/nightly_tests/dataset/inference.py index ba1e4b1d9b11..4534c86a854e 100644 --- 
a/release/nightly_tests/dataset/inference.py +++ b/release/nightly_tests/dataset/inference.py @@ -1,122 +1,70 @@ -from io import BytesIO -from PIL import Image +import json +import os +import time +from typing import Any, Dict +import numpy as np import torch from torchvision import transforms from torchvision.models import resnet50 import ray -import boto3 -import json -import time -import os -from tqdm import tqdm -import numpy as np - - -class Preprocessor: - def __init__(self): - self.torch_transform = transforms.Compose( - [ - transforms.Resize(224), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Lambda(lambda t: t[:3, ...]), # remove alpha channel - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - - def __call__(self, img_bytes): - try: - img = Image.open(BytesIO(img_bytes)).convert("RGB") - tensor = self.torch_transform(img) - return tensor - except Exception as e: - raise e -class ImageModel: +class ImageClassifier: def __init__(self): self.model = resnet50(pretrained=True).eval().half().cuda() - def __call__(self, input_tensor_np): - input_tensor = torch.from_numpy(input_tensor_np).half().cuda() + def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]: + inputs = torch.from_numpy(batch["image"]).half().cuda() with torch.no_grad(): - output_tensor = self.model(input_tensor) - result = torch.argmax(output_tensor, dim=1).cpu() - return result.numpy() - - -def get_paths(bucket, path, max_files=100 * 1000): - s3 = boto3.resource("s3") - s3_objects = s3.Bucket(bucket).objects.filter(Prefix=path).limit(max_files).all() - materialized = [(obj.bucket_name, obj.key) for obj in tqdm(s3_objects)] - return materialized - - -def preprocess(batch): - preprocessor = Preprocessor() - return preprocessor(batch) - - -infer_initialized = False -model_fn = None - + outputs = self.model(inputs) + predictions = torch.argmax(outputs, dim=1).cpu() + batch["predictions"] = predictions + return batch 
+ + +transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] +) -def infer(batch): - global infer_initialized, model_fn - if not infer_initialized: - infer_initialized = True - model_fn = ImageModel() - ndarr_obj = batch.values - input_tensor_np = np.array([img.numpy() for img in ndarr_obj.reshape(-1)]) - return list(model_fn(input_tensor_np)) +def preprocess(record: Dict[str, Any]) -> Dict[str, Any]: + record["image"] = transform(record["image"]) + return record -ray.init() start_time = time.time() -print("Downloading...") -ds = ray.data.read_binary_files( - "s3://anyscale-data/small-images/", - parallelism=1000, - ray_remote_args={"num_cpus": 0.5}, +ds = ( + ray.data.read_images( + "s3://anyscale-data/small-images/", + parallelism=1000, + ray_remote_args={"num_cpus": 0.5}, + mode="RGB", + ) + .map(preprocess) + .map_batches( + ImageClassifier, + num_gpus=0.25, + batch_size=128, + compute=ray.data.ActorPoolStrategy(), + ) + .materialize() ) -# Do a blocking map so that we can measure the download time. -ds = ds.map(lambda x: x).materialize() - -end_download_time = time.time() -print("Preprocessing...") -ds = ds.map(preprocess).materialize() -end_preprocess_time = time.time() -print("Inferring...") -# NOTE: set a small batch size to avoid OOM on GRAM when doing inference. 
-ds = ds.map_batches( - infer, num_gpus=0.25, batch_size=128, batch_format="pandas", compute="actors" -).materialize() - -end_time = time.time() - -download_time = end_download_time - start_time -preprocess_time = end_preprocess_time - end_download_time -infer_time = end_time - end_preprocess_time -total_time = end_time - start_time -print("Download time", download_time) -print("Preprocess time", preprocess_time) -print("Infer time", infer_time) +total_time = time.time() - start_time print("total time", total_time) if "TEST_OUTPUT_JSON" in os.environ: out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") results = { - "download_time": download_time, - "preprocess_time": preprocess_time, - "inference_time": infer_time, "total_time": total_time, } json.dump(results, out_file) diff --git a/release/nightly_tests/dataset/iter_batches_benchmark.py b/release/nightly_tests/dataset/iter_batches_benchmark.py index 18c288f25f49..2720a733e2f8 100644 --- a/release/nightly_tests/dataset/iter_batches_benchmark.py +++ b/release/nightly_tests/dataset/iter_batches_benchmark.py @@ -2,7 +2,7 @@ from typing import Optional import ray -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from benchmark import Benchmark diff --git a/release/nightly_tests/dataset/iter_tensor_batches_benchmark.py b/release/nightly_tests/dataset/iter_tensor_batches_benchmark.py index f7bc69821b0a..9541adb412a6 100644 --- a/release/nightly_tests/dataset/iter_tensor_batches_benchmark.py +++ b/release/nightly_tests/dataset/iter_tensor_batches_benchmark.py @@ -3,7 +3,7 @@ from typing import Optional, Union, List import ray -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from benchmark import Benchmark @@ -67,10 +67,10 @@ def run_iter_tensor_batches_benchmark(benchmark: Benchmark, data_size_gb: int): # Add a label column. 
def add_label(batch): label = np.ones(shape=(len(batch), 1)) - batch["__value__"] = label + batch["label"] = label return batch - ds = ds.map_batches(add_label).materialize() + ds = ds.map_batches(add_label, batch_format="pandas").materialize() # Test iter_torch_batches() with default args. benchmark.run( @@ -86,7 +86,7 @@ def add_label(batch): to_tf, ds=ds, feature_columns="image", - label_columns="__value__", + label_columns="label", use_default_params=True, ) @@ -105,7 +105,7 @@ def add_label(batch): to_tf, ds=ds, feature_columns="image", - label_columns="__value__", + label_columns="label", batch_size=batch_size, ) @@ -139,7 +139,7 @@ def add_label(batch): to_tf, ds=ds, feature_columns="image", - label_columns="__value__", + label_columns="label", batch_size=batch_size, local_shuffle_buffer_size=shuffle_buffer_size, ) diff --git a/release/nightly_tests/dataset/map_batches_benchmark.py b/release/nightly_tests/dataset/map_batches_benchmark.py index 5518ceb6b27c..0a2bc8596e9d 100644 --- a/release/nightly_tests/dataset/map_batches_benchmark.py +++ b/release/nightly_tests/dataset/map_batches_benchmark.py @@ -3,7 +3,7 @@ import ray from ray.data._internal.compute import ActorPoolStrategy, ComputeStrategy -from ray.data.datastream import Dataset, MaterializedDatastream +from ray.data.dataset import Dataset, MaterializedDataset from benchmark import Benchmark @@ -22,7 +22,7 @@ def map_batches( is_eager_executed: Optional[bool] = False, ) -> Dataset: - assert isinstance(input_ds, MaterializedDatastream) + assert isinstance(input_ds, MaterializedDataset) ds = input_ds for _ in range(num_calls): @@ -72,9 +72,9 @@ def run_map_batches_benchmark(benchmark: Benchmark): # Test multiple calls of map_batches. 
for num_calls in num_calls_list: - for compute in ["tasks", ActorPoolStrategy(size=1)]: + for compute in [None, ActorPoolStrategy(size=1)]: batch_size = 4096 - if compute == "tasks": + if compute is None: compute_strategy = "tasks" else: compute_strategy = "actors" @@ -130,8 +130,13 @@ def run_map_batches_benchmark(benchmark: Benchmark): ).materialize() for batch_format in batch_formats: - for compute in ["tasks", "actors"]: - test_name = f"map-batches-{batch_format}-{compute}-multi-files" + for compute in [None, ActorPoolStrategy(min_size=1, max_size=float("inf"))]: + if compute is None: + compute_strategy = "tasks" + else: + compute_strategy = "actors" + test_name = f"map-batches-{batch_format}-{compute_strategy}-multi-files" + benchmark.run( test_name, map_batches, diff --git a/release/air_tests/air_benchmarks/compute_gce_gpu_4_g4_12xl.yaml b/release/nightly_tests/dataset/multi_node_benchmark_compute_gce.yaml similarity index 52% rename from release/air_tests/air_benchmarks/compute_gce_gpu_4_g4_12xl.yaml rename to release/nightly_tests/dataset/multi_node_benchmark_compute_gce.yaml index 4182417f7ce7..12b52948af59 100644 --- a/release/air_tests/air_benchmarks/compute_gce_gpu_4_g4_12xl.yaml +++ b/release/nightly_tests/dataset/multi_node_benchmark_compute_gce.yaml @@ -1,17 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: - - us-west1-b +allowed_azs: +- us-west1-c -max_workers: 3 +max_workers: 0 head_node_type: name: head_node - instance_type: n1-standard-64-nvidia-tesla-t4-4 # g4dn.12xlarge + instance_type: n2-standard-16 # m5.4xlarge worker_node_types: - name: worker_node - instance_type: n1-standard-64-nvidia-tesla-t4-4 # g4dn.12xlarge + instance_type: n2-standard-16 # m5.4xlarge max_workers: 3 min_workers: 3 use_spot: false diff --git a/release/nightly_tests/dataset/operator_fusion_benchmark.py b/release/nightly_tests/dataset/operator_fusion_benchmark.py index 6b2817f35741..c8b6ea579eb2 100644 --- 
a/release/nightly_tests/dataset/operator_fusion_benchmark.py +++ b/release/nightly_tests/dataset/operator_fusion_benchmark.py @@ -123,7 +123,7 @@ def _summarize_results(results: List[Dict[str, float]]) -> Dict[str, float]: "--ops-spec", type=str, default=( - '[{"op": "map_batches", "batch_size": 1024, "batch_format": "default"}]' + '[{"op": "map_batches", "batch_size": 1024, "batch_format": "pandas"}]' ), ) parser.add_argument("--target-max-block-size", type=int, default=None) diff --git a/release/nightly_tests/dataset/parquet_metadata_resolution.py b/release/nightly_tests/dataset/parquet_metadata_resolution.py index f3b59a554d92..119b2803ad61 100644 --- a/release/nightly_tests/dataset/parquet_metadata_resolution.py +++ b/release/nightly_tests/dataset/parquet_metadata_resolution.py @@ -5,6 +5,7 @@ parser = argparse.ArgumentParser(description="Parquet Metadata Read") parser.add_argument("--num-files", type=int, default=30) +parser.add_argument("--cloud", type=str, choices=["aws", "gcp"]) if __name__ == "__main__": @@ -16,11 +17,15 @@ num = args.num_files - files = [ - f"s3://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000" - f"/input_data_{i}.parquet.snappy" - for i in range(args.num_files) - ] + assert args.cloud in {"aws", "gcp"}, args.cloud + if args.cloud == "aws": + prefix = "s3://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000" + if args.cloud == "gcp": + # NOTE(@bveeramani): I made a mistake while transferring the files from S3 to + # GCS, so there's an extra "r10_000_000_000-f1000" in the URI. Don't worry about + # it. The files are the same. + prefix = "gs://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000/r10_000_000_000-f1000" # noqa: E501 + files = [f"{prefix}/input_data_{i}.parquet.snappy" for i in range(args.num_files)] start = time.time() ray.data.read_parquet(files).count() # This should only read Parquet metadata. 
diff --git a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml index 65d377d83643..b8b25b2def6c 100644 --- a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml +++ b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml @@ -12,16 +12,16 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge worker_node_types: - name: memory_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge min_workers: 20 max_workers: 20 use_spot: false - name: gpu_node - instance_type: i3.8xlarge + instance_type: m6i.8xlarge min_workers: 4 max_workers: 4 use_spot: false diff --git a/release/nightly_tests/dataset/pipelined_ingestion_compute_gce.yaml b/release/nightly_tests/dataset/pipelined_ingestion_compute_gce.yaml index dc4aea7d096a..4c9c2a497ccd 100644 --- a/release/nightly_tests/dataset/pipelined_ingestion_compute_gce.yaml +++ b/release/nightly_tests/dataset/pipelined_ingestion_compute_gce.yaml @@ -5,12 +5,13 @@ allowed_azs: max_workers: 999 -aws: - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - DeleteOnTermination: true - VolumeSize: 500 +gcp_advanced_configurations_json: + instance_properties: + disks: + - boot: true + auto_delete: true + initialize_params: + disk_size_gb: 500 head_node_type: name: head_node diff --git a/release/nightly_tests/dataset/pipelined_training.py b/release/nightly_tests/dataset/pipelined_training.py index b3f3b55b4a85..01c9dce0a1e9 100644 --- a/release/nightly_tests/dataset/pipelined_training.py +++ b/release/nightly_tests/dataset/pipelined_training.py @@ -14,7 +14,7 @@ from ray_shuffling_data_loader.data_generation import DATA_SPEC from ray_shuffling_data_loader.embedding_model import MyModel, annotation, huber_loss -from ray.data.datastream_pipeline import DatasetPipeline +from ray.data import DatasetPipeline # Training settings parser = argparse.ArgumentParser(description="Dataset ingestion Example") 
diff --git a/release/nightly_tests/dataset/pipelined_training_compute.yaml b/release/nightly_tests/dataset/pipelined_training_compute.yaml index 966765049ff8..d666b8274f5a 100644 --- a/release/nightly_tests/dataset/pipelined_training_compute.yaml +++ b/release/nightly_tests/dataset/pipelined_training_compute.yaml @@ -14,11 +14,11 @@ aws: head_node_type: name: head_node - instance_type: i3.8xlarge + instance_type: m6i.16xlarge worker_node_types: - - name: memory_node - instance_type: i3.8xlarge + - name: memory_node + instance_type: m6i.16xlarge min_workers: 10 max_workers: 10 use_spot: false diff --git a/release/nightly_tests/dataset/read_images_benchmark.py b/release/nightly_tests/dataset/read_images_benchmark.py index 111fb3cd8b82..7eb4c8e26a6a 100644 --- a/release/nightly_tests/dataset/read_images_benchmark.py +++ b/release/nightly_tests/dataset/read_images_benchmark.py @@ -7,7 +7,7 @@ from PIL import Image import ray -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from benchmark import Benchmark diff --git a/release/nightly_tests/dataset/read_parquet_benchmark.py b/release/nightly_tests/dataset/read_parquet_benchmark.py index 3205b9a18d55..6e6fff795627 100644 --- a/release/nightly_tests/dataset/read_parquet_benchmark.py +++ b/release/nightly_tests/dataset/read_parquet_benchmark.py @@ -1,5 +1,5 @@ import ray -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from benchmark import Benchmark from parquet_data_generator import generate_data diff --git a/release/nightly_tests/dataset/read_tfrecords_benchmark.py b/release/nightly_tests/dataset/read_tfrecords_benchmark.py index 358a92f7ce4b..46526fad3fc1 100644 --- a/release/nightly_tests/dataset/read_tfrecords_benchmark.py +++ b/release/nightly_tests/dataset/read_tfrecords_benchmark.py @@ -4,7 +4,7 @@ from typing import List, Tuple import ray -from ray.data.datastream import Dataset +from ray.data.dataset import Dataset from benchmark import Benchmark from 
read_images_benchmark import generate_images @@ -64,7 +64,7 @@ def generate_features(batch): features = {k: v for (k, v) in features.items() if len(v) > 0} return pa.table(features) - ds = ray.data.range(num_rows).map_batches(generate_features) + ds = ray.data.range(num_rows).map_batches(generate_features, batch_format="pandas") tfrecords_dir = tempfile.mkdtemp() ds.write_tfrecords(tfrecords_dir) return tfrecords_dir diff --git a/release/nightly_tests/dataset/shuffle_app_config.yaml b/release/nightly_tests/dataset/shuffle_app_config.yaml index 50791b5bf15b..c0728acd33e8 100644 --- a/release/nightly_tests/dataset/shuffle_app_config.yaml +++ b/release/nightly_tests/dataset/shuffle_app_config.yaml @@ -3,7 +3,6 @@ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightl python: pip_packages: - boto3 - - pyarrow<7.0.0 conda_packages: [] post_build_cmds: diff --git a/release/nightly_tests/dataset/sort.py b/release/nightly_tests/dataset/sort.py index 9dccf85bc2a4..658725b85c57 100644 --- a/release/nightly_tests/dataset/sort.py +++ b/release/nightly_tests/dataset/sort.py @@ -10,14 +10,13 @@ import ray from ray._private.internal_api import memory_summary -from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.util import _check_pyarrow_version from ray.data.block import Block, BlockMetadata from ray.data.context import DataContext from ray.data.datasource import Datasource, ReadTask -class RandomIntRowDatasource(Datasource[ArrowRow]): +class RandomIntRowDatasource(Datasource): """An example datasource that generates rows with random int64 columns. 
Examples: diff --git a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml index 7408ca8f065a..f9c1742fcbda 100644 --- a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml +++ b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances_gce.yaml @@ -13,11 +13,11 @@ gcp_advanced_configurations_json: head_node_type: name: head_node - instance_type: e2-standard-16 # aws m5.4xlarge + instance_type: n2-standard-16 # aws m5.4xlarge worker_node_types: - name: worker_node - instance_type: e2-standard-16 # aws m5.4xlarge + instance_type: n2-standard-16 # aws m5.4xlarge min_workers: 19 max_workers: 19 use_spot: false diff --git a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml index 4257f215b5d8..38091a3f12b6 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml @@ -10,11 +10,11 @@ aws: head_node_type: name: head_node - instance_type: i3.4xlarge + instance_type: m6i.4xlarge worker_node_types: - name: worker_node - instance_type: i3.4xlarge + instance_type: m6i.4xlarge min_workers: 0 max_workers: 19 use_spot: false diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml index 38af0ffe31c2..a726988aeda0 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml @@ -12,12 +12,12 @@ aws: head_node_type: name: head_node - instance_type: i3.4xlarge + instance_type: m6i.4xlarge resources: {"object_store_memory": 21474836480} worker_node_types: - name: worker_node2 - instance_type: i3.4xlarge + instance_type: m6i.4xlarge min_workers: 3 max_workers: 3 use_spot: false diff --git 
a/release/nightly_tests/shuffle/shuffle_compute_single.yaml b/release/nightly_tests/shuffle/shuffle_compute_single.yaml index 7ec607996cd8..df8d84edc81f 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_single.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_single.yaml @@ -12,7 +12,7 @@ aws: head_node_type: name: head_node - instance_type: i3.4xlarge + instance_type: m6i.4xlarge resources: {"object_store_memory": 21474836480} worker_node_types: [] diff --git a/release/nightly_tests/stress_tests/test_state_api_scale.py b/release/nightly_tests/stress_tests/test_state_api_scale.py index 9972fae1f6d1..b50c008c8611 100644 --- a/release/nightly_tests/stress_tests/test_state_api_scale.py +++ b/release/nightly_tests/stress_tests/test_state_api_scale.py @@ -1,7 +1,6 @@ import click import json import ray -from ray._private.ray_constants import LOG_PREFIX_ACTOR_NAME, LOG_PREFIX_JOB_ID from ray._private.state_api_test_utils import ( STATE_LIST_LIMIT, StateAPIMetric, @@ -16,7 +15,7 @@ import time import os -from ray.experimental.state.api import ( +from ray.util.state import ( get_log, list_actors, list_objects, @@ -226,7 +225,7 @@ def _split(a, n): list_objects, filters=[ ("reference_type", "=", "LOCAL_REFERENCE"), - ("type", "=", "Worker"), + ("type", "=", "WORKER"), ], key_suffix=f"{num_objects}", limit=STATE_LIST_LIMIT, @@ -251,9 +250,6 @@ def test_large_log_file(log_file_size_byte: int): class LogActor: def write_log(self, log_file_size_byte: int): ctx = hashlib.md5() - job_id = ray.get_runtime_context().get_job_id() - prefix = f"{LOG_PREFIX_JOB_ID}{job_id}\n{LOG_PREFIX_ACTOR_NAME}LogActor\n" - ctx.update(prefix.encode()) while log_file_size_byte > 0: n = min(log_file_size_byte, 4 * MiB) chunk = "".join(random.choices(string.ascii_letters, k=n)) @@ -262,12 +258,12 @@ def write_log(self, log_file_size_byte: int): log_file_size_byte -= n sys.stdout.flush() - return ctx.hexdigest(), ray.get_runtime_context().node_id.hex() + return ctx.hexdigest(), 
ray.get_runtime_context().get_node_id() actor = LogActor.remote() - expected_hash, node_id = ray.get( - actor.write_log.remote(log_file_size_byte=log_file_size_byte) - ) + + task = actor.write_log.remote(log_file_size_byte=log_file_size_byte) + expected_hash, node_id = ray.get(task) assert expected_hash is not None, "Empty checksum from the log actor" assert node_id is not None, "Empty node id from the log actor" @@ -276,7 +272,7 @@ def write_log(self, log_file_size_byte: int): time_taken = 0 t_start = time.perf_counter() - for s in get_log(actor_id=actor._actor_id.hex(), tail=-1): + for s in get_log(task_id=task.task_id().hex(), tail=1000000000): t_end = time.perf_counter() time_taken += t_end - t_start # Not including this time diff --git a/release/nightly_tests/stress_tests/test_state_api_with_other_tests.py b/release/nightly_tests/stress_tests/test_state_api_with_other_tests.py index be8c396a7df3..b5367d7712f8 100644 --- a/release/nightly_tests/stress_tests/test_state_api_with_other_tests.py +++ b/release/nightly_tests/stress_tests/test_state_api_with_other_tests.py @@ -6,7 +6,7 @@ import ray -from ray.experimental.state.api import ( +from ray.util.state import ( list_actors, list_nodes, list_objects, @@ -85,7 +85,6 @@ def run_release_test_in_subprocess(test_file: str, args: List[str]) -> bool: def run_test(test_name: str, test_args: List[str]): - monitor_actor = test_utils.monitor_memory_usage() start = time.perf_counter() @@ -110,7 +109,6 @@ def run_test_with_state_api( call_interval_s: int = 3, print_interval_s: int = 15, ) -> Dict: - start_time = time.perf_counter() # Stage 1: Run with state APIs @@ -175,7 +173,6 @@ def test( test_args, call_interval_s, ): - # Set up state API calling methods def not_none(res): return res is not None diff --git a/release/ray_release/anyscale_util.py b/release/ray_release/anyscale_util.py index 6552dae281da..6ef84cbea060 100644 --- a/release/ray_release/anyscale_util.py +++ b/release/ray_release/anyscale_util.py @@ -7,7 +7,7 
@@ from anyscale.sdk.anyscale_client.sdk import AnyscaleSDK -LAST_LOGS_LENGTH = 10 +LAST_LOGS_LENGTH = 30 def find_cloud_by_name( diff --git a/release/ray_release/bazel.py b/release/ray_release/bazel.py new file mode 100644 index 000000000000..ba68ba0c82e1 --- /dev/null +++ b/release/ray_release/bazel.py @@ -0,0 +1,22 @@ +import os + +import runfiles + +REPO_NAME = "com_github_ray_project_ray" +_LEGACY_REPO_ROOT = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../.."), +) + +the_runfiles = runfiles.Create() + + +def _norm_path_join(*args): + return os.path.normpath(os.path.join(*args)) + + +def bazel_runfile(*args): + """Return the path to a runfile in the release directory.""" + p = _norm_path_join(*args) + if the_runfiles: + return the_runfiles.Rlocation(os.path.join(REPO_NAME, p)) + return os.path.join(_LEGACY_REPO_ROOT, p) diff --git a/release/ray_release/buildkite/concurrency.py b/release/ray_release/buildkite/concurrency.py index e414ec344d60..1e201f9628dd 100644 --- a/release/ray_release/buildkite/concurrency.py +++ b/release/ray_release/buildkite/concurrency.py @@ -1,9 +1,9 @@ import csv -import os from collections import namedtuple from typing import Tuple, Optional, Dict -from ray_release.config import Test, RELEASE_PACKAGE_DIR +from ray_release.bazel import bazel_runfile +from ray_release.config import Test from ray_release.template import load_test_cluster_compute from ray_release.logger import logger @@ -31,10 +31,10 @@ ] gce_gpu_cpu_to_concurrent_groups = [ - Condition(min_gpu=8, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=1), - Condition(min_gpu=4, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=1), - Condition(min_gpu=2, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=3), - Condition(min_gpu=1, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=4), + Condition(min_gpu=8, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=4), + Condition(min_gpu=4, max_gpu=-1, min_cpu=0, max_cpu=-1, 
group="gpu-gce", limit=8), + Condition(min_gpu=2, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=16), + Condition(min_gpu=1, max_gpu=-1, min_cpu=0, max_cpu=-1, group="gpu-gce", limit=32), Condition( min_gpu=0, max_gpu=0, min_cpu=1025, max_cpu=-1, group="enormous-gce", limit=1 ), @@ -61,14 +61,16 @@ "n1-standard-16-nvidia-tesla-t4-1": (16, 1), "n1-standard-64-nvidia-tesla-t4-4": (64, 4), "n1-standard-32-nvidia-tesla-t4-2": (32, 2), + "n1-highmem-64-nvidia-tesla-v100-8": {64, 8}, "n1-highmem-96-nvidia-tesla-v100-8": {96, 8}, } def load_instance_types(path: Optional[str] = None) -> Dict[str, Tuple[int, int]]: - path = path or os.path.join( - RELEASE_PACKAGE_DIR, "ray_release", "buildkite", "aws_instance_types.csv" - ) + if not path: + path = bazel_runfile( + "release/ray_release/buildkite/aws_instance_types.csv", + ) instance_to_resources = {} with open(path, "rt") as fp: diff --git a/release/ray_release/buildkite/filter.py b/release/ray_release/buildkite/filter.py index 73af1b841252..4307068615ec 100644 --- a/release/ray_release/buildkite/filter.py +++ b/release/ray_release/buildkite/filter.py @@ -21,6 +21,7 @@ def filter_tests( frequency: Frequency, test_attr_regex_filters: Optional[Dict[str, str]] = None, prefer_smoke_tests: bool = False, + run_jailed_tests: bool = False, ) -> List[Tuple[Test, bool]]: if test_attr_regex_filters is None: test_attr_regex_filters = {} @@ -35,6 +36,8 @@ def filter_tests( break if attr_mismatch: continue + if not run_jailed_tests and test.get("jailed", False): + continue test_frequency = get_frequency(test["frequency"]) diff --git a/release/ray_release/buildkite/step.py b/release/ray_release/buildkite/step.py index a13bde1575d8..7bd2f0a089dd 100644 --- a/release/ray_release/buildkite/step.py +++ b/release/ray_release/buildkite/step.py @@ -121,13 +121,16 @@ def get_step( if test.get("run", {}).get("type") == "client": step["agents"]["queue"] = str(RELEASE_QUEUE_CLIENT) - # If a test is not stable, allow to soft fail + # If a 
test is jailed or not stable, allow to soft fail stable = test.get("stable", True) - if not stable: + jailed = test.get("jailed", False) + full_label = "" + if jailed or not stable: step["soft_fail"] = True - full_label = "[unstable] " - else: - full_label = "" + if not stable: + full_label += "[unstable]" + if jailed: + full_label += "[jailed]" full_label += test["name"] if smoke_test: diff --git a/release/ray_release/command_runner/_anyscale_job_wrapper.py b/release/ray_release/command_runner/_anyscale_job_wrapper.py index f99bc918914f..6249ac3c8c75 100644 --- a/release/ray_release/command_runner/_anyscale_job_wrapper.py +++ b/release/ray_release/command_runner/_anyscale_job_wrapper.py @@ -65,13 +65,12 @@ def run_storage_cp(source: str, target: str): return False if not Path(source).exists(): - logger.error(f"Couldn't upload to cloud storage: '{source}' does not exist.") + logger.warning(f"Couldn't upload to cloud storage: '{source}' does not exist.") return False storage_service = urlparse(target).scheme cp_cmd_args = [] if storage_service == "s3": - install_pip("awscli") cp_cmd_args = [ "aws", "s3", diff --git a/release/ray_release/command_runner/sdk_runner.py b/release/ray_release/command_runner/sdk_runner.py deleted file mode 100644 index 7bc088595308..000000000000 --- a/release/ray_release/command_runner/sdk_runner.py +++ /dev/null @@ -1,206 +0,0 @@ -import json -import os -import tempfile -import time -from typing import TYPE_CHECKING, Any, Dict, Optional - -from ray_release.anyscale_util import LAST_LOGS_LENGTH -from ray_release.cluster_manager.cluster_manager import ClusterManager -from ray_release.command_runner.command_runner import CommandRunner -from ray_release.exception import ( - ClusterNodesWaitTimeout, - CommandError, - CommandTimeout, - LogsError, - RemoteEnvSetupError, - FetchResultError, -) -from ray_release.file_manager.file_manager import FileManager -from ray_release.logger import logger -from ray_release.util import ( - 
exponential_backoff_retry, - format_link, - get_anyscale_sdk, - ANYSCALE_HOST, -) - -if TYPE_CHECKING: - from anyscale.sdk.anyscale_client.sdk import AnyscaleSDK - - -class SDKRunner(CommandRunner): - def __init__( - self, - cluster_manager: ClusterManager, - file_manager: FileManager, - working_dir: str, - sdk: Optional["AnyscaleSDK"] = None, - artifact_path: Optional[str] = None, - ): - super(SDKRunner, self).__init__( - cluster_manager=cluster_manager, - file_manager=file_manager, - working_dir=working_dir, - ) - self.sdk = sdk or get_anyscale_sdk() - - self.last_command_scd_id = None - - def prepare_local_env(self, ray_wheels_url: Optional[str] = None): - pass - - def prepare_remote_env(self): - # Copy wait script to working dir - wait_script = os.path.join(os.path.dirname(__file__), "_wait_cluster.py") - # Copy wait script to working dir - if os.path.exists("wait_cluster.py"): - os.unlink("wait_cluster.py") - os.link(wait_script, "wait_cluster.py") - - # Copy prometheus metrics script to working dir - metrics_script = os.path.join( - os.path.dirname(__file__), "_prometheus_metrics.py" - ) - # Copy wait script to working dir - if os.path.exists("prometheus_metrics.py"): - os.unlink("prometheus_metrics.py") - os.link(metrics_script, "prometheus_metrics.py") - - try: - self.file_manager.upload() - except Exception as e: - raise RemoteEnvSetupError( - f"Error setting up remote environment: {e}" - ) from e - - def wait_for_nodes(self, num_nodes: int, timeout: float = 900): - # Wait script should be uploaded already. Kick off command - try: - # Give 30 seconds more to acount for communication - self.run_prepare_command( - f"python wait_cluster.py {num_nodes} {timeout}", timeout=timeout + 30 - ) - except (CommandError, CommandTimeout) as e: - raise ClusterNodesWaitTimeout( - f"Not all {num_nodes} nodes came up within {timeout} seconds." 
- ) from e - - def save_metrics(self, start_time: float, timeout: float = 900): - self.run_prepare_command( - f"python prometheus_metrics.py {start_time}", timeout=timeout - ) - - def run_command( - self, - command: str, - env: Optional[Dict] = None, - timeout: float = 3600.0, - raise_on_timeout: bool = True, - ) -> float: - full_env = self.get_full_command_env(env) - - if full_env: - env_str = " ".join(f"{k}={v}" for k, v in full_env.items()) + " " - else: - env_str = "" - - full_command = f"{env_str}{command}" - logger.info( - f"Running command in cluster {self.cluster_manager.cluster_name}: " - f"{full_command}" - ) - - logger.info( - f"Link to cluster: " - f"{format_link(self.cluster_manager.get_cluster_url())}" - ) - - result = self.sdk.create_session_command( - dict(session_id=self.cluster_manager.cluster_id, shell_command=full_command) - ) - - scd_id = result.result.id - self.last_command_scd_id = scd_id - - completed = result.result.finished_at is not None - - start_time = time.monotonic() - timeout_at = start_time + timeout - next_status = start_time + 30 - - while not completed: - now = time.monotonic() - if now >= timeout_at: - raise CommandTimeout( - f"Cluster command timed out after {timeout} seconds." - ) - - if now >= next_status: - logger.info( - f"... command still running ..." - f"({int(now - start_time)} seconds) ..." - ) - next_status += 30 - - # Sleep 1 sec before next check. 
- time.sleep(1) - - result = exponential_backoff_retry( - lambda: self.sdk.get_session_command(session_command_id=scd_id), - retry_exceptions=Exception, - initial_retry_delay_s=10, - max_retries=3, - ) - completed = result.result.finished_at - - status_code = result.result.status_code - time_taken = time.monotonic() - start_time - - if status_code != 0: - raise CommandError(f"Command returned non-success status: {status_code}") - - return time_taken - - def get_last_logs_ex(self, scd_id: Optional[str] = None): - scd_id = scd_id or self.last_command_scd_id - if not scd_id: - raise LogsError( - "Must specify scd_id to fetch command logs. Did " - "you already kick off a command?" - ) - - # Todo: It would be nice to get an actual SDK API here - result, _, _ = self.sdk.api_client.call_api( - "/api/v2/session_commands/{session_command_id}/execution_logs", - "GET", - path_params={"session_command_id": scd_id}, - query_params={"start_line": -LAST_LOGS_LENGTH, "end_line": 0}, - header_params={}, - response_type=object, - _host=str(ANYSCALE_HOST), - _preload_content=True, - _return_http_data_only=False, - ) - return result["result"]["lines"] - - def _fetch_json(self, path: str) -> Dict[str, Any]: - try: - tmpfile = tempfile.mktemp() - self.file_manager.download(path, tmpfile) - - with open(tmpfile, "rt") as f: - data = json.load(f) - - os.unlink(tmpfile) - return data - except Exception as e: - raise FetchResultError(f"Could not fetch results from session: {e}") from e - - def fetch_results(self) -> Dict[str, Any]: - return self._fetch_json(self._RESULT_OUTPUT_JSON) - - def fetch_metrics(self) -> Dict[str, Any]: - return self._fetch_json(self._METRICS_OUTPUT_JSON) - - def fetch_artifact(self): - raise NotImplementedError diff --git a/release/ray_release/config.py b/release/ray_release/config.py index 47266b02f970..cc72baf69e8a 100644 --- a/release/ray_release/config.py +++ b/release/ray_release/config.py @@ -7,6 +7,7 @@ import jsonschema import yaml from 
ray_release.anyscale_util import find_cloud_by_name +from ray_release.bazel import bazel_runfile from ray_release.exception import ReleaseTestCLIError, ReleaseTestConfigError from ray_release.logger import logger from ray_release.util import DeferredEnvVar, deep_update @@ -50,9 +51,7 @@ class TestDefinition(dict): RELEASE_PACKAGE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -RELEASE_TEST_SCHEMA_FILE = os.path.join( - RELEASE_PACKAGE_DIR, "ray_release", "schema.json" -) +RELEASE_TEST_SCHEMA_FILE = bazel_runfile("release/ray_release/schema.json") def read_and_validate_release_test_collection( @@ -98,7 +97,7 @@ def parse_test_definition(test_definitions: List[TestDefinition]) -> List[Test]: ) test = copy.deepcopy(test_definition) test["name"] = f'{test["name"]}.{variation.pop("__suffix__")}' - test.update(variation) + test = deep_update(test, variation) tests.append(test) return tests @@ -135,6 +134,13 @@ def validate_release_test_collection( ) num_errors += 1 + error = validate_test_cluster_env(test) + if error: + logger.error( + f"Failed to validate test {test.get('name', '(unnamed)')}: {error}" + ) + num_errors += 1 + if num_errors > 0: raise ReleaseTestConfigError( f"Release test configuration error: Found {num_errors} test " @@ -180,6 +186,19 @@ def validate_cluster_compute(cluster_compute: Dict[str, Any]) -> Optional[str]: return None +def validate_test_cluster_env(test: Test) -> Optional[str]: + from ray_release.template import get_cluster_env_path + + cluster_env_path = get_cluster_env_path(test) + + if not os.path.exists(cluster_env_path): + raise ReleaseTestConfigError( + f"Cannot load yaml template from {cluster_env_path}: Path not found." 
+ ) + + return None + + def validate_aws_config(aws_config: Dict[str, Any]) -> Optional[str]: for block_device_mapping in aws_config.get("BlockDeviceMappings", []): ebs = block_device_mapping.get("Ebs") diff --git a/release/ray_release/env.py b/release/ray_release/env.py index 98e3f329e154..475db22648e0 100644 --- a/release/ray_release/env.py +++ b/release/ray_release/env.py @@ -1,16 +1,16 @@ import os from typing import Dict +from ray_release.bazel import bazel_runfile from ray_release.exception import ReleaseTestConfigError -DEFAULT_ENVIRONMENT = "staging_v2" +DEFAULT_ENVIRONMENT = "aws" def load_environment(environment_name: str) -> Dict[str, str]: - this_dir = os.path.dirname(__file__) - env_file = os.path.join(this_dir, "environments", f"{environment_name}.env") - - if not os.path.exists(env_file): + file_base = f"{environment_name}.env" + env_file = bazel_runfile("release/ray_release/environments", file_base) + if not env_file or not os.path.isfile(env_file): raise ReleaseTestConfigError( f"Unknown environment with name: {environment_name}" ) diff --git a/release/ray_release/environments/staging_v2.env b/release/ray_release/environments/aws.env similarity index 100% rename from release/ray_release/environments/staging_v2.env rename to release/ray_release/environments/aws.env diff --git a/release/ray_release/environments/prod_v1.env b/release/ray_release/environments/prod_v1.env deleted file mode 100644 index 5548d11add98..000000000000 --- a/release/ray_release/environments/prod_v1.env +++ /dev/null @@ -1,5 +0,0 @@ -ANYSCALE_HOST=https://console.anyscale.com -RELEASE_AWS_ANYSCALE_SECRET_ARN="arn:aws:secretsmanager:us-west-2:029272617770:secret:release-automation/anyscale-token20210505220406333800000001-BcUuKB" -RELEASE_DEFAULT_CLOUD_ID="cld_4F7k8814aZzGG8TNUGPKnc" -RELEASE_DEFAULT_PROJECT="prj_FKRmeV5pA6X72aVscFALNC32" -ANYSCALE_PROJECT="prj_FKRmeV5pA6X72aVscFALNC32" \ No newline at end of file diff --git a/release/ray_release/environments/staging_v1.env 
b/release/ray_release/environments/staging_v1.env deleted file mode 100644 index 111c6ffeede2..000000000000 --- a/release/ray_release/environments/staging_v1.env +++ /dev/null @@ -1,5 +0,0 @@ -ANYSCALE_HOST=https://console.anyscale-staging.com -RELEASE_AWS_ANYSCALE_SECRET_ARN="arn:aws:secretsmanager:us-west-2:029272617770:secret:release-automation/anyscale-staging-token20221014164754935800000001-pfQunc" -RELEASE_DEFAULT_CLOUD_ID="cld_401TPoxgB8MM6A0NNQauOV" -RELEASE_DEFAULT_PROJECT="prj_qC3ZfndQWYYjx2cz8KWGNUL4" -ANYSCALE_PROJECT="prj_qC3ZfndQWYYjx2cz8KWGNUL4" \ No newline at end of file diff --git a/release/ray_release/file_manager/session_controller.py b/release/ray_release/file_manager/session_controller.py deleted file mode 100644 index 5aab4d5807fc..000000000000 --- a/release/ray_release/file_manager/session_controller.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -from typing import TYPE_CHECKING, Optional - -from ray_release.cluster_manager.cluster_manager import ClusterManager -from ray_release.file_manager.file_manager import FileManager -from ray_release.logger import logger - -if TYPE_CHECKING: - from anyscale.controllers.session_controller import SessionController - - -class SessionControllerFileManager(FileManager): - def __init__( - self, - cluster_manager: ClusterManager, - session_controller: Optional["SessionController"] = None, - ): - from anyscale.controllers.session_controller import SessionController - - super(SessionControllerFileManager, self).__init__(cluster_manager) - self.session_controller = session_controller or SessionController() - - # Write legacy anyscale project yaml - with open(os.path.join(os.getcwd(), ".anyscale.yaml"), "wt") as f: - f.write(f"project_id: {self.cluster_manager.project_id}") - - def upload(self, source: Optional[str] = None, target: Optional[str] = None): - logger.info( - f"Uploading {source or ''} to {target or ''} " - f"using SessionController" - ) - - if source and os.path.isdir(source) and target: - # Add 
trailing slashes - source = os.path.join(source, "") - target = os.path.join(target, "") - - self.session_controller.push( - session_name=self.cluster_manager.cluster_name, - source=source, - target=target, - config=None, - all_nodes=False, - ) - - def download(self, source: str, target: str): - logger.info( - f"Downloading {source or ''} to {target or ''} " - f"using SessionController" - ) - self.session_controller.pull( - session_name=self.cluster_manager.cluster_name, - source=source, - target=target, - config=None, - ) diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py index 46ef68f45ce5..428a01671d6b 100644 --- a/release/ray_release/glue.py +++ b/release/ray_release/glue.py @@ -12,7 +12,6 @@ from ray_release.command_runner.job_runner import JobRunner from ray_release.command_runner.command_runner import CommandRunner from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner -from ray_release.command_runner.sdk_runner import SDKRunner from ray_release.config import ( Test, DEFAULT_BUILD_TIMEOUT, @@ -21,7 +20,6 @@ DEFAULT_WAIT_FOR_NODES_TIMEOUT, RELEASE_PACKAGE_DIR, DEFAULT_AUTOSUSPEND_MINS, - validate_test, ) from ray_release.template import load_test_cluster_env, load_test_cluster_compute from ray_release.exception import ( @@ -33,11 +31,9 @@ PrepareCommandTimeout, TestCommandError, TestCommandTimeout, - LocalEnvSetupError, ClusterEnvCreateError, ) from ray_release.file_manager.job_file_manager import JobFileManager -from ray_release.file_manager.session_controller import SessionControllerFileManager from ray_release.logger import logger from ray_release.reporter.reporter import Reporter from ray_release.result import Result, handle_exception @@ -46,37 +42,17 @@ reset_signal_handling, register_handler, ) -from ray_release.util import ( - run_bash_script, - get_pip_packages, - reinstall_anyscale_dependencies, -) type_str_to_command_runner = { - "command": SDKRunner, - "sdk_command": SDKRunner, + "job": JobRunner, 
"anyscale_job": AnyscaleJobRunner, } command_runner_to_cluster_manager = { - SDKRunner: FullClusterManager, JobRunner: FullClusterManager, AnyscaleJobRunner: MinimalClusterManager, } -file_manager_str_to_file_manager = { - "sdk": SessionControllerFileManager, - "job": JobFileManager, - "anyscale_job": JobFileManager, -} - -command_runner_to_file_manager = { - SDKRunner: JobFileManager, # Use job file manager per default - JobRunner: JobFileManager, - AnyscaleJobRunner: JobFileManager, -} - - DEFAULT_RUN_TYPE = "anyscale_job" TIMEOUT_BUFFER_MINUTES = 15 @@ -100,7 +76,6 @@ def _load_test_configuration( smoke_test: bool = False, no_terminate: bool = False, ) -> Tuple[ClusterManager, CommandRunner, str]: - validate_test(test) logger.info(f"Test config: {test}") # Populate result paramaters @@ -139,20 +114,7 @@ def _load_test_configuration( ) cluster_manager_cls = command_runner_to_cluster_manager[command_runner_cls] - - file_manager_str = test["run"].get("file_manager", None) - if file_manager_str: - if file_manager_str not in file_manager_str_to_file_manager: - raise ReleaseTestConfigError( - f"Unknown file manager: {file_manager_str}. 
Must be one of " - f"{list(file_manager_str_to_file_manager.keys())}" - ) - file_manager_cls = file_manager_str_to_file_manager[file_manager_str] - else: - file_manager_cls = command_runner_to_file_manager[command_runner_cls] - logger.info(f"Got command runner cls: {command_runner_cls}") - logger.info(f"Got file manager cls: {file_manager_cls}") # Extra tags to be set on resources on cloud provider's side extra_tags = _get_extra_tags_from_env() # We don't need other attributes as they can be derived from the name @@ -169,9 +131,11 @@ def _load_test_configuration( anyscale_project, smoke_test=smoke_test, ) - file_manager = file_manager_cls(cluster_manager=cluster_manager) command_runner = command_runner_cls( - cluster_manager, file_manager, working_dir, artifact_path=artifact_path + cluster_manager, + JobFileManager(cluster_manager=cluster_manager), + working_dir, + artifact_path=artifact_path, ) except Exception as e: raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e @@ -263,26 +227,6 @@ def _setup_cluster_environment( return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout -def _setup_local_environment( - test: Test, - command_runner: CommandRunner, - ray_wheels_url: str, -) -> None: - driver_setup_script = test.get("driver_setup", None) - if driver_setup_script: - try: - run_bash_script(driver_setup_script) - except Exception as e: - raise LocalEnvSetupError(f"Driver setup script failed: {e}") from e - - # Install local dependencies - command_runner.prepare_local_env(ray_wheels_url) - - # Re-install anyscale package as local dependencies might have changed - # from local env setup - reinstall_anyscale_dependencies() - - def _local_environment_information( result: Result, cluster_manager: ClusterManager, @@ -293,10 +237,6 @@ def _local_environment_information( cluster_id: Optional[str], cluster_env_id: Optional[str], ) -> None: - pip_packages = get_pip_packages() - pip_package_string = "\n".join(pip_packages) - 
logger.info(f"Installed python packages:\n{pip_package_string}") - if isinstance(cluster_manager, FullClusterManager): if not no_terminate: register_handler( @@ -484,10 +424,6 @@ def run_release_test( cluster_env_id, ) - buildkite_group(":nut_and_bolt: Setting up local environment") - _setup_local_environment(test, command_runner, ray_wheels_url) - - # Print installed pip packages buildkite_group(":bulb: Local environment information") _local_environment_information( result, diff --git a/release/ray_release/job_manager/anyscale_job_manager.py b/release/ray_release/job_manager/anyscale_job_manager.py index 61561a596d47..16e87bac4bfd 100644 --- a/release/ray_release/job_manager/anyscale_job_manager.py +++ b/release/ray_release/job_manager/anyscale_job_manager.py @@ -1,7 +1,9 @@ -import io import os import time -from contextlib import redirect_stdout, redirect_stderr, contextmanager +import subprocess +import tempfile +from collections import deque +from contextlib import contextmanager from typing import Any, Dict, Optional, Tuple @@ -9,8 +11,6 @@ CreateProductionJob, HaJobStates, ) -from anyscale.controllers.job_controller import JobController, terminal_state - from ray_release.anyscale_util import LAST_LOGS_LENGTH, get_cluster_name from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.exception import ( @@ -22,6 +22,7 @@ from ray_release.signal_handling import register_handler, unregister_handler from ray_release.util import ( ANYSCALE_HOST, + ERROR_LOG_PATTERNS, exponential_backoff_retry, anyscale_job_url, format_link, @@ -33,6 +34,7 @@ HaJobStates.BROKEN: -2, HaJobStates.TERMINATED: -3, } +terminal_state = set(job_status_to_return_code.keys()) class AnyscaleJobManager: @@ -105,7 +107,7 @@ def last_job_result(self): def last_job_result(self, value): cluster_id = value.state.cluster_id # Set this only once. 
- if self._last_job_result is None and cluster_id: + if self.cluster_manager.cluster_id is None and cluster_id: self.cluster_manager.cluster_id = value.state.cluster_id self.cluster_manager.cluster_name = get_cluster_name( value.state.cluster_id, self.sdk @@ -259,6 +261,63 @@ def run_and_wait( ) return self._wait_job(timeout) + def _get_ray_logs(self) -> Tuple[Optional[str], Optional[str]]: + """ + Obtain any ray logs that contain keywords that indicate a crash, such as + ERROR or Traceback + """ + tmpdir = tempfile.mktemp() + try: + subprocess.check_output( + [ + "anyscale", + "logs", + "cluster", + "--id", + self.cluster_manager.cluster_id, + "--head-only", + "--download", + "--download-dir", + tmpdir, + ] + ) + except Exception as e: + logger.log(f"Failed to download logs from anyscale {e}") + return None + return AnyscaleJobManager._find_job_driver_and_ray_error_logs(tmpdir) + + @staticmethod + def _find_job_driver_and_ray_error_logs( + tmpdir: str, + ) -> Tuple[Optional[str], Optional[str]]: + # Ignored some ray files that do not crash ray despite having exceptions + ignored_ray_files = [ + "monitor.log", + "event_AUTOSCALER.log", + "event_JOBS.log", + ] + error_output = None + job_driver_output = None + matched_pattern_count = 0 + for root, _, files in os.walk(tmpdir): + for file in files: + if file in ignored_ray_files: + continue + with open(os.path.join(root, file)) as lines: + output = "".join(deque(lines, maxlen=3 * LAST_LOGS_LENGTH)) + # job-driver logs + if file.startswith("job-driver-"): + job_driver_output = output + continue + # ray error logs, favor those that match with the most number of + # error patterns + if ( + len([error for error in ERROR_LOG_PATTERNS if error in output]) + > matched_pattern_count + ): + error_output = output + return job_driver_output, error_output + def get_last_logs(self): if not self.job_id: raise RuntimeError( @@ -268,20 +327,13 @@ def get_last_logs(self): if self._last_logs: return self._last_logs - # TODO: replace 
with an actual API call. def _get_logs(): - buf = io.StringIO() - with open(os.devnull, "w") as devnull: - with redirect_stdout(buf), redirect_stderr(devnull): - job_controller = JobController() - job_controller.logs( - job_id=self.job_id, - should_follow=False, - ) - print("", flush=True) - output = buf.getvalue().strip() - assert "### Starting ###" in output, "No logs fetched" - return "\n".join(output.splitlines()[-LAST_LOGS_LENGTH * 3 :]) + job_driver_log, ray_error_log = self._get_ray_logs() + assert job_driver_log or ray_error_log, "No logs fetched" + if job_driver_log: + return job_driver_log + else: + return ray_error_log ret = exponential_backoff_retry( _get_logs, diff --git a/release/ray_release/log_aggregator.py b/release/ray_release/log_aggregator.py new file mode 100644 index 000000000000..28bc5e08a322 --- /dev/null +++ b/release/ray_release/log_aggregator.py @@ -0,0 +1,103 @@ +import re +from typing import List + +TRACEBACK_PATTERN = "Traceback (most recent call last)" + + +class LogAggregator: + def __init__(self, log: str): + self.log = log + + def compute_crash_pattern(self) -> str: + stack_trace = LogAggregator._compute_stack_trace(self.log.splitlines()) + # truncate short enough to store in databases, but long enough to keep the + # pattern unique + return LogAggregator._compute_signature(stack_trace)[:4000] + + @staticmethod + def _compute_signature(stack_trace: List[str]) -> str: + """ + Compute signature pattern from stack trace, by remove factors such as date, + time, temp directory, line numbers, etc. 
This help to aggregate similar logs + into same bug patterns + """ + massaged_trace = [] + for line in stack_trace: + # remove any hashes that are more than 10 characters + line = re.sub(r"[a-z0-9]{10,}", "", line.strip()) + # remove any numbers + line = re.sub(r"\d", "", line) + if line == "Traceback (most recent call last):": + continue + file_line = re.search(r'File "(.*)", (.*)', line) + if file_line: + # append the file's base name and caller information; the result string + # is not something meaningful to human, we just need something that + # uniquely represent the stack trace + line = f'{file_line.group(1).split("/")[-1]}{file_line.group(2)}' + massaged_trace.append(line) + return "".join(massaged_trace) + + @staticmethod + def _compute_stack_trace(logs: List[str]) -> List[str]: + """ + Extract stack trace pattern from the logs. Stack trace pattern often matches + the following: + ERROR ... + Traceback (most recent call last): + File "...", line ..., in ... + ... + Exception: exception error + """ + error_stacktrace = [] + stacktrace = [] + i = 0 + while i < len(logs): + stack = [] + trace = error_stacktrace + # Search for lines that are either + # ... ERROR ... + # or + # ... ERROR ... + # Traceback (most recent call last): + if "ERROR" in logs[i]: + stack.append(logs[i]) + next = i + 1 + if i + 1 < len(logs) and TRACEBACK_PATTERN in logs[i + 1]: + stack.append(logs[i + 1]) + next = i + 2 + # Or if the line with ERROR does not exist, just search for the line with + # Traceback (most recent call last): + elif TRACEBACK_PATTERN in logs[i]: + stack.append(logs[i]) + trace = stacktrace + next = i + 1 + # Or else, skip this line and continue + else: + i = i + 1 + continue + # If the line that contains ERROR, Traceback, etc. is found, scan the logs + # until the line no longer has indentation. 
This is because stack trace + # is always indented, and stops when the line is no longer indented + while next < len(logs): + if logs[next].startswith((" ", "\t")): + stack.append(logs[next]) + next = next + 1 + else: + break + # Finished capturing the entire stack trace + if next < len(logs): + stack.append(logs[next]) + if stack: + trace.append(stack) + i = next + 1 + + # Favor stack trace that contains the ERROR keyword + if error_stacktrace: + return error_stacktrace[-1] + + # Otherwise any stack trace is fine + if stacktrace: + return stacktrace[-1] + + return [] diff --git a/release/ray_release/reporter/db.py b/release/ray_release/reporter/db.py index e9295140f921..ec816739a306 100644 --- a/release/ray_release/reporter/db.py +++ b/release/ray_release/reporter/db.py @@ -7,6 +7,7 @@ from ray_release.result import Result from ray_release.config import Test from ray_release.logger import logger +from ray_release.log_aggregator import LogAggregator class DBReporter(Reporter): @@ -40,6 +41,9 @@ def report_result(self, test: Test, result: Result): "return_code": result.return_code, "smoke_test": result.smoke_test, "extra_tags": result.extra_tags or {}, + "crash_pattern": LogAggregator( + result.last_logs or "" + ).compute_crash_pattern(), } logger.debug(f"Result json: {json.dumps(result_json)}") diff --git a/release/ray_release/schema.json b/release/ray_release/schema.json index a0e766d2fb39..4c8caeb75295 100644 --- a/release/ray_release/schema.json +++ b/release/ray_release/schema.json @@ -21,6 +21,9 @@ "stable": { "type": "boolean" }, + "jailed": { + "type": "boolean" + }, "python": { "type": "string", "enum": [ @@ -42,9 +45,6 @@ "team": { "type": "string" }, - "driver_setup": { - "type": "string" - }, "cluster": { "$ref": "#/definitions/Cluster" }, @@ -109,14 +109,6 @@ "anyscale_job" ] }, - "file_manager": { - "type": "string", - "enum": [ - "sdk", - "client", - "job" - ] - }, "wait_for_nodes": { "$ref": "#/definitions/WaitForNodes" }, @@ -172,9 +164,6 @@ "env": 
{ "type": "string" }, - "driver_setup": { - "type": "string" - }, "cluster": { "type": "object" }, diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py index c184e7f3b58e..0913656b5dbb 100644 --- a/release/ray_release/scripts/build_pipeline.py +++ b/release/ray_release/scripts/build_pipeline.py @@ -22,6 +22,7 @@ find_and_wait_for_ray_wheels_url, find_ray_wheels_url, get_buildkite_repo_branch, + parse_commit_from_wheel_url, ) PIPELINE_ARTIFACT_PATH = "/tmp/pipeline_artifacts" @@ -44,7 +45,18 @@ "(for internal use)." ), ) -def main(test_collection_file: Optional[str] = None, no_clone_repo: bool = False): +@click.option( + "--run-jailed-tests", + is_flag=True, + show_default=True, + default=False, + help=("Will run jailed tests."), +) +def main( + test_collection_file: Optional[str] = None, + no_clone_repo: bool = False, + run_jailed_tests: bool = False, +): settings = get_pipeline_settings() repo = settings["ray_test_repo"] @@ -131,6 +143,7 @@ def main(test_collection_file: Optional[str] = None, no_clone_repo: bool = False frequency=frequency, test_attr_regex_filters=test_attr_regex_filters, prefer_smoke_tests=prefer_smoke_tests, + run_jailed_tests=run_jailed_tests, ) logger.info(f"Found {len(filtered_tests)} tests to run.") if len(filtered_tests) == 0: @@ -185,6 +198,9 @@ def main(test_collection_file: Optional[str] = None, no_clone_repo: bool = False else: this_ray_wheels_url = ray_wheels_url + ray_commit = parse_commit_from_wheel_url(this_ray_wheels_url) + if ray_commit: + env.update({"RAY_COMMIT_OF_WHEEL": ray_commit}) step = get_step( test, report=report, diff --git a/release/ray_release/scripts/ray_bisect.py b/release/ray_release/scripts/ray_bisect.py index 58844d291531..9985a8534416 100644 --- a/release/ray_release/scripts/ray_bisect.py +++ b/release/ray_release/scripts/ray_bisect.py @@ -3,12 +3,15 @@ import os import json import time -from typing import List +from typing import Dict, List, Set + from 
ray_release.logger import logger from ray_release.buildkite.step import get_step from ray_release.config import ( read_and_validate_release_test_collection, + parse_python_version, DEFAULT_WHEEL_WAIT_TIMEOUT, + DEFAULT_PYTHON_VERSION, Test, ) from ray_release.wheels import find_and_wait_for_ray_wheels_url @@ -18,64 +21,173 @@ @click.argument("test_name", required=True, type=str) @click.argument("passing_commit", required=True, type=str) @click.argument("failing_commit", required=True, type=str) -def main(test_name: str, passing_commit: str, failing_commit: str) -> None: +@click.option( + "--concurrency", + default=3, + type=int, + help=( + "Maximum number of concurrent test jobs to run. Higher number uses more " + "capacity, but reduce the bisect duration" + ), +) +@click.option( + "--run-per-commit", + default=1, + type=int, + help=( + "The number of time we run test on the same commit, to account for test " + "flakiness. Commit passes only when it passes on all runs" + ), +) +def main( + test_name: str, + passing_commit: str, + failing_commit: str, + concurrency: int = 1, + run_per_commit: int = 1, +) -> None: + if concurrency <= 0: + raise ValueError( + f"Concurrency input need to be a positive number, received: {concurrency}" + ) + test = _get_test(test_name) + pre_sanity_check = _sanity_check( + test, passing_commit, failing_commit, run_per_commit + ) + if not pre_sanity_check: + logger.info( + "Failed pre-saniy check, the test might be flaky or fail due to" + " an external (not a code change) factors" + ) + return commit_lists = _get_commit_lists(passing_commit, failing_commit) - blamed_commit = _bisect(test_name, commit_lists) + blamed_commit = _bisect(test, commit_lists, concurrency, run_per_commit) logger.info(f"Blamed commit found for test {test_name}: {blamed_commit}") -def _bisect(test_name: str, commit_list: List[str]) -> str: - test = _get_test(test_name) - while len(commit_list) > 1: +def _bisect( + test: Test, + commit_list: List[str], + 
concurrency: int, + run_per_commit: int, +) -> str: + while len(commit_list) > 2: logger.info( f"Bisecting between {len(commit_list)} commits: " - f"{commit_list[0]} to {commit_list[-1]}" + f"{commit_list[0]} to {commit_list[-1]} with concurrency {concurrency}" ) - middle_commit_idx = len(commit_list) // 2 - middle_commit = commit_list[middle_commit_idx] - is_passing = _run_test(test, middle_commit) - if is_passing: - commit_list = commit_list[middle_commit_idx + 1 :] - else: - commit_list = commit_list[:middle_commit_idx] + idx_to_commit = {} + for i in range(concurrency): + idx = len(commit_list) * (i + 1) // (concurrency + 1) + # make sure that idx is not at the boundary; this avoids rerun bisect + # on the previously run revision + idx = min(max(idx, 1), len(commit_list) - 2) + idx_to_commit[idx] = commit_list[idx] + outcomes = _run_test(test, set(idx_to_commit.values()), run_per_commit) + passing_idx = 0 + failing_idx = len(commit_list) - 1 + for idx, commit in idx_to_commit.items(): + is_passing = all( + outcome == "passed" for outcome in outcomes[commit].values() + ) + if is_passing and idx > passing_idx: + passing_idx = idx + if not is_passing and idx < failing_idx: + failing_idx = idx + commit_list = commit_list[passing_idx : failing_idx + 1] return commit_list[-1] -def _run_test(test: Test, commit: str) -> bool: - logger.info(f'Running test {test["name"]} on commit {commit}') - _trigger_test_run(test, commit) - return _obtain_test_result(commit) +def _sanity_check( + test: Test, passing_revision: str, failing_revision: str, run_per_commit: int +) -> bool: + """ + Sanity check that the test indeed passes on the passing revision, and fails on the + failing revision + """ + logger.info( + f"Sanity check passing revision: {passing_revision}" + f" and failing revision: {failing_revision}" + ) + outcomes = _run_test(test, [passing_revision, failing_revision], run_per_commit) + if any(map(lambda x: x != "passed", outcomes[passing_revision].values())): + return 
False + return any(map(lambda x: x != "passed", outcomes[failing_revision].values())) + + +def _run_test( + test: Test, commits: Set[str], run_per_commit: int +) -> Dict[str, Dict[int, str]]: + logger.info(f'Running test {test["name"]} on commits {commits}') + for commit in commits: + _trigger_test_run(test, commit, run_per_commit) + return _obtain_test_result(commits, run_per_commit) + +def _trigger_test_run(test: Test, commit: str, run_per_commit: int) -> None: + python_version = DEFAULT_PYTHON_VERSION + if "python" in test: + python_version = parse_python_version(test["python"]) -def _trigger_test_run(test: Test, commit: str) -> None: ray_wheels_url = find_and_wait_for_ray_wheels_url( - commit, - timeout=DEFAULT_WHEEL_WAIT_TIMEOUT, - ) - step = get_step(test, ray_wheels=ray_wheels_url) - step["label"] = f'{test["name"]}:{commit[:6]}' - step["key"] = commit - pipeline = json.dumps({"steps": [step]}) - subprocess.check_output( - f'echo "{pipeline}" | buildkite-agent pipeline upload', - shell=True, + commit, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT, python_version=python_version ) + for run in range(run_per_commit): + step = get_step( + test, + ray_wheels=ray_wheels_url, + env={ + "RAY_COMMIT_OF_WHEEL": commit, + }, + ) + step["label"] = f'{test["name"]}:{commit[:7]}-{run}' + step["key"] = f"{commit}-{run}" + pipeline = subprocess.Popen( + ["echo", json.dumps({"steps": [step]})], stdout=subprocess.PIPE + ) + subprocess.check_output( + ["buildkite-agent", "pipeline", "upload"], stdin=pipeline.stdout + ) + pipeline.stdout.close() -def _obtain_test_result(buildkite_step_key: str) -> bool: - outcome = None - wait = 30 +def _obtain_test_result( + commits: Set[str], run_per_commit: int +) -> Dict[str, Dict[int, str]]: + outcomes = {} + wait = 5 total_wait = 0 - while outcome not in ["passed", "hard_failed", "soft_failed"]: + while True: logger.info(f"... 
waiting for test result ...({total_wait} seconds)") - outcome = subprocess.check_output( - f'buildkite-agent step get "outcome" --step "{buildkite_step_key}"', - shell=True, - ).decode("utf-8") + for commit in commits: + if commit in outcomes and len(outcomes[commit]) == run_per_commit: + continue + for run in range(run_per_commit): + outcome = subprocess.check_output( + [ + "buildkite-agent", + "step", + "get", + "outcome", + "--step", + f"{commit}-{run}", + ] + ).decode("utf-8") + if not outcome: + continue + if commit not in outcomes: + outcomes[commit] = {} + outcomes[commit][run] = outcome + all_commit_finished = len(outcomes) == len(commits) + per_commit_finished = all( + len(outcome) == run_per_commit for outcome in outcomes.values() + ) + if all_commit_finished and per_commit_finished: + break time.sleep(wait) total_wait = total_wait + wait - logger.info(f"Final test outcome: {outcome}") - return outcome == "passed" + logger.info(f"Final test outcomes: {outcomes}") + return outcomes def _get_test(test_name: str) -> Test: @@ -86,17 +198,16 @@ def _get_test(test_name: str) -> Test: def _get_commit_lists(passing_commit: str, failing_commit: str) -> List[str]: - commit_lists = ( + # This command obtains all commits between inclusively + return ( subprocess.check_output( - f"git rev-list --ancestry-path {passing_commit}..{failing_commit}", + f"git rev-list --reverse ^{passing_commit}~ {failing_commit}", shell=True, ) .decode("utf-8") .strip() .split("\n") ) - commit_lists.reverse() - return commit_lists if __name__ == "__main__": diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index b259e5d3bfc9..449dee26557d 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -164,7 +164,6 @@ def main( except ReleaseTestError as e: logger.exception(e) return_code = e.exit_code.value - logger.info( f"Release test pipeline for test {test['name']} 
completed. " f"Returning with exit code = {return_code}" diff --git a/release/ray_release/template.py b/release/ray_release/template.py index edb81d444607..1f8cf74a86ce 100644 --- a/release/ray_release/template.py +++ b/release/ray_release/template.py @@ -7,8 +7,8 @@ import jinja2 import yaml +from ray_release.bazel import bazel_runfile from ray_release.config import ( - RELEASE_PACKAGE_DIR, parse_python_version, DEFAULT_PYTHON_VERSION, get_test_cloud_id, @@ -73,7 +73,7 @@ def load_and_render_yaml_template( if not template_path: return None - if not os.path.exists(template_path): + if not os.path.isfile(template_path): raise ReleaseTestConfigError( f"Cannot load yaml template from {template_path}: Path not found." ) @@ -98,11 +98,14 @@ def render_yaml_template(template: str, env: Optional[Dict] = None): ) from e -def load_test_cluster_env(test: "Test", ray_wheels_url: str) -> Optional[Dict]: +def get_cluster_env_path(test: "Test") -> str: + working_dir = test.get("working_dir", "") cluster_env_file = test["cluster"]["cluster_env"] - cluster_env_path = os.path.join( - RELEASE_PACKAGE_DIR, test.get("working_dir", ""), cluster_env_file - ) + return bazel_runfile("release", working_dir, cluster_env_file) + + +def load_test_cluster_env(test: "Test", ray_wheels_url: str) -> Optional[Dict]: + cluster_env_path = get_cluster_env_path(test) env = populate_cluster_env_variables(test, ray_wheels_url=ray_wheels_url) @@ -139,12 +142,10 @@ def populate_cluster_env_variables(test: "Test", ray_wheels_url: str) -> Dict: def load_test_cluster_compute(test: "Test") -> Optional[Dict]: cluster_compute_file = test["cluster"]["cluster_compute"] - cluster_compute_path = os.path.join( - RELEASE_PACKAGE_DIR, test.get("working_dir", ""), cluster_compute_file - ) + working_dir = test.get("working_dir", "") + f = bazel_runfile("release", working_dir, cluster_compute_file) env = populate_cluster_compute_variables(test) - - return load_and_render_yaml_template(cluster_compute_path, env=env) + 
return load_and_render_yaml_template(f, env=env) def populate_cluster_compute_variables(test: "Test") -> Dict: diff --git a/release/ray_release/tests/test_anyscale_job_manager.py b/release/ray_release/tests/test_anyscale_job_manager.py new file mode 100644 index 000000000000..bd28fba1684f --- /dev/null +++ b/release/ray_release/tests/test_anyscale_job_manager.py @@ -0,0 +1,27 @@ +import pytest +import sys +import tempfile +import os + +from ray_release.util import ERROR_LOG_PATTERNS +from ray_release.job_manager.anyscale_job_manager import AnyscaleJobManager + + +def test_get_ray_error_logs(): + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "log01"), "w") as f: + f.writelines(ERROR_LOG_PATTERNS[:1]) + with open(os.path.join(tmpdir, "log02"), "w") as f: + f.writelines(ERROR_LOG_PATTERNS + ["haha"]) + with open(os.path.join(tmpdir, "job-driver-w00t"), "w") as f: + f.writelines("w00t") + ( + job_driver_log, + ray_error_log, + ) = AnyscaleJobManager._find_job_driver_and_ray_error_logs(tmpdir) + assert ray_error_log == "".join(ERROR_LOG_PATTERNS + ["haha"]) + assert job_driver_log == "w00t" + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_anyscale_job_wrapper.py b/release/ray_release/tests/test_anyscale_job_wrapper.py index 7f17df27270b..f7ba2053bef2 100644 --- a/release/ray_release/tests/test_anyscale_job_wrapper.py +++ b/release/ray_release/tests/test_anyscale_job_wrapper.py @@ -1,6 +1,7 @@ import pytest import sys import json + from ray_release.command_runner._anyscale_job_wrapper import ( main, run_bash_command, diff --git a/release/ray_release/tests/test_bisect.py b/release/ray_release/tests/test_bisect.py index e6a8a3b22dd4..3e64f19cbf29 100644 --- a/release/ray_release/tests/test_bisect.py +++ b/release/ray_release/tests/test_bisect.py @@ -1,25 +1,98 @@ +import sys +import pytest from unittest import mock -from ray_release.scripts.ray_bisect import _bisect +from 
typing import List, Set, Dict +from ray_release.scripts.ray_bisect import _bisect, _obtain_test_result, _sanity_check +from ray_release.config import Test -def test_bisect(): - commit_to_test_result = { - "c0": True, - "c1": True, - "c2": True, - "c3": False, - "c4": False, - } - def _mock_run_test(test_name: str, commit: str) -> bool: - return commit_to_test_result[commit] +def test_sanity_check(): + def _mock_run_test( + test: Test, commit: Set[str], run_per_commit: int + ) -> Dict[str, Dict[int, str]]: + return { + "passing_revision": {0: "passed", 1: "passed"}, + "failing_revision": {0: "failed", 1: "failed"}, + "flaky_revision": {0: "failed", 1: "passed"}, + } with mock.patch( "ray_release.scripts.ray_bisect._run_test", side_effect=_mock_run_test, - ), mock.patch( - "ray_release.scripts.ray_bisect._get_test", - return_value={}, ): - blamed_commit = _bisect("test", list(commit_to_test_result.keys())) - assert blamed_commit == "c3" + assert _sanity_check({}, "passing_revision", "failing_revision", 2) + assert _sanity_check({}, "passing_revision", "flaky_revision", 2) + assert not _sanity_check({}, "failing_revision", "passing_revision", 2) + assert not _sanity_check({}, "passing_revision", "passing_revision", 2) + assert not _sanity_check({}, "failing_revision", "failing_revision", 2) + assert not _sanity_check({}, "flaky_revision", "failing_revision", 2) + + +def test_obtain_test_result(): + test_cases = [ + { + "c0": {0: "passed"}, + }, + { + "c0": {0: "passed", 1: "passed"}, + "c1": {0: "hard_failed", 1: "hard_failed"}, + }, + ] + + def _mock_check_output(input: List[str]) -> str: + commit, run = tuple(input[-1].split("-")) + return bytes(test_case[commit][int(run)], "utf-8") + + for test_case in test_cases: + with mock.patch( + "subprocess.check_output", + side_effect=_mock_check_output, + ): + commits = set(test_case.keys()) + rerun_per_commit = len(test_case[list(commits)[0]]) + _obtain_test_result(commits, rerun_per_commit) == test_case + + +def 
test_bisect(): + test_cases = { + "c3": { + "c0": {0: "passed"}, + "c1": {0: "passed"}, + "c3": {0: "hard_failed"}, + "c4": {0: "soft_failed"}, + }, + "c1": { + "c0": {0: "passed"}, + "c1": {0: "hard_failed"}, + "c2": {0: "hard_failed"}, + }, + "cc1": { + "cc0": {0: "passed"}, + "cc1": {0: "hard_failed"}, + }, + "c2": { + "c0": {0: "passed", 1: "passed"}, + "c2": {0: "passed", 1: "hard_failed"}, + "c3": {0: "hard_failed", 1: "passed"}, + "c4": {0: "soft_failed", 1: "soft_failed"}, + }, + } + + for output, input in test_cases.items(): + + def _mock_run_test( + test: Test, commit: List[str], rerun_per_commit + ) -> Dict[str, str]: + return input + + with mock.patch( + "ray_release.scripts.ray_bisect._run_test", + side_effect=_mock_run_test, + ): + for concurreny in range(1, 4): + assert _bisect({}, list(input.keys()), concurreny, 1) == output + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_buildkite.py b/release/ray_release/tests/test_buildkite.py index 8d45079db243..253928ecd859 100644 --- a/release/ray_release/tests/test_buildkite.py +++ b/release/ray_release/tests/test_buildkite.py @@ -2,7 +2,7 @@ import sys import tempfile import unittest -from typing import Dict +from typing import Dict, Callable from unittest.mock import patch import yaml @@ -29,7 +29,6 @@ ) from ray_release.config import Test from ray_release.exception import ReleaseTestConfigError -from ray_release.tests.test_glue import MockReturn from ray_release.wheels import ( DEFAULT_BRANCH, ) @@ -43,6 +42,20 @@ def __call__(self, key: str): return self.return_dict.get(key, None) +class MockReturn: + return_dict = {} + + def __getattribute__(self, item): + return_dict = object.__getattribute__(self, "return_dict") + if item in return_dict: + mocked = return_dict[item] + if isinstance(mocked, Callable): + return mocked() + else: + return lambda *a, **kw: mocked + return object.__getattribute__(self, item) + + class 
MockBuildkitePythonAPI(MockReturn): def builds(self): return self @@ -271,7 +284,6 @@ def testSettingsOverrideBuildkite(self): "ray_release.buildkite.settings.get_buildkite_prompt_value", self.buildkite_mock, ): - # With no buildkite variables, default settings shouldn't be updated updated_settings = settings.copy() update_settings_from_buildkite(updated_settings) diff --git a/release/ray_release/tests/test_config.py b/release/ray_release/tests/test_config.py index 80915758b572..c09900c495e8 100644 --- a/release/ray_release/tests/test_config.py +++ b/release/ray_release/tests/test_config.py @@ -1,8 +1,8 @@ -import os import sys import yaml import pytest +from ray_release.bazel import bazel_runfile from ray_release.config import ( read_and_validate_release_test_collection, Test, @@ -13,10 +13,7 @@ ) from ray_release.exception import ReleaseTestConfigError -TEST_COLLECTION_FILE = os.path.join( - os.path.dirname(__file__), "..", "..", "release_tests.yaml" -) - +_TEST_COLLECTION_FILE = bazel_runfile("release/release_tests.yaml") VALID_TEST = Test( **{ @@ -65,7 +62,6 @@ def test_parse_test_definition(): - __suffix__: aws - __suffix__: gce cluster: - cluster_env: env_gce.yaml cluster_compute: compute_gce.yaml """ ) @@ -79,6 +75,7 @@ def test_parse_test_definition(): assert not validate_test(gce_test, schema) assert aws_test["name"] == "sample_test.aws" assert gce_test["cluster"]["cluster_compute"] == "compute_gce.yaml" + assert gce_test["cluster"]["cluster_env"] == "env.yaml" invalid_test_definition = test_definitions[0] # Intentionally make the test definition invalid by create an empty 'variations' # field. 
Check that the parser throws exception at runtime @@ -219,7 +216,7 @@ def test_compute_config_invalid_ebs(): def test_load_and_validate_test_collection_file(): - read_and_validate_release_test_collection(TEST_COLLECTION_FILE) + read_and_validate_release_test_collection(_TEST_COLLECTION_FILE) if __name__ == "__main__": diff --git a/release/ray_release/tests/test_env.py b/release/ray_release/tests/test_env.py index 95eeafc81c90..a87ae0071759 100644 --- a/release/ray_release/tests/test_env.py +++ b/release/ray_release/tests/test_env.py @@ -28,7 +28,7 @@ def test_load_env_invalid(): def test_load_env_changes(): old_val = str(DEFAULT_ANYSCALE_PROJECT) - env_dict = load_environment("staging_v2") + env_dict = load_environment("aws") populate_os_env(env_dict) new_val = str(DEFAULT_ANYSCALE_PROJECT) diff --git a/release/ray_release/tests/test_glue.py b/release/ray_release/tests/test_glue.py index 1cf9cdcf1dd2..11239ac3db3f 100644 --- a/release/ray_release/tests/test_glue.py +++ b/release/ray_release/tests/test_glue.py @@ -12,15 +12,9 @@ from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.cluster_manager.full import FullClusterManager from ray_release.command_runner.command_runner import CommandRunner -from ray_release.config import ( - Test, - DEFAULT_COMMAND_TIMEOUT, - DEFAULT_WAIT_FOR_NODES_TIMEOUT, -) +from ray_release.config import Test from ray_release.exception import ( ReleaseTestConfigError, - LocalEnvSetupError, - ClusterComputeCreateError, ClusterEnvBuildError, ClusterEnvBuildTimeout, ClusterEnvCreateError, @@ -44,8 +38,6 @@ run_release_test, type_str_to_command_runner, command_runner_to_cluster_manager, - command_runner_to_file_manager, - TIMEOUT_BUFFER_MINUTES, ) from ray_release.logger import logger from ray_release.reporter.reporter import Reporter @@ -74,8 +66,6 @@ def __getattribute__(self, item): return object.__getattribute__(self, item) -@patch("ray_release.glue.reinstall_anyscale_dependencies", lambda: None) 
-@patch("ray_release.glue.get_pip_packages", lambda: ["pip-packages"]) class GlueTest(unittest.TestCase): def writeClusterEnv(self, content: str): with open(os.path.join(self.tempdir, "cluster_env.yaml"), "wt") as fp: @@ -161,7 +151,6 @@ def mock_alerter(test: Test, result: Result): type_str_to_command_runner["unit_test"] = MockCommandRunner command_runner_to_cluster_manager[MockCommandRunner] = MockClusterManager - command_runner_to_file_manager[MockCommandRunner] = MockFileManager self.test = Test( name="unit_test_end_to_end", @@ -176,7 +165,6 @@ def mock_alerter(test: Test, result: Result): cluster_env="cluster_env.yaml", cluster_compute="cluster_compute.yaml" ), alert="unit_test_alerter", - driver_setup="driver_fail.sh", ) self.anyscale_project = "prj_unit12345678" self.ray_wheels_url = "http://mock.wheels/" @@ -186,16 +174,6 @@ def tearDown(self) -> None: def _succeed_until(self, until: str): # These commands should succeed - self.command_runner_return["prepare_local_env"] = None - - if until == "local_env": - return - - self.test["driver_setup"] = "driver_succeed.sh" - - if until == "driver_setup": - return - self.cluster_manager_return["cluster_compute_id"] = "valid" self.cluster_manager_return["create_cluster_compute"] = None @@ -318,102 +296,6 @@ def testInvalidClusterCompute(self): self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value) - def testAutomaticClusterEnvVariables(self): - result = Result() - - self._succeed_until("local_env") - - with self.assertRaises(LocalEnvSetupError): - self._run(result) - - cluster_manager = self.instances["cluster_manager"] - - command_timeout = self.test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT) - prepare_cmd = self.test["run"].get("prepare", None) - if prepare_cmd: - prepare_timeout = self.test["run"].get("prepare_timeout", command_timeout) - else: - prepare_timeout = 0 - command_and_prepare_timeout = command_timeout + prepare_timeout - - wait_timeout = self.test["run"]["wait_for_nodes"].get( - 
"timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT - ) - - expected_idle_termination_minutes = int( - command_and_prepare_timeout / 60 + TIMEOUT_BUFFER_MINUTES - ) - expected_maximum_uptime_minutes = int( - expected_idle_termination_minutes + wait_timeout + TIMEOUT_BUFFER_MINUTES - ) - - self.assertEqual( - cluster_manager.cluster_compute["idle_termination_minutes"], - expected_idle_termination_minutes, - ) - self.assertEqual( - cluster_manager.cluster_compute["maximum_uptime_minutes"], - expected_maximum_uptime_minutes, - ) - - def testInvalidPrepareLocalEnv(self): - result = Result() - - self.command_runner_return["prepare_local_env"] = _fail_on_call( - LocalEnvSetupError - ) - with self.assertRaises(LocalEnvSetupError): - self._run(result) - self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) - - def testDriverSetupFails(self): - result = Result() - - self._succeed_until("local_env") - - with self.assertRaises(LocalEnvSetupError): - self._run(result) - self.assertEqual(result.return_code, ExitCode.LOCAL_ENV_SETUP_ERROR.value) - - def testInvalidClusterIdOverride(self): - result = Result() - - self._succeed_until("driver_setup") - - self.sdk.returns["get_cluster_environment"] = None - - with self.assertRaises(ClusterEnvCreateError): - self._run(result, cluster_env_id="existing") - - self.sdk.returns["get_cluster_environment"] = APIDict( - result=APIDict(config_json={"overridden": True}) - ) - - with self.assertRaises(Exception) as cm: # Fail somewhere else - self._run(result, cluster_env_id="existing") - self.assertNotIsInstance(cm.exception, ClusterEnvCreateError) - - def testBuildConfigFailsClusterCompute(self): - result = Result() - - self._succeed_until("driver_setup") - - # These commands should succeed - self.command_runner_return["prepare_local_env"] = None - - # Fails because API response faulty - with self.assertRaisesRegex(ClusterComputeCreateError, "Unexpected"): - self._run(result) - self.assertEqual(result.return_code, 
ExitCode.CLUSTER_RESOURCE_ERROR.value) - - # Fails for random cluster compute reason - self.cluster_manager_return["create_cluster_compute"] = _fail_on_call( - ClusterComputeCreateError, "Known" - ) - with self.assertRaisesRegex(ClusterComputeCreateError, "Known"): - self._run(result) - self.assertEqual(result.return_code, ExitCode.CLUSTER_RESOURCE_ERROR.value) - def testBuildConfigFailsClusterEnv(self): result = Result() diff --git a/release/ray_release/tests/test_log_aggregator.py b/release/ray_release/tests/test_log_aggregator.py new file mode 100644 index 000000000000..08293ca0dab6 --- /dev/null +++ b/release/ray_release/tests/test_log_aggregator.py @@ -0,0 +1,74 @@ +import sys +import pytest +from ray_release.log_aggregator import LogAggregator + + +def test_compute_stack_pattern(): + assert ( + LogAggregator( + "\n".join( + [ + "haha", + "Traceback (most recent call last):", + ' File "/tmp/something", line 584', + "Exception: yaya45", + "hehe", + ] + ) + ).compute_crash_pattern() + == "somethingline Exception: yaya" + ) + + +def test_compute_signature(): + assert ( + LogAggregator._compute_signature( + [ + "Traceback (most recent call last):", + ' File "/tmp/something", line 584', + ' File "/tmp/another", deedeebeeaacfa-abc' "Exception: yaya45", + ] + ) + == "somethingline another-abcException: yaya" + ) + + +def test_compute_stack_trace(): + trace = [ + "Traceback (most recent call last):", + ' File "/tmp/something", line 584, in run_release_test', + " raise pipeline_exception", + "ray_release.exception.JobNoLogsError: Could not obtain logs for the job.", + ] + error_trace = [ + "[2023-01-01] ERROR: something is wrong", + "Traceback (most recent call last):", + ' File "/tmp/something", line 584, in run_release_test', + " raise pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", + ] + error_trace_short = [ + "[2023-01-01] ERROR: something is wrong" + ' File "/tmp/something", line 584, in run_release_test', + " raise 
pipeline_exception", + "ray_release.exception.JobStartupTimeout: Cluster did not start.", + ] + assert LogAggregator._compute_stack_trace(["haha"] + trace + ["hehe"]) == trace + assert ( + LogAggregator._compute_stack_trace(["haha"] + error_trace + ["hehe"]) + == error_trace + ) + assert ( + LogAggregator._compute_stack_trace(["haha"] + error_trace_short + ["hehe"]) + == error_trace_short + ) + assert ( + LogAggregator._compute_stack_trace( + ["haha"] + trace + ["w00t"] + error_trace + ["hehe"] + ) + == error_trace + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_result.py b/release/ray_release/tests/test_result.py index 8ae16b298503..f5e963f60df6 100644 --- a/release/ray_release/tests/test_result.py +++ b/release/ray_release/tests/test_result.py @@ -1,3 +1,5 @@ +import pytest +import sys import os from unittest import mock from ray_release.result import handle_exception, ExitCode, ResultStatus @@ -34,3 +36,7 @@ def test_handle_exception(): ResultStatus.INFRA_ERROR, None, ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/tests/test_wheels.py b/release/ray_release/tests/test_wheels.py index 75dc068952a5..b55a0e95e99c 100644 --- a/release/ray_release/tests/test_wheels.py +++ b/release/ray_release/tests/test_wheels.py @@ -6,6 +6,7 @@ from freezegun import freeze_time +from ray_release.bazel import bazel_runfile from ray_release.config import Test from ray_release.template import load_test_cluster_env from ray_release.exception import RayWheelsNotFoundError, RayWheelsTimeoutError @@ -19,6 +20,7 @@ is_wheels_url_matching_ray_verison, get_wheels_filename, maybe_rewrite_wheels_url, + parse_commit_from_wheel_url, ) @@ -30,9 +32,7 @@ def remove_buildkite_env(): def test_get_ray_version(remove_buildkite_env): - init_file = os.path.join( - os.path.dirname(__file__), "..", "..", "..", "python", "ray", "__init__.py" - ) + init_file = 
bazel_runfile("python/ray/__init__.py") with open(init_file, "rt") as fp: content = [line.encode() for line in fp.readlines()] @@ -252,5 +252,15 @@ def test_url_exist(): assert not url_exists("invalid://somewhere") +def test_parse_commit_from_wheel_url(): + url = ( + "https://s3-us-west-2.amazonaws.com/ray-wheels/master/" + "0e0c15065507f01e8bfe78e49b0d0de063f81164/" + "ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" + ) + expected_commit = "0e0c15065507f01e8bfe78e49b0d0de063f81164" + assert parse_commit_from_wheel_url(url) == expected_commit + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/release/ray_release/util.py b/release/ray_release/util.py index acfa27bda9b3..a9c7a14e4538 100644 --- a/release/ray_release/util.py +++ b/release/ray_release/util.py @@ -28,6 +28,10 @@ def __str__(self): S3_CLOUD_STORAGE = "s3" GS_CLOUD_STORAGE = "gs" GS_BUCKET = "anyscale-oss-dev-bucket" +ERROR_LOG_PATTERNS = [ + "ERROR", + "Traceback (most recent call last)", +] def deep_update(d, u) -> Dict: @@ -144,8 +148,6 @@ def run_bash_script(bash_script: str) -> None: def reinstall_anyscale_dependencies() -> None: logger.info("Re-installing `anyscale` package") - - # Copy anyscale pin to requirements.txt and requirements_buildkite.txt subprocess.check_output( "pip install -U anyscale", shell=True, diff --git a/release/ray_release/wheels.py b/release/ray_release/wheels.py index 0ce5344a7852..92aaa02a155a 100644 --- a/release/ray_release/wheels.py +++ b/release/ray_release/wheels.py @@ -437,3 +437,12 @@ def install_matching_ray_locally(ray_wheels: Optional[str]): for module_name in RELOAD_MODULES: if module_name in sys.modules: importlib.reload(sys.modules[module_name]) + + +def parse_commit_from_wheel_url(url: str) -> str: + # url is expected to be in the format of + # https://s3-us-west-2.amazonaws.com/ray-wheels/master/0e0c15065507f01e8bfe78e49b0d0de063f81164/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl # noqa + regex = r"/([0-9a-f]{40})/" + 
match = re.search(regex, url) + if match: + return match.group(1) diff --git a/release/release_logs/2.4.0/benchmarks/many_actors.json b/release/release_logs/2.4.0/benchmarks/many_actors.json index 80bbbeb5746c..bb15979086f1 100644 --- a/release/release_logs/2.4.0/benchmarks/many_actors.json +++ b/release/release_logs/2.4.0/benchmarks/many_actors.json @@ -1,32 +1,32 @@ { - "_dashboard_memory_usage_mb": 513.560576, - "_dashboard_test_success": true, - "_peak_memory": 3.91, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n165\t2.02GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2388\t0.85GiB\tpython distributed/test_many_actors.py\n353\t0.37GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n41\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n670\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n38\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --ServerApp.token=agh0_CkgwRgIhAP\n2610\t0.07GiB\tray::DashboardTester.run\n2523\t0.07GiB\tray::MemoryMonitorActor.run\n280\t0.04GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.113.246:9031 --host=0.0.0.\n553\t0.04GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/log_m", - "actors_per_second": 772.644103201044, - "num_actors": 10000, - "perf_metrics": [ - { - "perf_metric_name": "actors_per_second", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 772.644103201044 - }, - { - "perf_metric_name": "dashboard_p50_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 34.714 - }, - { - "perf_metric_name": "dashboard_p95_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 2419.503 - }, - { - "perf_metric_name": "dashboard_p99_latency_ms", - 
"perf_metric_type": "LATENCY", - "perf_metric_value": 3842.061 - } - ], - "success": "1", - "time": 12.942569494247437 + "_dashboard_memory_usage_mb": 574.578688, + "_dashboard_test_success": true, + "_peak_memory": 3.84, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n165\t2.01GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2859\t0.83GiB\tpython distributed/test_many_actors.py\n338\t0.33GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n41\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n639\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n38\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --ServerApp.token= --allow-root -\n3082\t0.07GiB\tray::DashboardTester.run\n2996\t0.07GiB\tray::MemoryMonitorActor.run\n265\t0.04GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.97.64:9031 --host=0.0.0.0\n538\t0.04GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/log_m", + "actors_per_second": 737.6387503180771, + "num_actors": 10000, + "perf_metrics": [ + { + "perf_metric_name": "actors_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 737.6387503180771 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 28.971 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 1899.861 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 2901.064 + } + ], + "success": "1", + "time": 13.556771516799927 } diff --git a/release/release_logs/2.4.0/benchmarks/many_nodes.json b/release/release_logs/2.4.0/benchmarks/many_nodes.json index 
8abafc45366a..844e5bbf70da 100644 --- a/release/release_logs/2.4.0/benchmarks/many_nodes.json +++ b/release/release_logs/2.4.0/benchmarks/many_nodes.json @@ -1,38 +1,38 @@ { - "_dashboard_memory_usage_mb": 186.179584, - "_dashboard_test_success": true, - "_peak_memory": 4.23, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n277\t0.59GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n1900\t0.22GiB\tpython distributed/test_many_tasks.py --num-tasks=1000\n477\t0.16GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n811\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n56\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n2221\t0.08GiB\tray::StateAPIGeneratorActor.start\n1486\t0.08GiB\tray::JobSupervisor\n46\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --allow-root --ip=127.0.0.1 --no-\n2047\t0.07GiB\tray::MemoryMonitorActor.run\n2144\t0.07GiB\tray::DashboardTester.run", - "num_tasks": 1000, - "perf_metrics": [ - { - "perf_metric_name": "tasks_per_second", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 216.16404352694366 - }, - { - "perf_metric_name": "used_cpus_by_deadline", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 250.0 - }, - { - "perf_metric_name": "dashboard_p50_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 3.915 - }, - { - "perf_metric_name": "dashboard_p95_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 56.099 - }, - { - "perf_metric_name": "dashboard_p99_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 130.237 - } - ], - "success": "1", - "tasks_per_second": 216.16404352694366, - "time": 304.62611627578735, - "used_cpus": 250.0 + "_dashboard_memory_usage_mb": 187.82208, + "_dashboard_test_success": true, + 
"_peak_memory": 4.05, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n277\t0.77GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n1605\t0.23GiB\tpython distributed/test_many_tasks.py --num-tasks=1000\n450\t0.16GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n783\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n61\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n1925\t0.08GiB\tray::StateAPIGeneratorActor.start\n1201\t0.08GiB\tray::JobSupervisor\n52\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --allow-root --ip=127.0.0.1 --no-\n1753\t0.07GiB\tray::MemoryMonitorActor.run\n1863\t0.07GiB\tray::DashboardTester.run", + "num_tasks": 1000, + "perf_metrics": [ + { + "perf_metric_name": "tasks_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 228.0567062081301 + }, + { + "perf_metric_name": "used_cpus_by_deadline", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 250.0 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 4.072 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 34.235 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 134.023 + } + ], + "success": "1", + "tasks_per_second": 228.0567062081301, + "time": 304.38487434387207, + "used_cpus": 250.0 } diff --git a/release/release_logs/2.4.0/benchmarks/many_pgs.json b/release/release_logs/2.4.0/benchmarks/many_pgs.json index 0c8b02ccdc3a..f6653ea5e65c 100644 --- a/release/release_logs/2.4.0/benchmarks/many_pgs.json +++ b/release/release_logs/2.4.0/benchmarks/many_pgs.json @@ -1,32 +1,32 @@ { - "_dashboard_memory_usage_mb": 166.48192, - 
"_dashboard_test_success": true, - "_peak_memory": 4.85, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n256\t1.02GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n1705\t0.4GiB\tpython distributed/test_many_pgs.py\n471\t0.15GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n803\t0.11GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n61\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n608\t0.08GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/raylet/raylet --raylet_socket_name=\n1300\t0.07GiB\tray::JobSupervisor\n48\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --allow-root --ip=127.0.0.1 --no-\n1963\t0.07GiB\tray::DashboardTester.run\n1852\t0.07GiB\tray::MemoryMonitorActor.run", - "num_pgs": 1000, - "perf_metrics": [ - { - "perf_metric_name": "pgs_per_second", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 17.929446347235622 - }, - { - "perf_metric_name": "dashboard_p50_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 3.307 - }, - { - "perf_metric_name": "dashboard_p95_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 128.925 - }, - { - "perf_metric_name": "dashboard_p99_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 197.038 - } - ], - "pgs_per_second": 17.929446347235622, - "success": "1", - "time": 55.77417063713074 + "_dashboard_memory_usage_mb": 181.796864, + "_dashboard_test_success": true, + "_peak_memory": 4.48, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n277\t1.15GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n1724\t0.35GiB\tpython distributed/test_many_pgs.py\n468\t0.12GiB\t/home/ray/anaconda3/bin/python 
/home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n774\t0.11GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n595\t0.09GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/raylet/raylet --raylet_socket_name=\n69\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n1318\t0.07GiB\tray::JobSupervisor\n52\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --allow-root --ip=127.0.0.1 --no-\n1869\t0.07GiB\tray::MemoryMonitorActor.run\n1965\t0.06GiB\tray::DashboardTester.run", + "num_pgs": 1000, + "perf_metrics": [ + { + "perf_metric_name": "pgs_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 17.321330540558634 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 3.244 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 134.793 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 171.365 + } + ], + "pgs_per_second": 17.321330540558634, + "success": "1", + "time": 57.732285499572754 } diff --git a/release/release_logs/2.4.0/benchmarks/many_tasks.json b/release/release_logs/2.4.0/benchmarks/many_tasks.json index 01ca873ccb04..e5e7c8f8a7f7 100644 --- a/release/release_logs/2.4.0/benchmarks/many_tasks.json +++ b/release/release_logs/2.4.0/benchmarks/many_tasks.json @@ -1,38 +1,38 @@ { - "_dashboard_memory_usage_mb": 659.90656, - "_dashboard_test_success": true, - "_peak_memory": 4.44, - "_peak_process_memory": "PID\tMEM\tCOMMAND\n165\t1.99GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2232\t0.87GiB\tpython distributed/test_many_tasks.py --num-tasks=10000\n353\t0.74GiB\t/home/ray/anaconda3/bin/python 
/home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n2457\t0.1GiB\tray::DashboardTester.run\n41\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n670\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n38\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --ServerApp.token=agh0_CkYwRAIgH_\n2520\t0.07GiB\tray::StateAPIGeneratorActor.start\n2370\t0.07GiB\tray::MemoryMonitorActor.run\n280\t0.04GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.96.232:9031 --host=0.0.0.0", - "num_tasks": 10000, - "perf_metrics": [ - { - "perf_metric_name": "tasks_per_second", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 324.3236630929032 - }, - { - "perf_metric_name": "used_cpus_by_deadline", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2500.0 - }, - { - "perf_metric_name": "dashboard_p50_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 5.019 - }, - { - "perf_metric_name": "dashboard_p95_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 2788.74 - }, - { - "perf_metric_name": "dashboard_p99_latency_ms", - "perf_metric_type": "LATENCY", - "perf_metric_value": 3431.297 - } - ], - "success": "1", - "tasks_per_second": 324.3236630929032, - "time": 330.83339619636536, - "used_cpus": 2500.0 + "_dashboard_memory_usage_mb": 611.98336, + "_dashboard_test_success": true, + "_peak_memory": 4.61, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n165\t2.19GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2582\t0.86GiB\tpython distributed/test_many_tasks.py --num-tasks=10000\n338\t0.71GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n2807\t0.1GiB\tray::DashboardTester.run\n43\t0.09GiB\t/home/ray/anaconda3/bin/python 
/home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n639\t0.09GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n40\t0.07GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-lab --ServerApp.token= --allow-root -\n2870\t0.07GiB\tray::StateAPIGeneratorActor.start\n2720\t0.07GiB\tray::MemoryMonitorActor.run\n265\t0.04GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.106.119:9031 --host=0.0.0.", + "num_tasks": 10000, + "perf_metrics": [ + { + "perf_metric_name": "tasks_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 296.54971037133174 + }, + { + "perf_metric_name": "used_cpus_by_deadline", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2500.0 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 5.024 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 2713.875 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 3772.233 + } + ], + "success": "1", + "tasks_per_second": 296.54971037133174, + "time": 333.7211592197418, + "used_cpus": 2500.0 } diff --git a/release/release_logs/2.4.0/microbenchmark.json b/release/release_logs/2.4.0/microbenchmark.json index e0494e2c93f9..af88513bbd59 100644 --- a/release/release_logs/2.4.0/microbenchmark.json +++ b/release/release_logs/2.4.0/microbenchmark.json @@ -1,283 +1,283 @@ { - "1_1_actor_calls_async": [ - 7875.2205662523575, - 91.32036915829057 - ], - "1_1_actor_calls_concurrent": [ - 4898.403930689569, - 18.787428785414974 - ], - "1_1_actor_calls_sync": [ - 2490.1228801310986, - 61.66895883060133 - ], - "1_1_async_actor_calls_async": [ - 2893.7637668160814, - 58.54240700476942 - ], - "1_1_async_actor_calls_sync": [ - 1663.656683863718, - 82.11634772966543 - ], - 
"1_1_async_actor_calls_with_args_async": [ - 2053.88520928116, - 77.88191802930348 - ], - "1_n_actor_calls_async": [ - 10918.570247859934, - 252.77023513295532 - ], - "1_n_async_actor_calls_async": [ - 10007.72780551488, - 63.167900610588546 - ], - "client__1_1_actor_calls_async": [ - 1053.9221163152763, - 16.386239593374267 - ], - "client__1_1_actor_calls_concurrent": [ - 1067.680489513954, - 12.709001083081098 - ], - "client__1_1_actor_calls_sync": [ - 587.2290114221607, - 11.254483031373043 - ], - "client__get_calls": [ - 1169.0846386325316, - 30.090440134694333 - ], - "client__put_calls": [ - 953.6497274525543, - 27.62987622516938 - ], - "client__put_gigabytes": [ - 0.04453569846336401, - 0.0005889062724214797 - ], - "client__tasks_and_get_batch": [ - 0.9962426891274011, - 0.012695780047090623 - ], - "client__tasks_and_put_batch": [ - 11636.76543061424, - 344.5895993880181 - ], - "multi_client_put_calls_Plasma_Store": [ - 12782.464998678728, - 303.0850304711337 - ], - "multi_client_put_gigabytes": [ - 25.409834920338362, - 0.9620047718388134 - ], - "multi_client_tasks_async": [ - 29499.600819285548, - 1838.097234923537 - ], - "n_n_actor_calls_async": [ - 31558.549225320676, - 675.1014999204177 - ], - "n_n_actor_calls_with_arg_async": [ - 3114.287612859598, - 43.68969356632565 - ], - "n_n_async_actor_calls_async": [ - 25348.416455631697, - 927.5572889075144 - ], - "perf_metrics": [ - { - "perf_metric_name": "single_client_get_calls_Plasma_Store", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 6717.39789740067 - }, - { - "perf_metric_name": "single_client_put_calls_Plasma_Store", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 6141.116222223478 - }, - { - "perf_metric_name": "multi_client_put_calls_Plasma_Store", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 12782.464998678728 - }, - { - "perf_metric_name": "single_client_put_gigabytes", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 19.77763371659089 - }, - { - 
"perf_metric_name": "single_client_tasks_and_get_batch", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 10.903109137328919 - }, - { - "perf_metric_name": "multi_client_put_gigabytes", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 25.409834920338362 - }, - { - "perf_metric_name": "single_client_get_object_containing_10k_refs", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 12.601815520396993 - }, - { - "perf_metric_name": "single_client_wait_1k_refs", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 6.25672193789162 - }, - { - "perf_metric_name": "single_client_tasks_sync", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1315.6532263025485 - }, - { - "perf_metric_name": "single_client_tasks_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 10565.441516164923 - }, - { - "perf_metric_name": "multi_client_tasks_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 29499.600819285548 - }, - { - "perf_metric_name": "1_1_actor_calls_sync", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2490.1228801310986 - }, - { - "perf_metric_name": "1_1_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 7875.2205662523575 - }, - { - "perf_metric_name": "1_1_actor_calls_concurrent", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 4898.403930689569 - }, - { - "perf_metric_name": "1_n_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 10918.570247859934 - }, - { - "perf_metric_name": "n_n_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 31558.549225320676 - }, - { - "perf_metric_name": "n_n_actor_calls_with_arg_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 3114.287612859598 - }, - { - "perf_metric_name": "1_1_async_actor_calls_sync", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1663.656683863718 - }, - { - "perf_metric_name": "1_1_async_actor_calls_async", - 
"perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2893.7637668160814 - }, - { - "perf_metric_name": "1_1_async_actor_calls_with_args_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 2053.88520928116 - }, - { - "perf_metric_name": "1_n_async_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 10007.72780551488 - }, - { - "perf_metric_name": "n_n_async_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 25348.416455631697 - }, - { - "perf_metric_name": "placement_group_create/removal", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1010.4619236411705 - }, - { - "perf_metric_name": "client__get_calls", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1169.0846386325316 - }, - { - "perf_metric_name": "client__put_calls", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 953.6497274525543 - }, - { - "perf_metric_name": "client__put_gigabytes", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 0.04453569846336401 - }, - { - "perf_metric_name": "client__tasks_and_put_batch", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 11636.76543061424 - }, - { - "perf_metric_name": "client__1_1_actor_calls_sync", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 587.2290114221607 - }, - { - "perf_metric_name": "client__1_1_actor_calls_async", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1053.9221163152763 - }, - { - "perf_metric_name": "client__1_1_actor_calls_concurrent", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 1067.680489513954 - }, - { - "perf_metric_name": "client__tasks_and_get_batch", - "perf_metric_type": "THROUGHPUT", - "perf_metric_value": 0.9962426891274011 - } - ], - "placement_group_create/removal": [ - 1010.4619236411705, - 10.079449349718745 - ], - "single_client_get_calls_Plasma_Store": [ - 6717.39789740067, - 383.3989162609101 - ], - "single_client_get_object_containing_10k_refs": [ - 
12.601815520396993, - 0.02598969871083366 - ], - "single_client_put_calls_Plasma_Store": [ - 6141.116222223478, - 55.35476909877019 - ], - "single_client_put_gigabytes": [ - 19.77763371659089, - 4.567428080163564 - ], - "single_client_tasks_and_get_batch": [ - 10.903109137328919, - 0.578088867627798 - ], - "single_client_tasks_async": [ - 10565.441516164923, - 517.415416188027 - ], - "single_client_tasks_sync": [ - 1315.6532263025485, - 28.172086151746203 - ], - "single_client_wait_1k_refs": [ - 6.25672193789162, - 0.1697384586263945 - ] -} \ No newline at end of file + "1_1_actor_calls_async": [ + 8774.565215109124, + 99.25145099727403 + ], + "1_1_actor_calls_concurrent": [ + 5556.408295085073, + 310.93719458837893 + ], + "1_1_actor_calls_sync": [ + 2627.6907826949946, + 17.95559542036301 + ], + "1_1_async_actor_calls_async": [ + 3053.0727243465535, + 44.040906553833615 + ], + "1_1_async_actor_calls_sync": [ + 1749.332379578524, + 36.34387252208381 + ], + "1_1_async_actor_calls_with_args_async": [ + 2438.8474219503923, + 88.20268908351537 + ], + "1_n_actor_calls_async": [ + 11443.027888783963, + 87.49946447311738 + ], + "1_n_async_actor_calls_async": [ + 10589.09103935827, + 106.34051316384893 + ], + "client__1_1_actor_calls_async": [ + 1084.4466197839831, + 33.13091505679245 + ], + "client__1_1_actor_calls_concurrent": [ + 1106.1285553207586, + 25.829782511660305 + ], + "client__1_1_actor_calls_sync": [ + 569.89256256793, + 24.8044284674939 + ], + "client__get_calls": [ + 1150.072552279968, + 36.79667266684934 + ], + "client__put_calls": [ + 904.8795868787086, + 13.073088078601502 + ], + "client__put_gigabytes": [ + 0.045687216636994404, + 0.00042081305098886794 + ], + "client__tasks_and_get_batch": [ + 0.9467558674435517, + 0.053000735219486415 + ], + "client__tasks_and_put_batch": [ + 12964.98005883289, + 319.3993930878376 + ], + "multi_client_put_calls_Plasma_Store": [ + 13685.622603708454, + 107.89607650269706 + ], + "multi_client_put_gigabytes": [ + 
31.944310601566972, + 0.8025680665642678 + ], + "multi_client_tasks_async": [ + 34377.35783189367, + 2098.212516049616 + ], + "n_n_actor_calls_async": [ + 34184.700321977085, + 833.4165939251417 + ], + "n_n_actor_calls_with_arg_async": [ + 3086.29057625603, + 35.752775132663736 + ], + "n_n_async_actor_calls_async": [ + 27000.281282494278, + 1192.3434495510094 + ], + "perf_metrics": [ + { + "perf_metric_name": "single_client_get_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 6220.115319914286 + }, + { + "perf_metric_name": "single_client_put_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 6427.78729348863 + }, + { + "perf_metric_name": "multi_client_put_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 13685.622603708454 + }, + { + "perf_metric_name": "single_client_put_gigabytes", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 20.114238761619227 + }, + { + "perf_metric_name": "single_client_tasks_and_get_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10.621044232599615 + }, + { + "perf_metric_name": "multi_client_put_gigabytes", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 31.944310601566972 + }, + { + "perf_metric_name": "single_client_get_object_containing_10k_refs", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 13.4303118492593 + }, + { + "perf_metric_name": "single_client_wait_1k_refs", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 5.669743175103769 + }, + { + "perf_metric_name": "single_client_tasks_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1402.5799311893395 + }, + { + "perf_metric_name": "single_client_tasks_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 11589.713176381527 + }, + { + "perf_metric_name": "multi_client_tasks_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 34377.35783189367 + }, + { + "perf_metric_name": 
"1_1_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2627.6907826949946 + }, + { + "perf_metric_name": "1_1_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 8774.565215109124 + }, + { + "perf_metric_name": "1_1_actor_calls_concurrent", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 5556.408295085073 + }, + { + "perf_metric_name": "1_n_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 11443.027888783963 + }, + { + "perf_metric_name": "n_n_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 34184.700321977085 + }, + { + "perf_metric_name": "n_n_actor_calls_with_arg_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 3086.29057625603 + }, + { + "perf_metric_name": "1_1_async_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1749.332379578524 + }, + { + "perf_metric_name": "1_1_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 3053.0727243465535 + }, + { + "perf_metric_name": "1_1_async_actor_calls_with_args_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2438.8474219503923 + }, + { + "perf_metric_name": "1_n_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10589.09103935827 + }, + { + "perf_metric_name": "n_n_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 27000.281282494278 + }, + { + "perf_metric_name": "placement_group_create/removal", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1111.4458914419295 + }, + { + "perf_metric_name": "client__get_calls", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1150.072552279968 + }, + { + "perf_metric_name": "client__put_calls", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 904.8795868787086 + }, + { + "perf_metric_name": "client__put_gigabytes", + "perf_metric_type": "THROUGHPUT", + 
"perf_metric_value": 0.045687216636994404 + }, + { + "perf_metric_name": "client__tasks_and_put_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 12964.98005883289 + }, + { + "perf_metric_name": "client__1_1_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 569.89256256793 + }, + { + "perf_metric_name": "client__1_1_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1084.4466197839831 + }, + { + "perf_metric_name": "client__1_1_actor_calls_concurrent", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1106.1285553207586 + }, + { + "perf_metric_name": "client__tasks_and_get_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 0.9467558674435517 + } + ], + "placement_group_create/removal": [ + 1111.4458914419295, + 36.12535104191624 + ], + "single_client_get_calls_Plasma_Store": [ + 6220.115319914286, + 224.5052300173086 + ], + "single_client_get_object_containing_10k_refs": [ + 13.4303118492593, + 0.17122469816034125 + ], + "single_client_put_calls_Plasma_Store": [ + 6427.78729348863, + 84.21488331092435 + ], + "single_client_put_gigabytes": [ + 20.114238761619227, + 6.003066360606937 + ], + "single_client_tasks_and_get_batch": [ + 10.621044232599615, + 0.7018065234293067 + ], + "single_client_tasks_async": [ + 11589.713176381527, + 69.968115451784 + ], + "single_client_tasks_sync": [ + 1402.5799311893395, + 44.22066668152743 + ], + "single_client_wait_1k_refs": [ + 5.669743175103769, + 0.16404015686449241 + ] +} diff --git a/release/release_logs/2.4.0/scalability/object_store.json b/release/release_logs/2.4.0/scalability/object_store.json index 7a9f43ff8c59..726d2fdf756e 100644 --- a/release/release_logs/2.4.0/scalability/object_store.json +++ b/release/release_logs/2.4.0/scalability/object_store.json @@ -1,40 +1,13 @@ { - "args_time": 16.78179498900002, - "get_time": 24.349769197, - "large_object_size": 107374182400, - "large_object_time": 34.58217635799997, - 
"num_args": 10000, - "num_get_args": 10000, - "num_queued": 1000000, - "num_returns": 3000, - "perf_metrics": [ - { - "perf_metric_name": "10000_args_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 16.78179498900002 - }, - { - "perf_metric_name": "3000_returns_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 6.026168543000011 - }, - { - "perf_metric_name": "10000_get_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 24.349769197 - }, - { - "perf_metric_name": "1000000_queued_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 239.18845452300002 - }, - { - "perf_metric_name": "107374182400_large_object_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 34.58217635799997 - } - ], - "queued_time": 239.18845452300002, - "returns_time": 6.026168543000011, - "success": "1" + "broadcast_time": 89.42807069100002, + "num_nodes": 50, + "object_size": 1073741824, + "perf_metrics": [ + { + "perf_metric_name": "time_to_broadcast_1073741824_bytes_to_50_nodes", + "perf_metric_type": "LATENCY", + "perf_metric_value": 89.42807069100002 + } + ], + "success": "1" } diff --git a/release/release_logs/2.4.0/scalability/single_node.json b/release/release_logs/2.4.0/scalability/single_node.json index 7a9f43ff8c59..2183a69fd2ae 100644 --- a/release/release_logs/2.4.0/scalability/single_node.json +++ b/release/release_logs/2.4.0/scalability/single_node.json @@ -1,40 +1,40 @@ { - "args_time": 16.78179498900002, - "get_time": 24.349769197, - "large_object_size": 107374182400, - "large_object_time": 34.58217635799997, - "num_args": 10000, - "num_get_args": 10000, - "num_queued": 1000000, - "num_returns": 3000, - "perf_metrics": [ - { - "perf_metric_name": "10000_args_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 16.78179498900002 - }, - { - "perf_metric_name": "3000_returns_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 6.026168543000011 - }, - { - "perf_metric_name": "10000_get_time", - 
"perf_metric_type": "LATENCY", - "perf_metric_value": 24.349769197 - }, - { - "perf_metric_name": "1000000_queued_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 239.18845452300002 - }, - { - "perf_metric_name": "107374182400_large_object_time", - "perf_metric_type": "LATENCY", - "perf_metric_value": 34.58217635799997 - } - ], - "queued_time": 239.18845452300002, - "returns_time": 6.026168543000011, - "success": "1" + "args_time": 17.054044035999993, + "get_time": 24.36676771400002, + "large_object_size": 107374182400, + "large_object_time": 34.06999374499998, + "num_args": 10000, + "num_get_args": 10000, + "num_queued": 1000000, + "num_returns": 3000, + "perf_metrics": [ + { + "perf_metric_name": "10000_args_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 17.054044035999993 + }, + { + "perf_metric_name": "3000_returns_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 6.002825282000003 + }, + { + "perf_metric_name": "10000_get_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 24.36676771400002 + }, + { + "perf_metric_name": "1000000_queued_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 175.74473816900002 + }, + { + "perf_metric_name": "107374182400_large_object_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 34.06999374499998 + } + ], + "queued_time": 175.74473816900002, + "returns_time": 6.002825282000003, + "success": "1" } diff --git a/release/release_logs/2.4.0/stress_tests/stress_test_dead_actors.json b/release/release_logs/2.4.0/stress_tests/stress_test_dead_actors.json index 4674b3febdbd..8e7263ecac1e 100644 --- a/release/release_logs/2.4.0/stress_tests/stress_test_dead_actors.json +++ b/release/release_logs/2.4.0/stress_tests/stress_test_dead_actors.json @@ -1,14 +1,14 @@ { - "avg_iteration_time": 2.4128899502754213, - "max_iteration_time": 11.154391050338745, - "min_iteration_time": 0.2948293685913086, + "avg_iteration_time": 2.0598055481910706, + "max_iteration_time": 
15.883565187454224, + "min_iteration_time": 0.14446020126342773, "perf_metrics": [ { "perf_metric_name": "avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 2.4128899502754213 + "perf_metric_value": 2.0598055481910706 } ], "success": 1, - "total_time": 241.28934574127197 + "total_time": 205.98083114624023 } diff --git a/release/release_logs/2.4.0/stress_tests/stress_test_many_tasks.json b/release/release_logs/2.4.0/stress_tests/stress_test_many_tasks.json index cfe605e556d4..f863c0b276fc 100644 --- a/release/release_logs/2.4.0/stress_tests/stress_test_many_tasks.json +++ b/release/release_logs/2.4.0/stress_tests/stress_test_many_tasks.json @@ -3,45 +3,45 @@ { "perf_metric_name": "stage_0_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 14.71593689918518 + "perf_metric_value": 12.432663917541504 }, { "perf_metric_name": "stage_1_avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 23.45582284927368 + "perf_metric_value": 22.303217387199403 }, { "perf_metric_name": "stage_2_avg_iteration_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 59.12007422447205 + "perf_metric_value": 58.13721342086792 }, { "perf_metric_name": "stage_3_creation_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 5.639009952545166 + "perf_metric_value": 4.9506330490112305 }, { "perf_metric_name": "stage_3_time", "perf_metric_type": "LATENCY", - "perf_metric_value": 2746.616822242737 + "perf_metric_value": 2541.1457979679108 }, { "perf_metric_name": "stage_4_spread", "perf_metric_type": "LATENCY", - "perf_metric_value": 0.8324665945841853 + "perf_metric_value": 0.653147785580004 } ], - "stage_0_time": 14.71593689918518, - "stage_1_avg_iteration_time": 23.45582284927368, - "stage_1_max_iteration_time": 24.144246339797974, - "stage_1_min_iteration_time": 22.638681411743164, - "stage_1_time": 234.5583221912384, - "stage_2_avg_iteration_time": 59.12007422447205, - "stage_2_max_iteration_time": 60.01493453979492, - 
"stage_2_min_iteration_time": 57.3223192691803, - "stage_2_time": 295.60199069976807, - "stage_3_creation_time": 5.639009952545166, - "stage_3_time": 2746.616822242737, - "stage_4_spread": 0.8324665945841853, + "stage_0_time": 12.432663917541504, + "stage_1_avg_iteration_time": 22.303217387199403, + "stage_1_max_iteration_time": 23.565119743347168, + "stage_1_min_iteration_time": 21.196225881576538, + "stage_1_time": 223.03227972984314, + "stage_2_avg_iteration_time": 58.13721342086792, + "stage_2_max_iteration_time": 58.608288526535034, + "stage_2_min_iteration_time": 57.48827838897705, + "stage_2_time": 290.6870460510254, + "stage_3_creation_time": 4.9506330490112305, + "stage_3_time": 2541.1457979679108, + "stage_4_spread": 0.653147785580004, "success": 1 } diff --git a/release/release_logs/2.4.0/stress_tests/stress_test_placement_group.json b/release/release_logs/2.4.0/stress_tests/stress_test_placement_group.json index bc4bc4ae8d49..751120113766 100644 --- a/release/release_logs/2.4.0/stress_tests/stress_test_placement_group.json +++ b/release/release_logs/2.4.0/stress_tests/stress_test_placement_group.json @@ -1,16 +1,16 @@ { - "avg_pg_create_time_ms": 0.9266749219217597, - "avg_pg_remove_time_ms": 0.9992335435431319, + "avg_pg_create_time_ms": 0.8416926516519441, + "avg_pg_remove_time_ms": 0.8426233363353036, "perf_metrics": [ { "perf_metric_name": "avg_pg_create_time_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 0.9266749219217597 + "perf_metric_value": 0.8416926516519441 }, { "perf_metric_name": "avg_pg_remove_time_ms", "perf_metric_type": "LATENCY", - "perf_metric_value": 0.9992335435431319 + "perf_metric_value": 0.8426233363353036 } ], "success": 1 diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 27a5b9c137c6..c8984316c172 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -10,7 +10,10 @@ # working_dir: example_dir # # # How often to run the tests. 
-# # One of [manual, any, multi, nightly, weekly]. +# # One of [manual, any, multi, nightly, nightly-3x, weekly]. +# # Descriptions of each frequency (that's not immediately obvious): +# # - manual: Not run on a schedule, but can be manually run through the buildkite UI. +# # - nightly-3x: Run 3 times a week (Monday, Wednesday, Friday). # frequency: weekly # # Owning team. This field will be persisted to the database # team: ml @@ -19,10 +22,6 @@ # # on. This must be a string! # python: "3.7" # -# # Optional location of a bash setup script to run on the driver -# # when setting up the local environment. Relative to working_dir -# driver_setup: setup_driver.sh -# # # Cluster information # cluster: # # Location of cluster env, relative to working_dir @@ -40,15 +39,6 @@ # # # Run configuration for the test # run: -# # Type of test. Can be [anyscale_job, sdk_command]. -# # Uses either Ray jobs, anyscale jobs or anyscale SDK commands -# # run the actual release test. -# type: anyscale_job -# -# # File manager to use to transfer files to and from the cluster. -# # Can be any of [sdk, job]. 
-# file_manager: job -# # # If you want to wait for nodes to be ready, you can specify this here: # wait_for_nodes: # # Number of nodes @@ -134,13 +124,22 @@ cluster: cluster_env: frequent_pausing/app_config.yaml - cluster_compute: frequent_pausing/compute_config.yaml + cluster_compute: frequent_pausing/compute_config_aws.yaml run: timeout: 600 # 10min long_running: true script: python frequent_pausing/script.py + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: frequent_pausing/app_config.yaml + cluster_compute: frequent_pausing/compute_config_gce.yaml + alert: default @@ -190,7 +189,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: data_20_nodes.yaml + cluster_compute: compute_data_20_nodes_aws.yaml run: timeout: 3600 @@ -199,6 +198,14 @@ wait_for_nodes: num_nodes: 20 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_data_20_nodes_gce.yaml alert: default @@ -212,7 +219,7 @@ cluster: cluster_env: xgboost_app_config.yaml - cluster_compute: xgboost_compute_tpl.yaml + cluster_compute: compute_xgboost_aws.yaml run: timeout: 36000 @@ -221,6 +228,14 @@ wait_for_nodes: num_nodes: 11 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: xgboost_app_config.yaml + cluster_compute: compute_xgboost_gce.yaml smoke_test: frequency: manual @@ -240,7 +255,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_4.yaml + cluster_compute: compute_cpu_4_aws.yaml run: timeout: 3600 @@ -249,6 +264,15 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_4_gce.yaml + alert: default - name: air_benchmark_torch_mnist_gpu_4x4 @@ -260,7 +284,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: 
compute_gpu_4x4.yaml + cluster_compute: compute_gpu_4x4_aws.yaml run: timeout: 4800 @@ -273,7 +297,7 @@ frequency: nightly cluster: - cluster_compute: compute_gpu_2x2.yaml + cluster_compute: compute_gpu_2x2_aws.yaml run: timeout: 3600 @@ -282,6 +306,17 @@ wait_for_nodes: num_nodes: 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_gpu_4x4_gce.yaml + smoke_test: + frequency: manual + alert: default @@ -294,12 +329,20 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_1.yaml + cluster_compute: compute_cpu_1_aws.yaml run: timeout: 3600 script: python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_1_gce.yaml alert: default @@ -313,13 +356,12 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_1_g4_8xl.yaml + cluster_compute: compute_gpu_1_cpu_16_aws.yaml run: timeout: 3600 script: python workloads/gpu_batch_prediction.py --data-size-gb 20 - alert: default variations: @@ -329,7 +371,7 @@ frequency: manual cluster: cluster_env: app_config.yaml - cluster_compute: compute_gce_gpu_1_g4_8xl.yaml + cluster_compute: compute_gpu_1_cpu_16_gce.yaml - name: air_benchmark_torch_batch_prediction_gpu_4x4_100gb @@ -343,7 +385,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_4_g4_12xl.yaml + cluster_compute: compute_gpu_4x4_aws.yaml run: timeout: 10800 @@ -361,7 +403,7 @@ frequency: manual cluster: cluster_env: app_config.yaml - cluster_compute: compute_gce_gpu_4_g4_12xl.yaml + cluster_compute: compute_gpu_4x4_gce.yaml - name: air_benchmark_torch_mnist_cpu_4x4 group: AIR tests @@ -372,7 +414,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_4.yaml + cluster_compute: compute_cpu_4_aws.yaml 
run: timeout: 5400 @@ -381,10 +423,17 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_4_gce.yaml alert: default - - name: air_benchmark_tune_torch_mnist group: AIR tests working_dir: air_tests/air_benchmarks @@ -394,7 +443,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_8.yaml + cluster_compute: compute_cpu_8_aws.yaml run: timeout: 3600 @@ -403,6 +452,14 @@ wait_for_nodes: num_nodes: 8 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_8_gce.yaml alert: default @@ -415,7 +472,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_4_g4_12xl.yaml + cluster_compute: compute_gpu_4x4_aws.yaml run: timeout: 3600 @@ -424,6 +481,14 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_gpu_4x4_gce.yaml alert: default @@ -437,7 +502,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_4.yaml + cluster_compute: compute_cpu_4_aws.yaml run: timeout: 5400 @@ -446,6 +511,14 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_4_gce.yaml alert: default @@ -459,12 +532,20 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_1.yaml + cluster_compute: compute_cpu_1_aws.yaml run: timeout: 5400 script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_1_gce.yaml alert: default @@ 
-480,7 +561,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_cpu_4.yaml + cluster_compute: compute_cpu_4_aws.yaml run: timeout: 5400 @@ -489,6 +570,14 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_cpu_4_gce.yaml alert: default @@ -504,7 +593,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_4x4.yaml + cluster_compute: compute_gpu_4x4_aws.yaml run: timeout: 5400 @@ -518,7 +607,7 @@ frequency: nightly cluster: - cluster_compute: compute_gpu_2x2.yaml + cluster_compute: compute_gpu_2x2_aws.yaml run: script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu @@ -526,8 +615,18 @@ wait_for_nodes: num_nodes: 2 - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_gpu_4x4_gce.yaml + smoke_test: + frequency: manual + alert: default - name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb group: AIR tests @@ -538,13 +637,12 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_1.yaml + cluster_compute: compute_gpu_1_aws.yaml run: timeout: 3600 script: python workloads/pytorch_training_e2e.py --data-size-gb 20 - alert: default variations: @@ -554,7 +652,7 @@ frequency: manual cluster: cluster_env: app_config.yaml - cluster_compute: compute_gce_gpu_1.yaml + cluster_compute: compute_gpu_1_gce.yaml - name: air_benchmark_pytorch_training_e2e_gpu_4x4_100gb @@ -568,7 +666,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_gpu_16.yaml + cluster_compute: compute_gpu_4x4_aws.yaml run: timeout: 10800 @@ -586,7 +684,7 @@ frequency: manual cluster: cluster_env: app_config.yaml - cluster_compute: compute_gce_gpu_16.yaml + cluster_compute: compute_gpu_4x4_gce.yaml # Test 
tiny, medium, and huge input files. - name: ray-data-bulk-ingest-file-size-benchmark @@ -594,9 +692,10 @@ working_dir: air_tests/air_benchmarks/mlperf-train stable: false + jailed: true frequency: nightly - team: core + team: data cluster: cluster_env: app_config_oom.yaml cluster_compute: compute_cpu_16.yaml @@ -620,9 +719,10 @@ working_dir: air_tests/air_benchmarks/mlperf-train stable: false + jailed: true frequency: nightly - team: core + team: data cluster: cluster_env: app_config_oom.yaml cluster_compute: compute_cpu_16.yaml @@ -646,9 +746,10 @@ working_dir: air_tests/air_benchmarks/mlperf-train stable: false + jailed: true frequency: nightly - team: core + team: data cluster: cluster_env: app_config_oom.yaml cluster_compute: compute_cpu_16_worker_nodes_2.yaml @@ -686,13 +787,15 @@ team: ml cluster: cluster_env: dreambooth_env.yaml - cluster_compute: dreambooth_compute.yaml + cluster_compute: dreambooth_compute_aws.yaml run: timeout: 1800 script: bash dreambooth_run.sh artifact_path: /tmp/artifacts/example_out.jpg + # variations: A10G not available on GCE, yet. 
+ - name: air_example_gptj_deepspeed_fine_tuning group: AIR examples @@ -704,65 +807,132 @@ team: ml cluster: cluster_env: gptj_deepspeed_env.yaml - cluster_compute: gptj_deepspeed_compute.yaml + cluster_compute: gptj_deepspeed_compute_aws.yaml run: timeout: 3600 script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: gptj_deepspeed_env.yaml + cluster_compute: gptj_deepspeed_compute_gce.yaml + + +- name: air_example_dolly_v2_lightning_fsdp_finetuning + group: AIR examples + working_dir: air_examples/dolly_v2_lightning_fsdp_finetuning + + python: "3.8" + + frequency: weekly + team: ml + cluster: + cluster_env: dolly_v2_fsdp_env.yaml + cluster_compute: dolly_v2_fsdp_compute_aws.yaml + + run: + timeout: 4700 + script: python test_myst_doc.py --path lightning-llm-finetuning-7b.ipynb + + +- name: air_example_opt_deepspeed_batch_inference + group: AIR examples + working_dir: air_examples/opt_deepspeed_batch_inference + + python: "3.9" + + frequency: weekly + team: ml + cluster: + cluster_env: 30b_deepspeed_env.yaml + cluster_compute: 30b_deepspeed_compute.yaml + + run: + timeout: 3600 + script: python test_myst_doc.py --path opt_deepspeed_batch_inference.ipynb + + # variations: TODO(jungong): add GCP variation. 
+ ##################################### # Workspace templates release tests # ##################################### -- name: workspace_template_small_01_batch_inference +- name: workspace_template_batch_inference group: Workspace templates - working_dir: workspace_templates/01_batch_inference + working_dir: workspace_templates/tests/01_batch_inference python: "3.9" - frequency: nightly + frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/aws_small.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml run: timeout: 600 - script: jupyter nbconvert --TagRemovePreprocessor.remove_input_tags='large' - --to script --output _test batch_inference.ipynb && ipython _test.py + script: jupyter nbconvert --to script --output _test batch_inference.ipynb && ipython _test.py + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml -- name: workspace_template_small_02_many_model_training +- name: workspace_template_many_model_training group: Workspace templates - working_dir: workspace_templates/02_many_model_training + working_dir: workspace_templates/tests/02_many_model_training python: "3.9" - frequency: nightly + frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/cpu/aws_small.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/cpu/aws_release_test.yaml run: - timeout: 300 - script: pip install -U -r requirements.txt - && jupyter nbconvert --TagRemovePreprocessor.remove_input_tags='large' - --to script --output _test many_model_training.ipynb && ipython _test.py + timeout: 600 + script: pip install -U -r 
requirements.txt && jupyter nbconvert --to script --output _test many_model_training.ipynb && ipython _test.py + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/cpu/gce_release_test.yaml -- name: workspace_template_small_03_serving_stable_diffusion +- name: workspace_template_serving_stable_diffusion group: Workspace templates - working_dir: workspace_templates/03_serving_stable_diffusion + working_dir: workspace_templates/tests/03_serving_stable_diffusion python: "3.9" - frequency: nightly + frequency: nightly-3x team: ml cluster: - cluster_env: ../configs/release_test_cluster_env.yaml - cluster_compute: ../configs/compute/gpu/aws_small.yaml + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/aws_release_test.yaml run: - timeout: 900 - script: pip install -U -r requirements.txt - && jupyter nbconvert --TagRemovePreprocessor.remove_input_tags='large' - --to script --output _test serving_stable_diffusion.ipynb && ipython _test.py + timeout: 600 + script: pip install -U -r requirements.txt && jupyter nbconvert --to script --output _test serving_stable_diffusion.ipynb && ipython _test.py + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ../../configs/release_test_cluster_env.yaml + cluster_compute: ../../configs/compute/gpu/gce_release_test.yaml ####################### @@ -790,8 +960,6 @@ # wait_for_nodes: # num_nodes: 4 -# type: anyscale_job - # alert: xgboost_tests - name: xgboost_train_moderate @@ -1053,8 +1221,6 @@ # wait_for_nodes: # num_nodes: 4 -# type: anyscale_job - # alert: default - name: lightgbm_train_moderate @@ -1243,7 +1409,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_tpl.yaml + cluster_compute: compute_tpl_aws.yaml run: timeout: 1200 @@ -1251,6 +1417,15 @@ 
wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_tpl_gce.yaml + alert: default - name: lightning_gpu_tune_3x16_3x1 @@ -1262,7 +1437,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_tpl.yaml + cluster_compute: compute_tpl_aws.yaml run: timeout: 1200 @@ -1270,6 +1445,15 @@ wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_tpl_gce.yaml + alert: default ####################### @@ -1284,15 +1468,23 @@ cluster: cluster_env: horovod/app_config.yaml - cluster_compute: horovod/compute_tpl.yaml + cluster_compute: horovod/compute_tpl_aws.yaml - driver_setup: horovod/driver_setup_latest.sh run: timeout: 1200 script: python horovod/horovod_user_test.py wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: horovod/app_config.yaml + cluster_compute: horovod/compute_tpl_gce.yaml + alert: default - name: ml_user_horovod_user_test_master @@ -1304,15 +1496,23 @@ cluster: cluster_env: horovod/app_config_master.yaml - cluster_compute: horovod/compute_tpl.yaml + cluster_compute: horovod/compute_tpl_aws.yaml - driver_setup: horovod/driver_setup_master.sh run: timeout: 1200 script: python horovod/horovod_user_test.py wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: horovod/app_config_master.yaml + cluster_compute: horovod/compute_tpl_gce.yaml + alert: default - name: ml_user_train_tensorflow_mnist_test @@ -1324,15 +1524,23 @@ cluster: cluster_env: train/app_config.yaml - cluster_compute: train/compute_tpl.yaml + cluster_compute: train/compute_tpl_aws.yaml - driver_setup: train/driver_setup.sh run: timeout: 36000 script: python 
train/train_tensorflow_mnist_test.py wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: train/app_config.yaml + cluster_compute: train/compute_tpl_gce.yaml + alert: default - name: ml_user_train_torch_linear_test @@ -1344,15 +1552,23 @@ cluster: cluster_env: train/app_config.yaml - cluster_compute: train/compute_tpl.yaml + cluster_compute: train/compute_tpl_aws.yaml - driver_setup: train/driver_setup.sh run: timeout: 36000 script: python train/train_torch_linear_test.py wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: train/app_config.yaml + cluster_compute: train/compute_tpl_gce.yaml + alert: default - name: ml_user_xgboost_gpu_connect_latest @@ -1364,7 +1580,7 @@ cluster: cluster_env: xgboost/app_config_gpu.yaml - cluster_compute: xgboost/tpl_gpu_small_scaling.yaml + cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml run: timeout: 1200 @@ -1372,6 +1588,15 @@ wait_for_nodes: num_nodes: 5 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: xgboost/app_config_gpu.yaml + cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml + alert: default - name: ml_user_xgboost_gpu_connect_master @@ -1383,7 +1608,7 @@ cluster: cluster_env: xgboost/app_config_gpu_master.yaml - cluster_compute: xgboost/tpl_gpu_small_scaling.yaml + cluster_compute: xgboost/tpl_gpu_small_scaling_aws.yaml run: timeout: 1200 @@ -1391,6 +1616,15 @@ wait_for_nodes: num_nodes: 5 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: xgboost/app_config_gpu_master.yaml + cluster_compute: xgboost/tpl_gpu_small_scaling_gce.yaml + alert: default - name: ml_user_ray_lightning_user_test_latest @@ -1402,15 +1636,23 @@ cluster: cluster_env: ray-lightning/app_config.yaml - cluster_compute: ray-lightning/compute_tpl.yaml 
+ cluster_compute: ray-lightning/compute_tpl_aws.yaml - driver_setup: ray-lightning/driver_setup.sh run: timeout: 1200 script: python ray-lightning/ray_lightning_user_test.py wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ray-lightning/app_config.yaml + cluster_compute: ray-lightning/compute_tpl_gce.yaml + alert: default - name: ml_user_ray_lightning_user_test_master @@ -1422,15 +1664,23 @@ cluster: cluster_env: ray-lightning/app_config_master.yaml - cluster_compute: ray-lightning/compute_tpl.yaml + cluster_compute: ray-lightning/compute_tpl_aws.yaml - driver_setup: ray-lightning/driver_setup.sh run: timeout: 1200 script: python ray-lightning/ray_lightning_user_test.py wait_for_nodes: num_nodes: 3 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ray-lightning/app_config_master.yaml + cluster_compute: ray-lightning/compute_tpl_gce.yaml + alert: default - name: ml_user_tune_rllib_connect_test @@ -1442,15 +1692,23 @@ cluster: cluster_env: ../rllib_tests/app_config.yaml - cluster_compute: tune_rllib/compute_tpl.yaml + cluster_compute: tune_rllib/compute_tpl_aws.yaml - driver_setup: tune_rllib/driver_setup.sh run: timeout: 2000 script: python tune_rllib/run_connect_tests.py wait_for_nodes: num_nodes: 9 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: ../rllib_tests/app_config.yaml + cluster_compute: tune_rllib/compute_tpl_gce.yaml + alert: default ####################### @@ -1479,7 +1737,6 @@ env: gce frequency: manual cluster: - cluster_env: app_config.yaml cluster_compute: tpl_gce_4x8.yaml alert: tune_tests @@ -1544,7 +1801,7 @@ alert: tune_tests -- name: tune_cloud_aws_durable_upload_rllib_str +- name: tune_cloud_durable_upload_rllib_str group: Tune cloud tests working_dir: tune_tests/cloud_tests @@ -1566,10 +1823,25 @@ wait_for_nodes: num_nodes: 4 + 
variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config_ml.yaml + cluster_compute: tpl_gce_4x2.yaml + run: + timeout: 600 + script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str + --bucket gs://tune-cloud-tests/durable_upload_rllib_str + wait_for_nodes: + num_nodes: 4 + alert: tune_tests -- name: tune_cloud_aws_durable_upload_rllib_trainer +- name: tune_cloud_durable_upload_rllib_trainer group: Tune cloud tests working_dir: tune_tests/cloud_tests @@ -1590,6 +1862,20 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config_ml.yaml + cluster_compute: tpl_gce_4x2.yaml + run: + timeout: 600 + script: python workloads/run_cloud_test.py durable_upload --trainable rllib_str + --bucket gs://tune-cloud-tests/durable_upload_rllib_trainer + wait_for_nodes: + num_nodes: 4 alert: tune_tests @@ -1636,10 +1922,57 @@ run: timeout: 900 - script: python workloads/test_durable_trainable.py --bucket tune-cloud-tests + script: python workloads/test_durable_trainable.py --bucket s3://tune-cloud-tests/scalability_durable_trainable + wait_for_nodes: + num_nodes: 16 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + run: + timeout: 900 + script: python workloads/test_durable_trainable.py --bucket gs://tune-cloud-tests/scalability_durable_trainable + wait_for_nodes: + num_nodes: 16 + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_16x2.yaml + + alert: tune_tests + + +- name: tune_scalability_durable_multifile_checkpoints + group: Tune scalability tests + working_dir: tune_tests/scalability_tests + + frequency: nightly + team: ml + + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_16x2.yaml + + run: + timeout: 900 + script: python workloads/test_durable_multifile_checkpoints.py --bucket 
s3://tune-cloud-tests/scalability_durable_multifile_checkpoints wait_for_nodes: num_nodes: 16 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + run: + timeout: 900 + script: python workloads/test_durable_multifile_checkpoints.py --bucket gs://tune-cloud-tests/scalability_durable_multifile_checkpoints + wait_for_nodes: + num_nodes: 16 + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_16x2.yaml alert: tune_tests @@ -1775,7 +2108,6 @@ run: timeout: 600 script: python workloads/test_result_throughput_single_node.py - type: anyscale_job - name: tune_scalability_xgboost_sweep group: Tune scalability tests @@ -1826,11 +2158,27 @@ run: timeout: 5400 - script: python workloads/test_tune_worker_fault_tolerance.py + script: python workloads/test_tune_worker_fault_tolerance.py --bucket s3://tune-cloud-tests/worker_fault_tolerance wait_for_nodes: num_nodes: 16 +# Disabled until we can kill nodes in GCE +# variations: +# - __suffix__: aws +# - __suffix__: gce +# env: gce +# frequency: manual +# run: +# timeout: 5400 +# script: python workloads/test_tune_worker_fault_tolerance.py --bucket gs://tune-cloud-tests/worker_fault_tolerance +# +# wait_for_nodes: +# num_nodes: 16 +# cluster: +# cluster_env: app_config.yaml +# cluster_compute: tpl_gce_16x1.yaml + ######################## # Golden Notebook tests ######################## @@ -1843,7 +2191,7 @@ cluster: cluster_env: torch_tune_serve_app_config.yaml - cluster_compute: gpu_tpl.yaml + cluster_compute: gpu_tpl_aws.yaml run: timeout: 600 @@ -1851,6 +2199,15 @@ wait_for_nodes: num_nodes: 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: torch_tune_serve_app_config.yaml + cluster_compute: gpu_tpl_gce.yaml + alert: default @@ -2582,8 +2939,6 @@ # wait_for_nodes: # num_nodes: 1 -# type: anyscale_job - # alert: default @@ -2954,7 +3309,7 @@ cluster: cluster_env: app_config.yaml - cluster_compute: compute_tpl.yaml + 
cluster_compute: compute_tpl_aws.yaml run: timeout: 3000 @@ -2963,6 +3318,14 @@ wait_for_nodes: num_nodes: 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: compute_tpl_gce.yaml alert: default @@ -2979,15 +3342,30 @@ cluster: cluster_env: app_config.yaml - cluster_compute: 2_g4dn_12xlarge.yaml + cluster_compute: gpu_2x4_t4_aws.yaml run: timeout: 3600 - script: bash run_train_opt_2_7b.sh + script: bash run_train_opt_2_7b.sh --storage aws wait_for_nodes: num_nodes: 2 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: gpu_2x4_t4_gce.yaml + run: + timeout: 3600 + script: bash run_train_opt_2_7b.sh --storage gcs + + wait_for_nodes: + num_nodes: 2 + alert: default - name: alpa_opt_30b_inference @@ -2999,15 +3377,30 @@ cluster: cluster_env: app_config.yaml - cluster_compute: 1_p3_16xlarge.yaml + cluster_compute: gpu_1x8_v100_aws.yaml run: timeout: 3600 - script: bash run_inference_opt_30b.sh + script: bash run_inference_opt_30b.sh --storage aws wait_for_nodes: num_nodes: 1 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: gpu_1x8_v100_gce.yaml + run: + timeout: 3600 + script: bash run_inference_opt_30b.sh --storage gcs + + wait_for_nodes: + num_nodes: 1 + alert: default ######################## @@ -3024,16 +3417,25 @@ cluster: cluster_env: app_config.yaml cluster_compute: multi_node_checkpointing_compute_config.yaml - + run: timeout: 3600 script: pytest checkpointing_tests/test_learner_group_checkpointing.py wait_for_nodes: num_nodes: 3 - + alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: multi_node_checkpointing_compute_config_gce.yaml + - name: rllib_learning_tests_a2c_tf 
group: RLlib tests @@ -3050,9 +3452,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=a2c --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_a2c_torch group: RLlib tests working_dir: rllib_tests @@ -3070,6 +3480,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_a3c_tf group: RLlib tests working_dir: rllib_tests @@ -3085,29 +3504,16 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=a3c --framework=tf - alert: default -# TODO(sven, jungong, Kourosh): fix A3C on torch and tf2 and re-enable. -#- name: rllib_learning_tests_a3c_torch -# group: RLlib tests -# working_dir: rllib_tests - -# frequency: nightly -# team: rllib -# env: staging_v2 - -# cluster: -# cluster_env: app_config.yaml -# cluster_compute: 32cpus.yaml - -# run: -# timeout: 18000 -# script: python learning_tests/run.py --yaml-sub-dir=a3c --framework=torch -# type: anyscale_job -# file_manager: job - -# alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 32cpus_gce.yaml - name: rllib_learning_tests_apex_tf group: RLlib tests @@ -3127,9 +3533,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=apex --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_24cpus_gce.yaml + - name: rllib_learning_tests_apex_torch group: RLlib tests working_dir: rllib_tests @@ -3147,6 +3561,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual 
+ cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_24cpus_gce.yaml + - name: rllib_learning_tests_appo_tf group: RLlib tests working_dir: rllib_tests @@ -3193,6 +3616,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 2gpus_32cpus_gce.yaml + - name: rllib_learning_tests_bc_tf group: RLlib tests working_dir: rllib_tests @@ -3210,6 +3642,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_bc_torch group: RLlib tests working_dir: rllib_tests @@ -3227,6 +3668,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_cql_tf group: RLlib tests working_dir: rllib_tests @@ -3247,6 +3697,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_cql_torch group: RLlib tests working_dir: rllib_tests @@ -3267,6 +3726,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_ddpg_tf group: RLlib tests working_dir: rllib_tests @@ -3282,9 +3750,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_ddpg_torch group: RLlib tests working_dir: rllib_tests @@ -3300,9 
+3776,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=ddpg --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_dqn_tf group: RLlib tests working_dir: rllib_tests @@ -3318,9 +3802,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_dqn_torch group: RLlib tests working_dir: rllib_tests @@ -3339,9 +3831,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=dqn --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_es_tf group: RLlib tests working_dir: rllib_tests @@ -3357,9 +3857,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=es --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 2gpus_64cpus_gce.yaml + - name: rllib_learning_tests_es_torch group: RLlib tests working_dir: rllib_tests @@ -3375,9 +3883,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=es --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 2gpus_64cpus_gce.yaml + - name: rllib_learning_tests_impala_tf group: RLlib tests working_dir: rllib_tests @@ -3393,9 +3909,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=impala --framework=tf - 
alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_impala_torch group: RLlib tests working_dir: rllib_tests @@ -3411,9 +3935,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=impala --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_marwil_tf group: RLlib tests working_dir: rllib_tests @@ -3432,9 +3964,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_marwil_torch group: RLlib tests working_dir: rllib_tests @@ -3453,9 +3993,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=marwil --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_ppo_tf group: RLlib tests working_dir: rllib_tests @@ -3471,9 +4019,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=ppo/tf --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 2gpus_32cpus_gce.yaml + - name: rllib_learning_tests_ppo_torch group: RLlib tests working_dir: rllib_tests @@ -3518,9 +4074,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=sac --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + 
frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_sac_torch group: RLlib tests working_dir: rllib_tests @@ -3538,6 +4102,15 @@ alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_slateq_tf group: RLlib tests working_dir: rllib_tests @@ -3553,9 +4126,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_slateq_torch group: RLlib tests working_dir: rllib_tests @@ -3574,9 +4155,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=slateq --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_td3_tf group: RLlib tests working_dir: rllib_tests @@ -3592,9 +4181,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=tf - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_learning_tests_td3_torch group: RLlib tests working_dir: rllib_tests @@ -3610,9 +4207,17 @@ timeout: 18000 script: python learning_tests/run.py --yaml-sub-dir=td3 --framework=torch - alert: default + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 1gpu_16cpus_gce.yaml + - name: rllib_multi_gpu_learning_tests group: RLlib tests working_dir: rllib_tests 
@@ -3654,7 +4259,6 @@ timeout: 7200 script: python multi_gpu_with_lstm_learning_tests/run.py - alert: default variations: @@ -3681,7 +4285,6 @@ timeout: 7200 script: python multi_gpu_with_attention_learning_tests/run.py - alert: default variations: @@ -3690,7 +4293,10 @@ env: gce frequency: manual cluster: - cluster_env: app_config.yaml + # TODO(https://github.com/ray-project/ray/issues/34591) + # Revert to the comment below once ^ closed. + # cluster_env: app_config.yaml + cluster_env: debug_app_config.yaml cluster_compute: 8gpus_96cpus_gce.yaml - name: rllib_stress_tests @@ -4010,6 +4616,8 @@ cluster: cluster_env: stress_tests/state_api_app_config.yaml cluster_compute: stress_tests/stress_tests_compute_large_gce.yaml + smoke_test: + frequency: manual - name: shuffle_20gb_with_state_api @@ -4073,6 +4681,8 @@ cluster: cluster_env: stress_tests/stress_tests_app_config.yaml cluster_compute: stress_tests/stress_tests_compute_gce.yaml + smoke_test: + frequency: manual - name: stress_test_dead_actors group: core-daily-test @@ -4112,6 +4722,8 @@ cluster: cluster_env: stress_tests/stress_tests_app_config.yaml cluster_compute: stress_tests/stress_tests_compute_gce.yaml + smoke_test: + frequency: manual # The full test is not stable, so run the smoke test only. # See https://github.com/ray-project/ray/issues/23244. @@ -4203,7 +4815,9 @@ - name: tune_air_oom group: core-daily-test working_dir: air_tests + stable: false + jailed: true frequency: nightly team: core @@ -4278,8 +4892,6 @@ # wait_for_nodes: # num_nodes: 251 # -# type: anyscale_job -# file_manager: sdk - name: pg_autoscaling_regression_test group: core-daily-test @@ -4629,17 +5241,17 @@ run: timeout: 1800 - script: python dataset_shuffle_data_loader.py - - # TODO: Port s3://shuffling-data-loader-benchmarks/ to GCS. 
- # variations: - # - __suffix__: aws - # - __suffix__: gce - # env: gce - # frequency: manual - # cluster: - # cluster_env: shuffle_app_config.yaml - # cluster_compute: shuffle_compute_gce.yaml + script: python dataset_shuffle_data_loader.py --cloud aws + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: shuffle_compute_gce.yaml + run: + script: python dataset_shuffle_data_loader.py --cloud gcp - name: parquet_metadata_resolution group: data-tests @@ -4654,17 +5266,17 @@ run: # Expect the test to finish around 40 seconds. timeout: 100 - script: python parquet_metadata_resolution.py --num-files 915 - - # TODO: Port s3://shuffling-data-loader-benchmarks/ to GCS. - # variations: - # - __suffix__: aws - # - __suffix__: gce - # env: gce - # frequency: manual - # cluster: - # cluster_env: app_config.yaml - # cluster_compute: single_node_benchmark_compute_gce.yaml + script: python parquet_metadata_resolution.py --num-files 915 --cloud aws + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: single_node_benchmark_compute_gce.yaml + run: + script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp - name: dataset_random_access group: data-tests @@ -4742,6 +5354,61 @@ cluster_env: app_config.yaml cluster_compute: data_ingest_benchmark_compute_gce.yaml +- name: streaming_data_ingest_benchmark_100gb_gpu + group: data-tests + working_dir: nightly_tests/dataset + + frequency: nightly + team: data + cluster: + cluster_env: app_config.yaml + cluster_compute: data_ingest_benchmark_compute_gpu.yaml + + run: + timeout: 300 + script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --new_streaming --use-gpu + wait_for_nodes: + num_nodes: 3 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: 
data_ingest_benchmark_compute_gpu_gce.yaml + +# This test case will early stop the data ingestion iteration on the GPU actors. +# This is a common usage in PyTorch Lightning +# (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches). +# There was a bug in Ray Data that caused GPU memoy leak (see #34819). +# We add this test case to cover this scenario. +- name: streaming_data_ingest_benchmark_100gb_gpu_early_stop + group: data-tests + working_dir: nightly_tests/dataset + + frequency: nightly + team: data + cluster: + cluster_env: app_config.yaml + cluster_compute: data_ingest_benchmark_compute_gpu.yaml + + run: + timeout: 300 + script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --new_streaming --use-gpu --early-stop + wait_for_nodes: + num_nodes: 3 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_env: app_config.yaml + cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml + - name: aggregate_benchmark group: data-tests working_dir: nightly_tests/dataset @@ -4906,7 +5573,7 @@ frequency: manual cluster: cluster_env: app_config.yaml - cluster_compute: single_node_benchmark_compute_gce.yaml + cluster_compute: multi_node_benchmark_compute_gce.yaml - name: iter_batches_benchmark_single_node group: data-tests @@ -5013,6 +5680,8 @@ group: data-tests working_dir: nightly_tests + stable: false + frequency: nightly team: data cluster: @@ -5155,7 +5824,7 @@ cluster_compute: chaos_test/compute_template.yaml run: - timeout: 3600 + timeout: 4200 wait_for_nodes: num_nodes: 10 prepare: python setup_chaos.py --no-start @@ -5284,6 +5953,8 @@ group: data-tests working_dir: nightly_tests + stable: false + frequency: nightly team: data cluster: @@ -5397,8 +6068,6 @@ working_dir: k8s_tests stable: false - # TODO: Migrate this test to Anyscale Jobs / staging_v2 - env: prod_v1 frequency: nightly team: serve @@ -5410,64 +6079,85 @@ timeout: 28800 # 8h prepare: bash 
prepare.sh script: python run_gcs_ft_on_k8s.py - type: sdk_command - name: aws_cluster_launcher group: cluster-launcher-test - working_dir: ../python/ray/autoscaler/aws/ + working_dir: ../python/ray/autoscaler/ stable: true - # TODO: Migrate this test to Anyscale Jobs / staging_v2 - env: prod_v1 - frequency: nightly team: core cluster: - cluster_env: tests/aws_config.yaml - cluster_compute: tests/aws_compute.yaml + cluster_env: aws/tests/aws_config.yaml + cluster_compute: aws/tests/aws_compute.yaml run: timeout: 1200 - script: cd tests && python aws_launch_and_verify_cluster.py aws_cluster.yaml - type: sdk_command + script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml - name: aws_cluster_launcher_minimal group: cluster-launcher-test - working_dir: ../python/ray/autoscaler/aws/ + working_dir: ../python/ray/autoscaler/ stable: true - # TODO: Migrate this test to Anyscale Jobs / staging_v2 - env: prod_v1 - frequency: nightly team: core cluster: - cluster_env: tests/aws_config.yaml - cluster_compute: tests/aws_compute.yaml + cluster_env: aws/tests/aws_config.yaml + cluster_compute: aws/tests/aws_compute.yaml run: timeout: 1200 - script: cd tests && python aws_launch_and_verify_cluster.py ../example-minimal.yaml - type: sdk_command + script: python launch_and_verify_cluster.py aws/example-minimal.yaml - name: aws_cluster_launcher_full group: cluster-launcher-test - working_dir: ../python/ray/autoscaler/aws/ + working_dir: ../python/ray/autoscaler/ stable: true - # TODO: Migrate this test to Anyscale Jobs / staging_v2 - env: prod_v1 + frequency: nightly + team: core + cluster: + cluster_env: aws/tests/aws_config.yaml + cluster_compute: aws/tests/aws_compute.yaml + + run: + timeout: 1200 + script: python launch_and_verify_cluster.py aws/example-full.yaml + +- name: gcp_cluster_launcher_minimal + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + stable: true + env: gce frequency: nightly team: core cluster: - cluster_env: 
tests/aws_config.yaml - cluster_compute: tests/aws_compute.yaml + cluster_env: gcp/tests/gce_config.yaml + cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml run: timeout: 1200 - script: cd tests && python aws_launch_and_verify_cluster.py ../example-full.yaml - type: sdk_command \ No newline at end of file + script: python launch_and_verify_cluster.py gcp/example-minimal.yaml + +- name: gcp_cluster_launcher_full + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + stable: true + + env: gce + frequency: nightly + team: core + cluster: + cluster_env: gcp/tests/gce_config.yaml + cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml + + run: + timeout: 2400 + script: python launch_and_verify_cluster.py gcp/example-full.yaml \ No newline at end of file diff --git a/release/requirements_buildkite.in b/release/requirements_buildkite.in new file mode 100644 index 000000000000..02f90acb2c3b --- /dev/null +++ b/release/requirements_buildkite.in @@ -0,0 +1,16 @@ +# Requirements to run release tests from buildkite (client dependencies will be installed separately) +# Copy anyscale pin to requirements.txt and util.py +anyscale +bazel-runfiles +boto3 +click +freezegun +google-cloud-storage +jinja2 +protobuf >= 3.15.3, != 3.19.5 +pydantic < 1.10.0 +pytest +pyyaml +requests +retry + diff --git a/release/requirements_buildkite.txt b/release/requirements_buildkite.txt index 26555177a25d..5ed50766f501 100644 --- a/release/requirements_buildkite.txt +++ b/release/requirements_buildkite.txt @@ -1,12 +1,1196 @@ -# Requirements to run release tests from buildkite (client dependencies will be installed separately) -# Copy anyscale pin to requirements.txt and util.py -anyscale -click -boto3 -google-cloud-storage -jinja2 -protobuf >= 3.15.3, != 3.19.5 -pydantic < 1.10.0 -pyyaml -requests -retry +# +# This file is autogenerated by pip-compile with python 3.7 +# To update, run: +# +# bazel run //release:requirements_buildkite.update +# +aiohttp==3.8.4 \ + 
--hash=sha256:03543dcf98a6619254b409be2d22b51f21ec66272be4ebda7b04e6412e4b2e14 \ + --hash=sha256:03baa76b730e4e15a45f81dfe29a8d910314143414e528737f8589ec60cf7391 \ + --hash=sha256:0a63f03189a6fa7c900226e3ef5ba4d3bd047e18f445e69adbd65af433add5a2 \ + --hash=sha256:10c8cefcff98fd9168cdd86c4da8b84baaa90bf2da2269c6161984e6737bf23e \ + --hash=sha256:147ae376f14b55f4f3c2b118b95be50a369b89b38a971e80a17c3fd623f280c9 \ + --hash=sha256:176a64b24c0935869d5bbc4c96e82f89f643bcdf08ec947701b9dbb3c956b7dd \ + --hash=sha256:17b79c2963db82086229012cff93ea55196ed31f6493bb1ccd2c62f1724324e4 \ + --hash=sha256:1a45865451439eb320784918617ba54b7a377e3501fb70402ab84d38c2cd891b \ + --hash=sha256:1b3ea7edd2d24538959c1c1abf97c744d879d4e541d38305f9bd7d9b10c9ec41 \ + --hash=sha256:22f6eab15b6db242499a16de87939a342f5a950ad0abaf1532038e2ce7d31567 \ + --hash=sha256:3032dcb1c35bc330134a5b8a5d4f68c1a87252dfc6e1262c65a7e30e62298275 \ + --hash=sha256:33587f26dcee66efb2fff3c177547bd0449ab7edf1b73a7f5dea1e38609a0c54 \ + --hash=sha256:34ce9f93a4a68d1272d26030655dd1b58ff727b3ed2a33d80ec433561b03d67a \ + --hash=sha256:3a80464982d41b1fbfe3154e440ba4904b71c1a53e9cd584098cd41efdb188ef \ + --hash=sha256:3b90467ebc3d9fa5b0f9b6489dfb2c304a1db7b9946fa92aa76a831b9d587e99 \ + --hash=sha256:3d89efa095ca7d442a6d0cbc755f9e08190ba40069b235c9886a8763b03785da \ + --hash=sha256:3d8ef1a630519a26d6760bc695842579cb09e373c5f227a21b67dc3eb16cfea4 \ + --hash=sha256:3f43255086fe25e36fd5ed8f2ee47477408a73ef00e804cb2b5cba4bf2ac7f5e \ + --hash=sha256:40653609b3bf50611356e6b6554e3a331f6879fa7116f3959b20e3528783e699 \ + --hash=sha256:41a86a69bb63bb2fc3dc9ad5ea9f10f1c9c8e282b471931be0268ddd09430b04 \ + --hash=sha256:493f5bc2f8307286b7799c6d899d388bbaa7dfa6c4caf4f97ef7521b9cb13719 \ + --hash=sha256:4a6cadebe132e90cefa77e45f2d2f1a4b2ce5c6b1bfc1656c1ddafcfe4ba8131 \ + --hash=sha256:4c745b109057e7e5f1848c689ee4fb3a016c8d4d92da52b312f8a509f83aa05e \ + --hash=sha256:4d347a172f866cd1d93126d9b239fcbe682acb39b48ee0873c73c933dd23bd0f \ + 
--hash=sha256:4dac314662f4e2aa5009977b652d9b8db7121b46c38f2073bfeed9f4049732cd \ + --hash=sha256:4ddaae3f3d32fc2cb4c53fab020b69a05c8ab1f02e0e59665c6f7a0d3a5be54f \ + --hash=sha256:5393fb786a9e23e4799fec788e7e735de18052f83682ce2dfcabaf1c00c2c08e \ + --hash=sha256:59f029a5f6e2d679296db7bee982bb3d20c088e52a2977e3175faf31d6fb75d1 \ + --hash=sha256:5a7bdf9e57126dc345b683c3632e8ba317c31d2a41acd5800c10640387d193ed \ + --hash=sha256:5b3f2e06a512e94722886c0827bee9807c86a9f698fac6b3aee841fab49bbfb4 \ + --hash=sha256:5ce45967538fb747370308d3145aa68a074bdecb4f3a300869590f725ced69c1 \ + --hash=sha256:5e14f25765a578a0a634d5f0cd1e2c3f53964553a00347998dfdf96b8137f777 \ + --hash=sha256:618c901dd3aad4ace71dfa0f5e82e88b46ef57e3239fc7027773cb6d4ed53531 \ + --hash=sha256:652b1bff4f15f6287550b4670546a2947f2a4575b6c6dff7760eafb22eacbf0b \ + --hash=sha256:6c08e8ed6fa3d477e501ec9db169bfac8140e830aa372d77e4a43084d8dd91ab \ + --hash=sha256:6ddb2a2026c3f6a68c3998a6c47ab6795e4127315d2e35a09997da21865757f8 \ + --hash=sha256:6e601588f2b502c93c30cd5a45bfc665faaf37bbe835b7cfd461753068232074 \ + --hash=sha256:6e74dd54f7239fcffe07913ff8b964e28b712f09846e20de78676ce2a3dc0bfc \ + --hash=sha256:7235604476a76ef249bd64cb8274ed24ccf6995c4a8b51a237005ee7a57e8643 \ + --hash=sha256:7ab43061a0c81198d88f39aaf90dae9a7744620978f7ef3e3708339b8ed2ef01 \ + --hash=sha256:7c7837fe8037e96b6dd5cfcf47263c1620a9d332a87ec06a6ca4564e56bd0f36 \ + --hash=sha256:80575ba9377c5171407a06d0196b2310b679dc752d02a1fcaa2bc20b235dbf24 \ + --hash=sha256:80a37fe8f7c1e6ce8f2d9c411676e4bc633a8462844e38f46156d07a7d401654 \ + --hash=sha256:8189c56eb0ddbb95bfadb8f60ea1b22fcfa659396ea36f6adcc521213cd7b44d \ + --hash=sha256:854f422ac44af92bfe172d8e73229c270dc09b96535e8a548f99c84f82dde241 \ + --hash=sha256:880e15bb6dad90549b43f796b391cfffd7af373f4646784795e20d92606b7a51 \ + --hash=sha256:8b631e26df63e52f7cce0cce6507b7a7f1bc9b0c501fcde69742130b32e8782f \ + --hash=sha256:8c29c77cc57e40f84acef9bfb904373a4e89a4e8b74e71aa8075c021ec9078c2 \ + 
--hash=sha256:91f6d540163f90bbaef9387e65f18f73ffd7c79f5225ac3d3f61df7b0d01ad15 \ + --hash=sha256:92c0cea74a2a81c4c76b62ea1cac163ecb20fb3ba3a75c909b9fa71b4ad493cf \ + --hash=sha256:9bcb89336efa095ea21b30f9e686763f2be4478f1b0a616969551982c4ee4c3b \ + --hash=sha256:a1f4689c9a1462f3df0a1f7e797791cd6b124ddbee2b570d34e7f38ade0e2c71 \ + --hash=sha256:a3fec6a4cb5551721cdd70473eb009d90935b4063acc5f40905d40ecfea23e05 \ + --hash=sha256:a5d794d1ae64e7753e405ba58e08fcfa73e3fad93ef9b7e31112ef3c9a0efb52 \ + --hash=sha256:a86d42d7cba1cec432d47ab13b6637bee393a10f664c425ea7b305d1301ca1a3 \ + --hash=sha256:adfbc22e87365a6e564c804c58fc44ff7727deea782d175c33602737b7feadb6 \ + --hash=sha256:aeb29c84bb53a84b1a81c6c09d24cf33bb8432cc5c39979021cc0f98c1292a1a \ + --hash=sha256:aede4df4eeb926c8fa70de46c340a1bc2c6079e1c40ccf7b0eae1313ffd33519 \ + --hash=sha256:b744c33b6f14ca26b7544e8d8aadff6b765a80ad6164fb1a430bbadd593dfb1a \ + --hash=sha256:b7a00a9ed8d6e725b55ef98b1b35c88013245f35f68b1b12c5cd4100dddac333 \ + --hash=sha256:bb96fa6b56bb536c42d6a4a87dfca570ff8e52de2d63cabebfd6fb67049c34b6 \ + --hash=sha256:bbcf1a76cf6f6dacf2c7f4d2ebd411438c275faa1dc0c68e46eb84eebd05dd7d \ + --hash=sha256:bca5f24726e2919de94f047739d0a4fc01372801a3672708260546aa2601bf57 \ + --hash=sha256:bf2e1a9162c1e441bf805a1fd166e249d574ca04e03b34f97e2928769e91ab5c \ + --hash=sha256:c4eb3b82ca349cf6fadcdc7abcc8b3a50ab74a62e9113ab7a8ebc268aad35bb9 \ + --hash=sha256:c6cc15d58053c76eacac5fa9152d7d84b8d67b3fde92709195cb984cfb3475ea \ + --hash=sha256:c6cd05ea06daca6ad6a4ca3ba7fe7dc5b5de063ff4daec6170ec0f9979f6c332 \ + --hash=sha256:c844fd628851c0bc309f3c801b3a3d58ce430b2ce5b359cd918a5a76d0b20cb5 \ + --hash=sha256:c9cb1565a7ad52e096a6988e2ee0397f72fe056dadf75d17fa6b5aebaea05622 \ + --hash=sha256:cab9401de3ea52b4b4c6971db5fb5c999bd4260898af972bf23de1c6b5dd9d71 \ + --hash=sha256:cd468460eefef601ece4428d3cf4562459157c0f6523db89365202c31b6daebb \ + --hash=sha256:d1e6a862b76f34395a985b3cd39a0d949ca80a70b6ebdea37d3ab39ceea6698a \ + 
--hash=sha256:d1f9282c5f2b5e241034a009779e7b2a1aa045f667ff521e7948ea9b56e0c5ff \ + --hash=sha256:d265f09a75a79a788237d7f9054f929ced2e69eb0bb79de3798c468d8a90f945 \ + --hash=sha256:db3fc6120bce9f446d13b1b834ea5b15341ca9ff3f335e4a951a6ead31105480 \ + --hash=sha256:dbf3a08a06b3f433013c143ebd72c15cac33d2914b8ea4bea7ac2c23578815d6 \ + --hash=sha256:de04b491d0e5007ee1b63a309956eaed959a49f5bb4e84b26c8f5d49de140fa9 \ + --hash=sha256:e4b09863aae0dc965c3ef36500d891a3ff495a2ea9ae9171e4519963c12ceefd \ + --hash=sha256:e595432ac259af2d4630008bf638873d69346372d38255774c0e286951e8b79f \ + --hash=sha256:e75b89ac3bd27d2d043b234aa7b734c38ba1b0e43f07787130a0ecac1e12228a \ + --hash=sha256:ea9eb976ffdd79d0e893869cfe179a8f60f152d42cb64622fca418cd9b18dc2a \ + --hash=sha256:eafb3e874816ebe2a92f5e155f17260034c8c341dad1df25672fb710627c6949 \ + --hash=sha256:ee3c36df21b5714d49fc4580247947aa64bcbe2939d1b77b4c8dcb8f6c9faecc \ + --hash=sha256:f352b62b45dff37b55ddd7b9c0c8672c4dd2eb9c0f9c11d395075a84e2c40f75 \ + --hash=sha256:fabb87dd8850ef0f7fe2b366d44b77d7e6fa2ea87861ab3844da99291e81e60f \ + --hash=sha256:fe11310ae1e4cd560035598c3f29d86cef39a83d244c7466f95c27ae04850f10 \ + --hash=sha256:fe7ba4a51f33ab275515f66b0a236bcde4fb5561498fe8f898d4e549b2e4509f + # via anyscale +aiosignal==1.3.1 \ + --hash=sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc \ + --hash=sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17 + # via aiohttp +anyscale==0.5.106 \ + --hash=sha256:8e6a371f50ab35743521fa552b0ca2cfd75fb186023ed29049b3822ffa1fb7e2 + # via -r release/requirements_buildkite.in +argon2-cffi==21.3.0 \ + --hash=sha256:8c976986f2c5c0e5000919e6de187906cfd81fb1c72bf9d88c01177e77da7f80 \ + --hash=sha256:d384164d944190a7dd7ef22c6aa3ff197da12962bd04b17f64d4e93d934dba5b + # via anyscale +argon2-cffi-bindings==21.2.0 \ + --hash=sha256:20ef543a89dee4db46a1a6e206cd015360e5a75822f76df533845c3cbaf72670 \ + 
--hash=sha256:2c3e3cc67fdb7d82c4718f19b4e7a87123caf8a93fde7e23cf66ac0337d3cb3f \ + --hash=sha256:3b9ef65804859d335dc6b31582cad2c5166f0c3e7975f324d9ffaa34ee7e6583 \ + --hash=sha256:3e385d1c39c520c08b53d63300c3ecc28622f076f4c2b0e6d7e796e9f6502194 \ + --hash=sha256:58ed19212051f49a523abb1dbe954337dc82d947fb6e5a0da60f7c8471a8476c \ + --hash=sha256:5e00316dabdaea0b2dd82d141cc66889ced0cdcbfa599e8b471cf22c620c329a \ + --hash=sha256:603ca0aba86b1349b147cab91ae970c63118a0f30444d4bc80355937c950c082 \ + --hash=sha256:6a22ad9800121b71099d0fb0a65323810a15f2e292f2ba450810a7316e128ee5 \ + --hash=sha256:8cd69c07dd875537a824deec19f978e0f2078fdda07fd5c42ac29668dda5f40f \ + --hash=sha256:93f9bf70084f97245ba10ee36575f0c3f1e7d7724d67d8e5b08e61787c320ed7 \ + --hash=sha256:9524464572e12979364b7d600abf96181d3541da11e23ddf565a32e70bd4dc0d \ + --hash=sha256:b2ef1c30440dbbcba7a5dc3e319408b59676e2e039e2ae11a8775ecf482b192f \ + --hash=sha256:b746dba803a79238e925d9046a63aa26bf86ab2a2fe74ce6b009a1c3f5c8f2ae \ + --hash=sha256:bb89ceffa6c791807d1305ceb77dbfacc5aa499891d2c55661c6459651fc39e3 \ + --hash=sha256:bd46088725ef7f58b5a1ef7ca06647ebaf0eb4baff7d1d0d177c6cc8744abd86 \ + --hash=sha256:ccb949252cb2ab3a08c02024acb77cfb179492d5701c7cbdbfd776124d4d2367 \ + --hash=sha256:d4966ef5848d820776f5f562a7d45fdd70c2f330c961d0d745b784034bd9f48d \ + --hash=sha256:e415e3f62c8d124ee16018e491a009937f8cf7ebf5eb430ffc5de21b900dad93 \ + --hash=sha256:ed2937d286e2ad0cc79a7087d3c272832865f779430e0cc2b4f3718d3159b0cb \ + --hash=sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e \ + --hash=sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351 + # via argon2-cffi +async-timeout==4.0.2 \ + --hash=sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15 \ + --hash=sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c + # via aiohttp +asynctest==0.13.0 \ + --hash=sha256:5da6118a7e6d6b54d83a8f7197769d046922a44d2a99c21382f0a6e4fadae676 \ + 
--hash=sha256:c27862842d15d83e6a34eb0b2866c323880eb3a75e4485b079ea11748fd77fac + # via aiohttp +attrs==23.1.0 \ + --hash=sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04 \ + --hash=sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015 + # via + # aiohttp + # jsonschema +backports-zoneinfo==0.2.1 \ + --hash=sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf \ + --hash=sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328 \ + --hash=sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546 \ + --hash=sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6 \ + --hash=sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570 \ + --hash=sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9 \ + --hash=sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7 \ + --hash=sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987 \ + --hash=sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722 \ + --hash=sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582 \ + --hash=sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc \ + --hash=sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b \ + --hash=sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1 \ + --hash=sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08 \ + --hash=sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac \ + --hash=sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2 + # via + # pytz-deprecation-shim + # tzlocal +bazel-runfiles==0.21.0 \ + --hash=sha256:3e430dd9a5aba90a90bc2493fdcfce02a3ece47fb574db0f4ac898261e6b068d + # via -r release/requirements_buildkite.in +boto3==1.26.131 \ + --hash=sha256:061d3270472b9be09901bb08a45e9871ac8f86a9b1c9c615535ca0223acd7582 \ + 
--hash=sha256:5b2b13d9f3430e3d5e768bf32097d5d6d16f47a4719f2656de67da49dd3e4de1 + # via + # -r release/requirements_buildkite.in + # anyscale +botocore==1.29.131 \ + --hash=sha256:d0dea23bccdfd7c2f6d0cd3216cfbd7065bc3e9e7b1ef6fee0952b04f5d2cffd \ + --hash=sha256:ffbd85915b2624c545438a33c2624a809593720a10648f6e757fe50be4893188 + # via + # anyscale + # boto3 + # s3transfer +cachetools==5.3.0 \ + --hash=sha256:13dfddc7b8df938c21a940dfa6557ce6e94a2f1cdfa58eb90c805721d58f2c14 \ + --hash=sha256:429e1a1e845c008ea6c85aa35d4b98b65d6a9763eeef3e37e92728a12d1de9d4 + # via google-auth +certifi==2023.5.7 \ + --hash=sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7 \ + --hash=sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716 + # via + # anyscale + # requests +cffi==1.15.1 \ + --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \ + --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \ + --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \ + --hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \ + --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \ + --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \ + --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \ + --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \ + --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \ + --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \ + --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \ + --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \ + --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \ + --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \ + 
--hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \ + --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \ + --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \ + --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \ + --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \ + --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \ + --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \ + --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \ + --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \ + --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \ + --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \ + --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \ + --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \ + --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \ + --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \ + --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \ + --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \ + --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \ + --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \ + --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \ + --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \ + --hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \ + --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \ + --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \ + 
--hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \ + --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \ + --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \ + --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \ + --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \ + --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \ + --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \ + --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \ + --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \ + --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \ + --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \ + --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \ + --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \ + --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \ + --hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \ + --hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \ + --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9 \ + --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \ + --hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \ + --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \ + --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \ + --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \ + --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \ + --hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \ + 
--hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \ + --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 + # via argon2-cffi-bindings +charset-normalizer==3.1.0 \ + --hash=sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6 \ + --hash=sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1 \ + --hash=sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e \ + --hash=sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373 \ + --hash=sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62 \ + --hash=sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230 \ + --hash=sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be \ + --hash=sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c \ + --hash=sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0 \ + --hash=sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448 \ + --hash=sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f \ + --hash=sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649 \ + --hash=sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d \ + --hash=sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0 \ + --hash=sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706 \ + --hash=sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a \ + --hash=sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59 \ + --hash=sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23 \ + --hash=sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5 \ + --hash=sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb \ + --hash=sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e \ + 
--hash=sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e \ + --hash=sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c \ + --hash=sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28 \ + --hash=sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d \ + --hash=sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41 \ + --hash=sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974 \ + --hash=sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce \ + --hash=sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f \ + --hash=sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1 \ + --hash=sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d \ + --hash=sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8 \ + --hash=sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017 \ + --hash=sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31 \ + --hash=sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7 \ + --hash=sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8 \ + --hash=sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e \ + --hash=sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14 \ + --hash=sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd \ + --hash=sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d \ + --hash=sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795 \ + --hash=sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b \ + --hash=sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b \ + --hash=sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b \ + --hash=sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203 \ + 
--hash=sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f \ + --hash=sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19 \ + --hash=sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1 \ + --hash=sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a \ + --hash=sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac \ + --hash=sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9 \ + --hash=sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0 \ + --hash=sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137 \ + --hash=sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f \ + --hash=sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6 \ + --hash=sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5 \ + --hash=sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909 \ + --hash=sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f \ + --hash=sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0 \ + --hash=sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324 \ + --hash=sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755 \ + --hash=sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb \ + --hash=sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854 \ + --hash=sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c \ + --hash=sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60 \ + --hash=sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84 \ + --hash=sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0 \ + --hash=sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b \ + --hash=sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1 \ + 
--hash=sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531 \ + --hash=sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1 \ + --hash=sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11 \ + --hash=sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326 \ + --hash=sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df \ + --hash=sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab + # via + # aiohttp + # requests +click==8.1.3 \ + --hash=sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e \ + --hash=sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48 + # via + # -r release/requirements_buildkite.in + # anyscale +colorama==0.4.6 \ + --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ + --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 + # via + # anyscale + # halo + # log-symbols +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via retry +exceptiongroup==1.1.1 \ + --hash=sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e \ + --hash=sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785 + # via pytest +expiringdict==1.2.2 \ + --hash=sha256:09a5d20bc361163e6432a874edd3179676e935eb81b925eccef48d409a8a45e8 \ + --hash=sha256:300fb92a7e98f15b05cf9a856c1415b3bc4f2e132be07daa326da6414c23ee09 + # via anyscale +freezegun==1.2.2 \ + --hash=sha256:cd22d1ba06941384410cd967d8a99d5ae2442f57dfafeff2fda5de8dc5c05446 \ + --hash=sha256:ea1b963b993cb9ea195adbd893a48d573fda951b0da64f60883d7e988b606c9f + # via -r release/requirements_buildkite.in +frozenlist==1.3.3 \ + --hash=sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c \ + 
--hash=sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f \ + --hash=sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a \ + --hash=sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784 \ + --hash=sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27 \ + --hash=sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d \ + --hash=sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3 \ + --hash=sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678 \ + --hash=sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a \ + --hash=sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483 \ + --hash=sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8 \ + --hash=sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf \ + --hash=sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99 \ + --hash=sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c \ + --hash=sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48 \ + --hash=sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5 \ + --hash=sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56 \ + --hash=sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e \ + --hash=sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1 \ + --hash=sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401 \ + --hash=sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4 \ + --hash=sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e \ + --hash=sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649 \ + --hash=sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a \ + --hash=sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d \ + 
--hash=sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0 \ + --hash=sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6 \ + --hash=sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d \ + --hash=sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b \ + --hash=sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6 \ + --hash=sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf \ + --hash=sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef \ + --hash=sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7 \ + --hash=sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842 \ + --hash=sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba \ + --hash=sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420 \ + --hash=sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b \ + --hash=sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d \ + --hash=sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332 \ + --hash=sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936 \ + --hash=sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816 \ + --hash=sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91 \ + --hash=sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420 \ + --hash=sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448 \ + --hash=sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411 \ + --hash=sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4 \ + --hash=sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32 \ + --hash=sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b \ + --hash=sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0 \ + 
--hash=sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530 \ + --hash=sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669 \ + --hash=sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7 \ + --hash=sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1 \ + --hash=sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5 \ + --hash=sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce \ + --hash=sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4 \ + --hash=sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e \ + --hash=sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2 \ + --hash=sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d \ + --hash=sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9 \ + --hash=sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642 \ + --hash=sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0 \ + --hash=sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703 \ + --hash=sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb \ + --hash=sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1 \ + --hash=sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13 \ + --hash=sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab \ + --hash=sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38 \ + --hash=sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb \ + --hash=sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb \ + --hash=sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81 \ + --hash=sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8 \ + --hash=sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd \ + 
--hash=sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4 + # via + # aiohttp + # aiosignal +gitdb==4.0.10 \ + --hash=sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a \ + --hash=sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7 + # via gitpython +gitpython==3.1.31 \ + --hash=sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573 \ + --hash=sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d + # via anyscale +google-api-core==2.11.0 \ + --hash=sha256:4b9bb5d5a380a0befa0573b302651b8a9a89262c1730e37bf423cec511804c22 \ + --hash=sha256:ce222e27b0de0d7bc63eb043b956996d6dccab14cc3b690aaea91c9cc99dc16e + # via + # google-cloud-core + # google-cloud-storage +google-auth==2.17.3 \ + --hash=sha256:ce311e2bc58b130fddf316df57c9b3943c2a7b4f6ec31de9663a9333e4064efc \ + --hash=sha256:f586b274d3eb7bd932ea424b1c702a30e0393a2e2bc4ca3eae8263ffd8be229f + # via + # anyscale + # google-api-core + # google-cloud-core + # google-cloud-storage +google-cloud-core==2.3.2 \ + --hash=sha256:8417acf6466be2fa85123441696c4badda48db314c607cf1e5d543fa8bdc22fe \ + --hash=sha256:b9529ee7047fd8d4bf4a2182de619154240df17fbe60ead399078c1ae152af9a + # via google-cloud-storage +google-cloud-storage==2.9.0 \ + --hash=sha256:83a90447f23d5edd045e0037982c270302e3aeb45fc1288d2c2ca713d27bad94 \ + --hash=sha256:9b6ae7b509fc294bdacb84d0f3ea8e20e2c54a8b4bbe39c5707635fec214eff3 + # via -r release/requirements_buildkite.in +google-crc32c==1.5.0 \ + --hash=sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a \ + --hash=sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876 \ + --hash=sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c \ + --hash=sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289 \ + --hash=sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298 \ + 
--hash=sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02 \ + --hash=sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f \ + --hash=sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2 \ + --hash=sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a \ + --hash=sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb \ + --hash=sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210 \ + --hash=sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5 \ + --hash=sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee \ + --hash=sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c \ + --hash=sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a \ + --hash=sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314 \ + --hash=sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd \ + --hash=sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65 \ + --hash=sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37 \ + --hash=sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4 \ + --hash=sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13 \ + --hash=sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894 \ + --hash=sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31 \ + --hash=sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e \ + --hash=sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709 \ + --hash=sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740 \ + --hash=sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc \ + --hash=sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d \ + --hash=sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c \ + 
--hash=sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c \ + --hash=sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d \ + --hash=sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906 \ + --hash=sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61 \ + --hash=sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57 \ + --hash=sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c \ + --hash=sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a \ + --hash=sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438 \ + --hash=sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946 \ + --hash=sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7 \ + --hash=sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96 \ + --hash=sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091 \ + --hash=sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae \ + --hash=sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d \ + --hash=sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88 \ + --hash=sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2 \ + --hash=sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd \ + --hash=sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541 \ + --hash=sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728 \ + --hash=sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178 \ + --hash=sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968 \ + --hash=sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346 \ + --hash=sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8 \ + --hash=sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93 \ + 
--hash=sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7 \ + --hash=sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273 \ + --hash=sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462 \ + --hash=sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94 \ + --hash=sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd \ + --hash=sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e \ + --hash=sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57 \ + --hash=sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b \ + --hash=sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9 \ + --hash=sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a \ + --hash=sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100 \ + --hash=sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325 \ + --hash=sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183 \ + --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ + --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 + # via google-resumable-media +google-resumable-media==2.5.0 \ + --hash=sha256:218931e8e2b2a73a58eb354a288e03a0fd5fb1c4583261ac6e4c078666468c93 \ + --hash=sha256:da1bd943e2e114a56d85d6848497ebf9be6a14d3db23e9fc57581e7c3e8170ec + # via google-cloud-storage +googleapis-common-protos==1.59.0 \ + --hash=sha256:4168fcb568a826a52f23510412da405abd93f4d23ba544bb68d943b14ba3cb44 \ + --hash=sha256:b287dc48449d1d41af0c69f4ea26242b5ae4c3d7249a38b0984c86a4caffff1f + # via google-api-core +halo==0.0.31 \ + --hash=sha256:5350488fb7d2aa7c31a1344120cee67a872901ce8858f60da7946cef96c208ab \ + --hash=sha256:7b67a3521ee91d53b7152d4ee3452811e1d2a6321975137762eb3d70063cc9d6 + # via anyscale +httplib2==0.22.0 \ + 
--hash=sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc \ + --hash=sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81 + # via oauth2client +humanize==4.6.0 \ + --hash=sha256:401201aca462749773f02920139f302450cb548b70489b9b4b92be39fe3c3c50 \ + --hash=sha256:5f1f22bc65911eb1a6ffe7659bd6598e33dcfeeb904eb16ee1e705a09bf75916 + # via anyscale +idna==3.4 \ + --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ + --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 + # via + # requests + # yarl +importlib-metadata==6.6.0 \ + --hash=sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed \ + --hash=sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705 + # via + # attrs + # click + # humanize + # jsonschema + # pluggy + # pytest +importlib-resources==5.12.0 \ + --hash=sha256:4be82589bf5c1d7999aedf2a45159d10cb3ca4f19b2271f8792bc8e6da7b22f6 \ + --hash=sha256:7b1deeebbf351c7578e09bf2f63fa2ce8b5ffec296e0d349139d43cca061a81a + # via jsonschema +iniconfig==2.0.0 \ + --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ + --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 + # via pytest +jinja2==3.1.2 \ + --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \ + --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61 + # via -r release/requirements_buildkite.in +jmespath==1.0.1 \ + --hash=sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 \ + --hash=sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe + # via + # boto3 + # botocore +jsonpatch==1.32 \ + --hash=sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397 \ + --hash=sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2 + # via anyscale +jsonpointer==2.3 \ + 
--hash=sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9 \ + --hash=sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a + # via jsonpatch +jsonschema==4.17.3 \ + --hash=sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d \ + --hash=sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6 + # via anyscale +log-symbols==0.0.14 \ + --hash=sha256:4952106ff8b605ab7d5081dd2c7e6ca7374584eff7086f499c06edd1ce56dcca \ + --hash=sha256:cf0bbc6fe1a8e53f0d174a716bc625c4f87043cc21eb55dd8a740cfe22680556 + # via halo +markdown-it-py==2.2.0 \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 + # via rich +markupsafe==2.1.2 \ + --hash=sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed \ + --hash=sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc \ + --hash=sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2 \ + --hash=sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460 \ + --hash=sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7 \ + --hash=sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0 \ + --hash=sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1 \ + --hash=sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa \ + --hash=sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03 \ + --hash=sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323 \ + --hash=sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65 \ + --hash=sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013 \ + --hash=sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036 \ + --hash=sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f \ + 
--hash=sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4 \ + --hash=sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419 \ + --hash=sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2 \ + --hash=sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619 \ + --hash=sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a \ + --hash=sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a \ + --hash=sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd \ + --hash=sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7 \ + --hash=sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666 \ + --hash=sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65 \ + --hash=sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859 \ + --hash=sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625 \ + --hash=sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff \ + --hash=sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156 \ + --hash=sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd \ + --hash=sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba \ + --hash=sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f \ + --hash=sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1 \ + --hash=sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094 \ + --hash=sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a \ + --hash=sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513 \ + --hash=sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed \ + --hash=sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d \ + --hash=sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3 \ + 
--hash=sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147 \ + --hash=sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c \ + --hash=sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603 \ + --hash=sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601 \ + --hash=sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a \ + --hash=sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1 \ + --hash=sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d \ + --hash=sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3 \ + --hash=sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54 \ + --hash=sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2 \ + --hash=sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6 \ + --hash=sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58 + # via jinja2 +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +multidict==6.0.4 \ + --hash=sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9 \ + --hash=sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8 \ + --hash=sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03 \ + --hash=sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710 \ + --hash=sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161 \ + --hash=sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664 \ + --hash=sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569 \ + --hash=sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067 \ + --hash=sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313 \ + 
--hash=sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706 \ + --hash=sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2 \ + --hash=sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636 \ + --hash=sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49 \ + --hash=sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93 \ + --hash=sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603 \ + --hash=sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0 \ + --hash=sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60 \ + --hash=sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4 \ + --hash=sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e \ + --hash=sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1 \ + --hash=sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60 \ + --hash=sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951 \ + --hash=sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc \ + --hash=sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe \ + --hash=sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95 \ + --hash=sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d \ + --hash=sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8 \ + --hash=sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed \ + --hash=sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2 \ + --hash=sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775 \ + --hash=sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87 \ + --hash=sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c \ + --hash=sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2 \ + 
--hash=sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98 \ + --hash=sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3 \ + --hash=sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe \ + --hash=sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78 \ + --hash=sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660 \ + --hash=sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176 \ + --hash=sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e \ + --hash=sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988 \ + --hash=sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c \ + --hash=sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c \ + --hash=sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0 \ + --hash=sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449 \ + --hash=sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f \ + --hash=sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde \ + --hash=sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5 \ + --hash=sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d \ + --hash=sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac \ + --hash=sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a \ + --hash=sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9 \ + --hash=sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca \ + --hash=sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11 \ + --hash=sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35 \ + --hash=sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063 \ + --hash=sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b \ + 
--hash=sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982 \ + --hash=sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258 \ + --hash=sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1 \ + --hash=sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52 \ + --hash=sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480 \ + --hash=sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7 \ + --hash=sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461 \ + --hash=sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d \ + --hash=sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc \ + --hash=sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779 \ + --hash=sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a \ + --hash=sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547 \ + --hash=sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0 \ + --hash=sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171 \ + --hash=sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf \ + --hash=sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d \ + --hash=sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba + # via + # aiohttp + # yarl +oauth2client==4.1.3 \ + --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ + --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 + # via anyscale +packaging==23.1 \ + --hash=sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61 \ + --hash=sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f + # via + # anyscale + # pytest +pathspec==0.11.1 \ + --hash=sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687 \ + 
--hash=sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293 + # via anyscale +pkgutil-resolve-name==1.3.10 \ + --hash=sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174 \ + --hash=sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e + # via jsonschema +pluggy==1.0.0 \ + --hash=sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159 \ + --hash=sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3 + # via pytest +protobuf==4.23.0 \ + --hash=sha256:03eee35b60317112a72d19c54d0bff7bc58ff12fea4cd7b018232bd99758ffdf \ + --hash=sha256:2b94bd6df92d71bd1234a2ffe7ce96ddf6d10cf637a18d6b55ad0a89fbb7fc21 \ + --hash=sha256:36f5370a930cb77c8ad2f4135590c672d0d2c72d4a707c7d0058dce4b4b4a598 \ + --hash=sha256:5f1eba1da2a2f3f7df469fccddef3cc060b8a16cfe3cc65961ad36b4dbcf59c5 \ + --hash=sha256:6c16657d6717a0c62d5d740cb354fbad1b0d8cb811669e06fc1caa0ff4799ddd \ + --hash=sha256:6fe180b56e1169d72ecc4acbd39186339aed20af5384531b8e8979b02bbee159 \ + --hash=sha256:7cb5b9a05ce52c6a782bb97de52679bd3438ff2b7460eff5da348db65650f227 \ + --hash=sha256:9744e934ea5855d12191040ea198eaf704ac78665d365a89d9572e3b627c2688 \ + --hash=sha256:9f5a0fbfcdcc364f3986f9ed9f8bb1328fb84114fd790423ff3d7fdb0f85c2d1 \ + --hash=sha256:baca40d067dddd62141a129f244703160d278648b569e90bb0e3753067644711 \ + --hash=sha256:d5a35ff54e3f62e8fc7be02bb0d2fbc212bba1a5a9cc2748090690093996f07b \ + --hash=sha256:e62fb869762b4ba18666370e2f8a18f17f8ab92dd4467295c6d38be6f8fef60b \ + --hash=sha256:ebde3a023b8e11bfa6c890ef34cd6a8b47d586f26135e86c21344fe433daf2e2 + # via + # -r release/requirements_buildkite.in + # google-api-core + # googleapis-common-protos +py==1.11.0 \ + --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ + --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 + # via retry +pyasn1==0.5.0 \ + --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ + 
--hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde + # via + # oauth2client + # pyasn1-modules + # rsa +pyasn1-modules==0.3.0 \ + --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ + --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d + # via + # google-auth + # oauth2client +pycparser==2.21 \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 + # via cffi +pydantic==1.9.2 \ + --hash=sha256:1061c6ee6204f4f5a27133126854948e3b3d51fcc16ead2e5d04378c199b2f44 \ + --hash=sha256:19b5686387ea0d1ea52ecc4cffb71abb21702c5e5b2ac626fd4dbaa0834aa49d \ + --hash=sha256:2bd446bdb7755c3a94e56d7bdfd3ee92396070efa8ef3a34fab9579fe6aa1d84 \ + --hash=sha256:328558c9f2eed77bd8fffad3cef39dbbe3edc7044517f4625a769d45d4cf7555 \ + --hash=sha256:32e0b4fb13ad4db4058a7c3c80e2569adbd810c25e6ca3bbd8b2a9cc2cc871d7 \ + --hash=sha256:3ee0d69b2a5b341fc7927e92cae7ddcfd95e624dfc4870b32a85568bd65e6131 \ + --hash=sha256:4aafd4e55e8ad5bd1b19572ea2df546ccace7945853832bb99422a79c70ce9b8 \ + --hash=sha256:4b3946f87e5cef3ba2e7bd3a4eb5a20385fe36521d6cc1ebf3c08a6697c6cfb3 \ + --hash=sha256:4de71c718c9756d679420c69f216776c2e977459f77e8f679a4a961dc7304a56 \ + --hash=sha256:5565a49effe38d51882cb7bac18bda013cdb34d80ac336428e8908f0b72499b0 \ + --hash=sha256:5803ad846cdd1ed0d97eb00292b870c29c1f03732a010e66908ff48a762f20e4 \ + --hash=sha256:5da164119602212a3fe7e3bc08911a89db4710ae51444b4224c2382fd09ad453 \ + --hash=sha256:615661bfc37e82ac677543704437ff737418e4ea04bef9cf11c6d27346606044 \ + --hash=sha256:78a4d6bdfd116a559aeec9a4cfe77dda62acc6233f8b56a716edad2651023e5e \ + --hash=sha256:7d0f183b305629765910eaad707800d2f47c6ac5bcfb8c6397abdc30b69eeb15 \ + --hash=sha256:7ead3cd020d526f75b4188e0a8d71c0dbbe1b4b6b5dc0ea775a93aca16256aeb \ + --hash=sha256:84d76ecc908d917f4684b354a39fd885d69dd0491be175f3465fe4b59811c001 \ + 
--hash=sha256:8cb0bc509bfb71305d7a59d00163d5f9fc4530f0881ea32c74ff4f74c85f3d3d \ + --hash=sha256:91089b2e281713f3893cd01d8e576771cd5bfdfbff5d0ed95969f47ef6d676c3 \ + --hash=sha256:9c9e04a6cdb7a363d7cb3ccf0efea51e0abb48e180c0d31dca8d247967d85c6e \ + --hash=sha256:a8c5360a0297a713b4123608a7909e6869e1b56d0e96eb0d792c27585d40757f \ + --hash=sha256:afacf6d2a41ed91fc631bade88b1d319c51ab5418870802cedb590b709c5ae3c \ + --hash=sha256:b34ba24f3e2d0b39b43f0ca62008f7ba962cff51efa56e64ee25c4af6eed987b \ + --hash=sha256:bd67cb2c2d9602ad159389c29e4ca964b86fa2f35c2faef54c3eb28b4efd36c8 \ + --hash=sha256:c0f5e142ef8217019e3eef6ae1b6b55f09a7a15972958d44fbd228214cede567 \ + --hash=sha256:cdb4272678db803ddf94caa4f94f8672e9a46bae4a44f167095e4d06fec12979 \ + --hash=sha256:d70916235d478404a3fa8c997b003b5f33aeac4686ac1baa767234a0f8ac2326 \ + --hash=sha256:d8ce3fb0841763a89322ea0432f1f59a2d3feae07a63ea2c958b2315e1ae8adb \ + --hash=sha256:e0b214e57623a535936005797567231a12d0da0c29711eb3514bc2b3cd008d0f \ + --hash=sha256:e631c70c9280e3129f071635b81207cad85e6c08e253539467e4ead0e5b219aa \ + --hash=sha256:e78578f0c7481c850d1c969aca9a65405887003484d24f6110458fb02cca7747 \ + --hash=sha256:f0ca86b525264daa5f6b192f216a0d1e860b7383e3da1c65a1908f9c02f42801 \ + --hash=sha256:f1a68f4f65a9ee64b6ccccb5bf7e17db07caebd2730109cb8a95863cfa9c4e55 \ + --hash=sha256:fafe841be1103f340a24977f61dee76172e4ae5f647ab9e7fd1e1fca51524f08 \ + --hash=sha256:ff68fc85355532ea77559ede81f35fff79a6a5543477e168ab3a381887caea76 + # via + # -r release/requirements_buildkite.in + # anyscale +pygments==2.15.1 \ + --hash=sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c \ + --hash=sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1 + # via rich +pyparsing==3.0.9 \ + --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb \ + --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc + # via httplib2 +pyrsistent==0.19.3 \ + 
--hash=sha256:016ad1afadf318eb7911baa24b049909f7f3bb2c5b1ed7b6a8f21db21ea3faa8 \ + --hash=sha256:1a2994773706bbb4995c31a97bc94f1418314923bd1048c6d964837040376440 \ + --hash=sha256:20460ac0ea439a3e79caa1dbd560344b64ed75e85d8703943e0b66c2a6150e4a \ + --hash=sha256:3311cb4237a341aa52ab8448c27e3a9931e2ee09561ad150ba94e4cfd3fc888c \ + --hash=sha256:3a8cb235fa6d3fd7aae6a4f1429bbb1fec1577d978098da1252f0489937786f3 \ + --hash=sha256:3ab2204234c0ecd8b9368dbd6a53e83c3d4f3cab10ecaf6d0e772f456c442393 \ + --hash=sha256:42ac0b2f44607eb92ae88609eda931a4f0dfa03038c44c772e07f43e738bcac9 \ + --hash=sha256:49c32f216c17148695ca0e02a5c521e28a4ee6c5089f97e34fe24163113722da \ + --hash=sha256:4b774f9288dda8d425adb6544e5903f1fb6c273ab3128a355c6b972b7df39dcf \ + --hash=sha256:4c18264cb84b5e68e7085a43723f9e4c1fd1d935ab240ce02c0324a8e01ccb64 \ + --hash=sha256:5a474fb80f5e0d6c9394d8db0fc19e90fa540b82ee52dba7d246a7791712f74a \ + --hash=sha256:64220c429e42a7150f4bfd280f6f4bb2850f95956bde93c6fda1b70507af6ef3 \ + --hash=sha256:878433581fc23e906d947a6814336eee031a00e6defba224234169ae3d3d6a98 \ + --hash=sha256:99abb85579e2165bd8522f0c0138864da97847875ecbd45f3e7e2af569bfc6f2 \ + --hash=sha256:a2471f3f8693101975b1ff85ffd19bb7ca7dd7c38f8a81701f67d6b4f97b87d8 \ + --hash=sha256:aeda827381f5e5d65cced3024126529ddc4289d944f75e090572c77ceb19adbf \ + --hash=sha256:b735e538f74ec31378f5a1e3886a26d2ca6351106b4dfde376a26fc32a044edc \ + --hash=sha256:c147257a92374fde8498491f53ffa8f4822cd70c0d85037e09028e478cababb7 \ + --hash=sha256:c4db1bd596fefd66b296a3d5d943c94f4fac5bcd13e99bffe2ba6a759d959a28 \ + --hash=sha256:c74bed51f9b41c48366a286395c67f4e894374306b197e62810e0fdaf2364da2 \ + --hash=sha256:c9bb60a40a0ab9aba40a59f68214eed5a29c6274c83b2cc206a359c4a89fa41b \ + --hash=sha256:cc5d149f31706762c1f8bda2e8c4f8fead6e80312e3692619a75301d3dbb819a \ + --hash=sha256:ccf0d6bd208f8111179f0c26fdf84ed7c3891982f2edaeae7422575f47e66b64 \ + --hash=sha256:e42296a09e83028b3476f7073fcb69ffebac0e66dbbfd1bd847d61f74db30f19 \ + 
--hash=sha256:e8f2b814a3dc6225964fa03d8582c6e0b6650d68a232df41e3cc1b66a5d2f8d1 \ + --hash=sha256:f0774bf48631f3a20471dd7c5989657b639fd2d285b861237ea9e82c36a415a9 \ + --hash=sha256:f0e7c4b2f77593871e918be000b96c8107da48444d57005b6a6bc61fb4331b2c + # via jsonschema +pytest==7.3.1 \ + --hash=sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362 \ + --hash=sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3 + # via -r release/requirements_buildkite.in +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via + # anyscale + # botocore + # freezegun +pytz-deprecation-shim==0.1.0.post0 \ + --hash=sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6 \ + --hash=sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d + # via tzlocal +pyyaml==6.0 \ + --hash=sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf \ + --hash=sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293 \ + --hash=sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b \ + --hash=sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57 \ + --hash=sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b \ + --hash=sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4 \ + --hash=sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07 \ + --hash=sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba \ + --hash=sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9 \ + --hash=sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287 \ + --hash=sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513 \ + --hash=sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0 \ + 
--hash=sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782 \ + --hash=sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0 \ + --hash=sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92 \ + --hash=sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f \ + --hash=sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2 \ + --hash=sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc \ + --hash=sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1 \ + --hash=sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c \ + --hash=sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86 \ + --hash=sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4 \ + --hash=sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c \ + --hash=sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34 \ + --hash=sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b \ + --hash=sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d \ + --hash=sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c \ + --hash=sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb \ + --hash=sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7 \ + --hash=sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737 \ + --hash=sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3 \ + --hash=sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d \ + --hash=sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358 \ + --hash=sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53 \ + --hash=sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78 \ + --hash=sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803 \ + 
--hash=sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a \ + --hash=sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f \ + --hash=sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174 \ + --hash=sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5 + # via + # -r release/requirements_buildkite.in + # anyscale +requests==2.30.0 \ + --hash=sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294 \ + --hash=sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4 + # via + # -r release/requirements_buildkite.in + # anyscale + # google-api-core + # google-cloud-storage +retry==0.9.2 \ + --hash=sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606 \ + --hash=sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4 + # via -r release/requirements_buildkite.in +rich==13.3.5 \ + --hash=sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c \ + --hash=sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704 + # via anyscale +rsa==4.9 \ + --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ + --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 + # via + # google-auth + # oauth2client +s3transfer==0.6.1 \ + --hash=sha256:3c0da2d074bf35d6870ef157158641178a4204a6e689e82546083e31e0311346 \ + --hash=sha256:640bb492711f4c0c0905e1f62b6aaeb771881935ad27884852411f8e9cacbca9 + # via boto3 +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # anyscale + # google-auth + # halo + # oauth2client + # python-dateutil +smart-open==6.3.0 \ + --hash=sha256:b4c9ae193ad6d3e7add50944b86afa0d150bd821ab8ec21edb26d9a06b66f6a8 \ + --hash=sha256:d5238825fe9a9340645fac3d75b287c08fbb99fb2b422477de781c9f5f09e019 + # via anyscale +smmap==5.0.0 \ 
+ --hash=sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94 \ + --hash=sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936 + # via gitdb +spinners==0.0.24 \ + --hash=sha256:1eb6aeb4781d72ab42ed8a01dcf20f3002bf50740d7154d12fb8c9769bf9e27f \ + --hash=sha256:2fa30d0b72c9650ad12bbe031c9943b8d441e41b4f5602b0ec977a19f3290e98 + # via halo +tabulate==0.9.0 \ + --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \ + --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f + # via anyscale +termcolor==2.3.0 \ + --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ + --hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a + # via halo +tomli==2.0.1 \ + --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ + --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f + # via pytest +tqdm==4.65.0 \ + --hash=sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5 \ + --hash=sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671 + # via anyscale +typing-extensions==4.5.0 \ + --hash=sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb \ + --hash=sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4 + # via + # aiohttp + # argon2-cffi + # async-timeout + # gitpython + # importlib-metadata + # jsonschema + # markdown-it-py + # pydantic + # rich + # yarl +tzdata==2023.3 \ + --hash=sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a \ + --hash=sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda + # via pytz-deprecation-shim +tzlocal==4.3 \ + --hash=sha256:3f21d09e1b2aa9f2dacca12da240ca37de3ba5237a93addfd6d593afe9073355 \ + --hash=sha256:b44c4388f3d34f25862cfbb387578a4d70fec417649da694a132f628a23367e2 + # via anyscale +urllib3==1.26.15 \ + 
--hash=sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305 \ + --hash=sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42 + # via + # anyscale + # botocore + # requests +wrapt==1.15.0 \ + --hash=sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0 \ + --hash=sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420 \ + --hash=sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a \ + --hash=sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c \ + --hash=sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079 \ + --hash=sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923 \ + --hash=sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f \ + --hash=sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1 \ + --hash=sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8 \ + --hash=sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86 \ + --hash=sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0 \ + --hash=sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364 \ + --hash=sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e \ + --hash=sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c \ + --hash=sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e \ + --hash=sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c \ + --hash=sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727 \ + --hash=sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff \ + --hash=sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e \ + --hash=sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29 \ + --hash=sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7 \ + 
--hash=sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72 \ + --hash=sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475 \ + --hash=sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a \ + --hash=sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317 \ + --hash=sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2 \ + --hash=sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd \ + --hash=sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640 \ + --hash=sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98 \ + --hash=sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248 \ + --hash=sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e \ + --hash=sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d \ + --hash=sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec \ + --hash=sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1 \ + --hash=sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e \ + --hash=sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9 \ + --hash=sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92 \ + --hash=sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb \ + --hash=sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094 \ + --hash=sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46 \ + --hash=sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29 \ + --hash=sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd \ + --hash=sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705 \ + --hash=sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8 \ + --hash=sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975 \ + 
--hash=sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb \ + --hash=sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e \ + --hash=sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b \ + --hash=sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418 \ + --hash=sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019 \ + --hash=sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1 \ + --hash=sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba \ + --hash=sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6 \ + --hash=sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2 \ + --hash=sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3 \ + --hash=sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7 \ + --hash=sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752 \ + --hash=sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416 \ + --hash=sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f \ + --hash=sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1 \ + --hash=sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc \ + --hash=sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145 \ + --hash=sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee \ + --hash=sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a \ + --hash=sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7 \ + --hash=sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b \ + --hash=sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653 \ + --hash=sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0 \ + --hash=sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90 \ + 
--hash=sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29 \ + --hash=sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6 \ + --hash=sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034 \ + --hash=sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09 \ + --hash=sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559 \ + --hash=sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639 + # via anyscale +yarl==1.9.2 \ + --hash=sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571 \ + --hash=sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3 \ + --hash=sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3 \ + --hash=sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c \ + --hash=sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7 \ + --hash=sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04 \ + --hash=sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191 \ + --hash=sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea \ + --hash=sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4 \ + --hash=sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4 \ + --hash=sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095 \ + --hash=sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e \ + --hash=sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74 \ + --hash=sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef \ + --hash=sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33 \ + --hash=sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde \ + --hash=sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45 \ + 
--hash=sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf \ + --hash=sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b \ + --hash=sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac \ + --hash=sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0 \ + --hash=sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528 \ + --hash=sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716 \ + --hash=sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb \ + --hash=sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18 \ + --hash=sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72 \ + --hash=sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6 \ + --hash=sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582 \ + --hash=sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5 \ + --hash=sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368 \ + --hash=sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc \ + --hash=sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9 \ + --hash=sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be \ + --hash=sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a \ + --hash=sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80 \ + --hash=sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8 \ + --hash=sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6 \ + --hash=sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417 \ + --hash=sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574 \ + --hash=sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59 \ + --hash=sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608 \ + 
--hash=sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82 \ + --hash=sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1 \ + --hash=sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3 \ + --hash=sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d \ + --hash=sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8 \ + --hash=sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc \ + --hash=sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac \ + --hash=sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8 \ + --hash=sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955 \ + --hash=sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0 \ + --hash=sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367 \ + --hash=sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb \ + --hash=sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a \ + --hash=sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623 \ + --hash=sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2 \ + --hash=sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6 \ + --hash=sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7 \ + --hash=sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4 \ + --hash=sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051 \ + --hash=sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938 \ + --hash=sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8 \ + --hash=sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9 \ + --hash=sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3 \ + --hash=sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5 \ + 
--hash=sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9 \ + --hash=sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333 \ + --hash=sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185 \ + --hash=sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3 \ + --hash=sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560 \ + --hash=sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b \ + --hash=sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7 \ + --hash=sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78 \ + --hash=sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7 + # via aiohttp +zipp==3.15.0 \ + --hash=sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b \ + --hash=sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556 + # via + # importlib-metadata + # importlib-resources diff --git a/release/rllib_tests/debug_app_config.yaml b/release/rllib_tests/debug_app_config.yaml new file mode 100755 index 000000000000..c51fbcc4d39b --- /dev/null +++ b/release/rllib_tests/debug_app_config.yaml @@ -0,0 +1,49 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} +env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"} +debian_packages: + - unzip + - zip + + # Needed to run MuJoCo with gymnasium. + - libosmesa6-dev + - libgl1-mesa-glx + - libglfw3 + - patchelf + # End: MuJoCo. + +python: + pip_packages: + ## These dependencies should be handled by requirements_rllib.txt and + ## requirements_ml_docker.txt and removed here + - gymnasium[atari,mujoco]==0.26.3 + - ale-py==0.8.0 + - gym==0.26.2 + - mujoco-py<2.2,>=2.1 + # AutoROM downloads ROMs via torrent when they are built. The torrent is unreliable, + # so we built it for py3 and use that instead. 
This wheel was tested for python 3.7, 3.8, + # and 3.9. + - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl + - pytest + conda_packages: [] + +post_build_cmds: + - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + # TODO(https://github.com/ray-project/ray/issues/34591) + - pip3 install --force-reinstall -U https://s3-us-west-2.amazonaws.com/ray-wheels/{{ env["RAY_TEST_BRANCH"] }}/{{ env["RAY_COMMIT_OF_WHEEL"] }}/ray-3.0.0.dev0%2Bdebug-cp37-cp37m-manylinux2014_x86_64.whl + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} + # Clone the rl-experiments repo for offline-RL files. + - git clone https://github.com/ray-project/rl-experiments.git + - unzip rl-experiments/halfcheetah-sac/2022-12-17/halfcheetah_1500_mean_reward_sac.zip -d ~/. + # Use torch+CUDA10.2 for our release tests. CUDA11.x has known performance issues in combination with torch+GPU+CNNs + # TODO(sven): remove once nightly image gets upgraded. + - pip3 install torch==1.12.1+cu102 torchvision==0.13.1+cu102 --extra-index-url https://download.pytorch.org/whl/cu102 + + # TODO(sven): remove once nightly image gets gymnasium and the other new dependencies. + - wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz + - mkdir ~/.mujoco + - mv mujoco210-linux-x86_64.tar.gz ~/.mujoco/. 
+ - cd ~/.mujoco + - tar -xf ~/.mujoco/mujoco210-linux-x86_64.tar.gz + + # not strictly necessary, but makes debugging easier + - git clone https://github.com/ray-project/ray.git diff --git a/release/rllib_tests/learning_tests/todo_tests_currently_not_covered.yaml b/release/rllib_tests/learning_tests/todo_tests_currently_not_covered.yaml index eebccd523c5b..f769c8fd07d5 100644 --- a/release/rllib_tests/learning_tests/todo_tests_currently_not_covered.yaml +++ b/release/rllib_tests/learning_tests/todo_tests_currently_not_covered.yaml @@ -5,7 +5,7 @@ # run: ARS # # Minimum reward and total ts (in given time_total_s) to pass this test. # pass_criteria: -# episode_reward_mean: 100.0 +# sampler_results/episode_reward_mean: 100.0 # timesteps_total: 2000000 # stop: # time_total_s: 2000 @@ -29,7 +29,7 @@ # run: DDPPO # # Minimum reward and total ts (in given time_total_s) to pass this test. # pass_criteria: -# episode_reward_mean: 50.0 +# sampler_results/episode_reward_mean: 50.0 # timesteps_total: 10000000 # stop: # time_total_s: 3600 diff --git a/release/rllib_tests/learning_tests/yaml_files/a2c/a2c-breakout-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/a2c/a2c-breakout-v5.yaml index c4e4b0eef19d..c38c9f8fffb0 100644 --- a/release/rllib_tests/learning_tests/yaml_files/a2c/a2c-breakout-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/a2c/a2c-breakout-v5.yaml @@ -3,7 +3,7 @@ a2c-breakoutnoframeskip-v5: run: A2C # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 50.0 + sampler_results/episode_reward_mean: 50.0 timesteps_total: 5000000 stop: time_total_s: 7200 diff --git a/release/rllib_tests/learning_tests/yaml_files/a3c/a3c-pongdeterministic-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/a3c/a3c-pongdeterministic-v5.yaml index 7c7dbce9916d..3ea52a704525 100644 --- a/release/rllib_tests/learning_tests/yaml_files/a3c/a3c-pongdeterministic-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/a3c/a3c-pongdeterministic-v5.yaml @@ -3,7 +3,7 @@ a3c-pongdeterministic-v5: run: A3C # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 18.0 + sampler_results/episode_reward_mean: 18.0 timesteps_total: 5000000 stop: time_total_s: 3600 diff --git a/release/rllib_tests/learning_tests/yaml_files/apex/apex-breakoutnoframeskip-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/apex/apex-breakoutnoframeskip-v5.yaml index 50002c3e4053..81c8fdd20e48 100644 --- a/release/rllib_tests/learning_tests/yaml_files/apex/apex-breakoutnoframeskip-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/apex/apex-breakoutnoframeskip-v5.yaml @@ -3,7 +3,7 @@ apex-breakoutnoframeskip-v5: run: APEX # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 100.0 + sampler_results/episode_reward_mean: 100.0 timesteps_total: 12000000 stop: time_total_s: 7200 diff --git a/release/rllib_tests/learning_tests/yaml_files/appo/appo-pongnoframeskip-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/appo/appo-pongnoframeskip-v5.yaml index 46e57eb31994..9b5e5a84f9bc 100644 --- a/release/rllib_tests/learning_tests/yaml_files/appo/appo-pongnoframeskip-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/appo/appo-pongnoframeskip-v5.yaml @@ -3,7 +3,7 @@ appo-pongnoframeskip-v5: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 18.0 + sampler_results/episode_reward_mean: 18.0 timesteps_total: 5000000 stop: time_total_s: 1800 diff --git a/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml index fa47849ba227..199022e32d99 100644 --- a/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/bc/bc-halfcheetah-v4.yaml @@ -2,7 +2,7 @@ bc-halfcheetah-v0: env: HalfCheetah-v4 run: BC pass_criteria: - evaluation/episode_reward_mean: 400.0 + evaluation/sampler_results/episode_reward_mean: 400.0 timesteps_total: 2500000 stop: time_total_s: 1800 diff --git a/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml index 511355980f8b..32b7299b9f7f 100644 --- a/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/cql/cql-halfcheetah-v4.yaml @@ -2,7 +2,7 @@ cql-halfcheetah-v4: env: HalfCheetah-v4 run: CQL pass_criteria: - evaluation/episode_reward_mean: 400.0 + evaluation/sampler_results/episode_reward_mean: 400.0 # Can not check throughput for offline methods. timesteps_total: 5000000 stop: diff --git a/release/rllib_tests/learning_tests/yaml_files/ddpg/ddpg-hopper-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/ddpg/ddpg-hopper-v4.yaml index ecf51e28f228..17149db121b4 100644 --- a/release/rllib_tests/learning_tests/yaml_files/ddpg/ddpg-hopper-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/ddpg/ddpg-hopper-v4.yaml @@ -3,7 +3,7 @@ ddpg-hopper-v4: run: DDPG # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 110.0 + sampler_results/episode_reward_mean: 110.0 timesteps_total: 50000 stop: time_total_s: 1800 diff --git a/release/rllib_tests/learning_tests/yaml_files/dqn/dqn-breakoutnoframeskip-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/dqn/dqn-breakoutnoframeskip-v5.yaml index 78947d81f530..2da9c8ac89cc 100644 --- a/release/rllib_tests/learning_tests/yaml_files/dqn/dqn-breakoutnoframeskip-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/dqn/dqn-breakoutnoframeskip-v5.yaml @@ -3,7 +3,7 @@ dqn-breakoutnoframeskip-v5: run: DQN # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 20.0 + sampler_results/episode_reward_mean: 20.0 timesteps_total: 400000 stop: time_total_s: 7200 diff --git a/release/rllib_tests/learning_tests/yaml_files/es/es-humanoid-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/es/es-humanoid-v4.yaml index fa89b337f4c3..90825f64217f 100644 --- a/release/rllib_tests/learning_tests/yaml_files/es/es-humanoid-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/es/es-humanoid-v4.yaml @@ -3,7 +3,7 @@ es-humanoid-v4: run: ES # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 100.0 + sampler_results/episode_reward_mean: 100.0 timesteps_total: 5000000 stop: time_total_s: 3600 diff --git a/release/rllib_tests/learning_tests/yaml_files/impala/impala-breakoutnoframeskip-v5.yaml b/release/rllib_tests/learning_tests/yaml_files/impala/impala-breakoutnoframeskip-v5.yaml index e0d054164cb4..2a12ca052256 100644 --- a/release/rllib_tests/learning_tests/yaml_files/impala/impala-breakoutnoframeskip-v5.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/impala/impala-breakoutnoframeskip-v5.yaml @@ -3,7 +3,7 @@ impala-breakoutnoframeskip-v5: run: IMPALA # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 200.0 + sampler_results/episode_reward_mean: 200.0 timesteps_total: 6000000 stop: time_total_s: 2400 diff --git a/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml index 1a8d6b3f42c3..59ff10051cfb 100644 --- a/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/marwil/marwil-halfcheetah-v4.yaml @@ -3,7 +3,7 @@ marwil-halfcheetah-v4: run: MARWIL pass_criteria: # Can not check throughput for offline methods. - evaluation/episode_reward_mean: 400.0 + evaluation/sampler_results/episode_reward_mean: 400.0 timesteps_total: 2500000 stop: time_total_s: 1800 diff --git a/release/rllib_tests/learning_tests/yaml_files/ppo/tf/ppo-breakoutnoframeskip-v5-tf.yaml b/release/rllib_tests/learning_tests/yaml_files/ppo/tf/ppo-breakoutnoframeskip-v5-tf.yaml index c3d88cc9513d..175fb47f3ccc 100644 --- a/release/rllib_tests/learning_tests/yaml_files/ppo/tf/ppo-breakoutnoframeskip-v5-tf.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/ppo/tf/ppo-breakoutnoframeskip-v5-tf.yaml @@ -3,7 +3,7 @@ ppo-breakoutnoframeskip-v5: run: PPO # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 50.0 + sampler_results/episode_reward_mean: 50.0 timesteps_total: 7000000 stop: time_total_s: 3600 diff --git a/release/rllib_tests/learning_tests/yaml_files/ppo/torch/ppo-breakoutnoframeskip-v5-torch.yaml b/release/rllib_tests/learning_tests/yaml_files/ppo/torch/ppo-breakoutnoframeskip-v5-torch.yaml index 3241d39d37ab..22e0d3826ee9 100644 --- a/release/rllib_tests/learning_tests/yaml_files/ppo/torch/ppo-breakoutnoframeskip-v5-torch.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/ppo/torch/ppo-breakoutnoframeskip-v5-torch.yaml @@ -3,7 +3,7 @@ ppo-breakoutnoframeskip-v5: run: PPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 50.0 + sampler_results/episode_reward_mean: 50.0 timesteps_total: 7000000 stop: # This is double the time we use for tf because of 2x throughput there. diff --git a/release/rllib_tests/learning_tests/yaml_files/sac/sac-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/sac/sac-halfcheetah-v4.yaml index 3c78bb8a51d0..979bda086a3d 100644 --- a/release/rllib_tests/learning_tests/yaml_files/sac/sac-halfcheetah-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/sac/sac-halfcheetah-v4.yaml @@ -3,7 +3,7 @@ sac-halfcheetah-v4: run: SAC # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 500.0 + sampler_results/episode_reward_mean: 500.0 timesteps_total: 400000 stop: time_total_s: 3600 diff --git a/release/rllib_tests/learning_tests/yaml_files/slateq/slateq-interest-evolution-recsim-env.yaml b/release/rllib_tests/learning_tests/yaml_files/slateq/slateq-interest-evolution-recsim-env.yaml index 39b5e8827468..d7170509d8e1 100644 --- a/release/rllib_tests/learning_tests/yaml_files/slateq/slateq-interest-evolution-recsim-env.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/slateq/slateq-interest-evolution-recsim-env.yaml @@ -2,7 +2,7 @@ slateq-interest-evolution-recsim-env: env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv run: SlateQ pass_criteria: - episode_reward_mean: 160.0 + sampler_results/episode_reward_mean: 160.0 timesteps_total: 300000 stop: time_total_s: 7200 diff --git a/release/rllib_tests/learning_tests/yaml_files/td3/td3-halfcheetah-v4.yaml b/release/rllib_tests/learning_tests/yaml_files/td3/td3-halfcheetah-v4.yaml index 217b658a9080..96d4381c7dbe 100644 --- a/release/rllib_tests/learning_tests/yaml_files/td3/td3-halfcheetah-v4.yaml +++ b/release/rllib_tests/learning_tests/yaml_files/td3/td3-halfcheetah-v4.yaml @@ -3,7 +3,7 @@ td3-halfcheetah-v4: run: TD3 # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 400.0 + sampler_results/episode_reward_mean: 400.0 timesteps_total: 1000000 stop: time_total_s: 3600 diff --git a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml index aa1f60eb46d6..8a312996532a 100644 --- a/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_learning_tests/multi_gpu_learning_tests.yaml @@ -4,7 +4,7 @@ a2c-cartpole-v1: run: A2C # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -18,7 +18,7 @@ appo-cartpole-v1-no-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -42,7 +42,7 @@ appo-cartpole-v1-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -65,7 +65,7 @@ ddpg-repeat-after-me-env: run: DDPG # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: -50.0 + sampler_results/episode_reward_mean: -50.0 timesteps_total: 8000 stop: time_total_s: 600 @@ -85,7 +85,7 @@ dqn-cartpole-v1: run: DQN # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 50000 stop: time_total_s: 600 @@ -105,7 +105,7 @@ impala-cartpole-v1: run: IMPALA # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -120,7 +120,7 @@ pg-cartpole-v1: run: PG # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 130.0 + sampler_results/episode_reward_mean: 130.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -138,7 +138,7 @@ ppo-cartpole-v1: run: PPO # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 300000 stop: time_total_s: 600 @@ -161,7 +161,7 @@ sac-repeat-after-me-env: run: SAC # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 40.0 + sampler_results/episode_reward_mean: 40.0 timesteps_total: 4500 stop: time_total_s: 600 @@ -183,7 +183,7 @@ sac-repeat-after-me-env-continuous: run: SAC # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: -50.0 + sampler_results/episode_reward_mean: -50.0 timesteps_total: 4500 stop: time_total_s: 600 @@ -208,7 +208,7 @@ simpleq-cartpole-v1: run: SimpleQ # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 85000 stop: time_total_s: 600 @@ -221,7 +221,7 @@ td3-repeat-after-me-env: run: TD3 # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: -50.0 + sampler_results/episode_reward_mean: -50.0 timesteps_total: 25000 stop: time_total_s: 600 diff --git a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml index e4c1393fb414..8491f98a81f9 100644 --- a/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_with_attention_learning_tests/multi_gpu_with_attention_learning_tests.yaml @@ -4,7 +4,7 @@ appo-stateless-cartpole-no-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -38,7 +38,7 @@ appo-stateless-cartpole-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -71,7 +71,7 @@ impala-stateless-cartpole: run: IMPALA # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -98,7 +98,7 @@ pg-stateless-cartpole: run: PG # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 130.0 + sampler_results/episode_reward_mean: 130.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -126,7 +126,7 @@ ppo-stateless-cartpole: run: PPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 200000 stop: time_total_s: 600 @@ -160,7 +160,7 @@ ppo-stateless-cartpole: # run: R2D2 # # Minimum reward and total ts (in given time_total_s) to pass this test. 
# pass_criteria: -# episode_reward_mean: 150.0 +# sampler_results/episode_reward_mean: 150.0 # timesteps_total: 130000 # stop: # time_total_s: 1200 diff --git a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml index 76142bdcfa9d..911c8ba0e8ef 100644 --- a/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml +++ b/release/rllib_tests/multi_gpu_with_lstm_learning_tests/multi_gpu_with_lstm_learning_tests.yaml @@ -4,7 +4,7 @@ a2c-stateless-cartpole: run: A2C # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -24,7 +24,7 @@ appo-stateless-cartpole-no-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -50,7 +50,7 @@ appo-stateless-cartpole-vtrace: run: APPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -75,7 +75,7 @@ impala-stateless-cartpole: run: IMPALA # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -94,7 +94,7 @@ pg-stateless-cartpole: run: PG # Minimum reward and total ts (in given time_total_s) to pass this test. 
pass_criteria: - episode_reward_mean: 130.0 + sampler_results/episode_reward_mean: 130.0 timesteps_total: 500000 stop: time_total_s: 600 @@ -114,7 +114,7 @@ ppo-stateless-cartpole: run: PPO # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 200000 stop: time_total_s: 600 @@ -140,7 +140,7 @@ ppo-stateless-cartpole: # run: R2D2 # # Minimum reward and total ts (in given time_total_s) to pass this test. # pass_criteria: -# episode_reward_mean: 150.0 +# sampler_results/episode_reward_mean: 150.0 # timesteps_total: 65000 # stop: # time_total_s: 800 diff --git a/release/rllib_tests/multi_node_checkpointing_compute_config_gce.yaml b/release/rllib_tests/multi_node_checkpointing_compute_config_gce.yaml new file mode 100644 index 000000000000..9be7238750de --- /dev/null +++ b/release/rllib_tests/multi_node_checkpointing_compute_config_gce.yaml @@ -0,0 +1,17 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 3 + +head_node_type: + name: head_node + instance_type: n2-standard-8 # m5.2xlarge + +worker_node_types: + - name: worker_node + instance_type: n1-standard-4-nvidia-tesla-t4-1 # g4dn.xlarge + min_workers: 2 + max_workers: 2 + use_spot: false diff --git a/release/run_release_test.sh b/release/run_release_test.sh index 367d2fb8ceb9..8507284cb5ae 100755 --- a/release/run_release_test.sh +++ b/release/run_release_test.sh @@ -44,7 +44,7 @@ export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR BUILDKITE_MAX_RETRIES B if [ -z "${NO_INSTALL}" ]; then pip install --use-deprecated=legacy-resolver -q -r requirements.txt - pip install -q -U boto3 botocore + pip install -q -U boto3 botocore bazel-runfiles if [ "${INSTALL_MATCHING_RAY-false}" == "true" ]; then # Find ray-wheels parameter and install locally @@ -69,8 +69,25 @@ fi if [ -z "${NO_CLONE}" ]; then TMPDIR=$(mktemp -d -t release-XXXXXXXXXX) echo 
"Cloning test repo ${RAY_TEST_REPO} branch ${RAY_TEST_BRANCH}" - git clone --depth 1 -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}" + git clone -b "${RAY_TEST_BRANCH}" "${RAY_TEST_REPO}" "${TMPDIR}" pushd "${TMPDIR}/release" || true + HEAD_COMMIT=$(git rev-parse HEAD) + echo "The cloned test repo has head commit of ${HEAD_COMMIT}" + + # We only do this if RAY_TEST_REPO and RAY_TEST_BRANCH are pointing to ray master. + # Theoretically, release manager may also run into this issue when manually triggering + # release test runs. But cherry-picks are rare and thus it's less likely to run into + # this racing condition, ignoring for now. + if [ "${RAY_TEST_REPO}" == "https://github.com/ray-project/ray.git" ] && \ + [[ "${PARSED_RAY_WHEELS}" == *"master"* ]] && \ + [ "${RAY_TEST_BRANCH-}" == "master" ] && [ -n "${RAY_COMMIT_OF_WHEEL-}" ] && \ + [ "${HEAD_COMMIT}" != "${RAY_COMMIT_OF_WHEEL}" ]; then + echo "The checked out test code doesn't match with the installed wheel. \ +This is likely due to a racing condition when a PR is landed between \ +a wheel is installed and test code is checked out." + echo "Hard resetting from ${HEAD_COMMIT} to ${RAY_COMMIT_OF_WHEEL}." 
+ git reset --hard "${RAY_COMMIT_OF_WHEEL}" + fi fi if [ -z "${NO_INSTALL}" ]; then diff --git a/release/train_tests/horovod/compute_tpl.yaml b/release/train_tests/horovod/compute_tpl_aws.yaml similarity index 100% rename from release/train_tests/horovod/compute_tpl.yaml rename to release/train_tests/horovod/compute_tpl_aws.yaml diff --git a/release/train_tests/horovod/compute_tpl_gce.yaml b/release/train_tests/horovod/compute_tpl_gce.yaml new file mode 100644 index 000000000000..31730aac6e79 --- /dev/null +++ b/release/train_tests/horovod/compute_tpl_gce.yaml @@ -0,0 +1,25 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 1 + +head_node_type: + name: head_node + # 4 cpus, 16G mem, $0.224/hr on demand + instance_type: n1-standard-4 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-4 + max_workers: 1 + min_workers: 1 + use_spot: false + +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: ttl-hours +# Value: '24' diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x2.yaml similarity index 63% rename from release/air_tests/air_benchmarks/compute_gpu_4x4.yaml rename to release/tune_tests/cloud_tests/tpl_gce_4x2.yaml index 0bb94dc6c3dc..c9e14cff5f0d 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml +++ b/release/tune_tests/cloud_tests/tpl_gce_4x2.yaml @@ -1,15 +1,17 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 +region: us-west1 +allowed_azs: + - us-west1-b max_workers: 3 head_node_type: name: head_node - instance_type: g4dn.12xlarge + instance_type: n1-standard-2 worker_node_types: - name: worker_node - instance_type: g4dn.12xlarge - max_workers: 3 + instance_type: n1-standard-2 min_workers: 3 + max_workers: 3 use_spot: false diff --git a/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml new file mode 100644 
index 000000000000..dbccfa496b2d --- /dev/null +++ b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml @@ -0,0 +1,28 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west1 +allowed_azs: + - us-west1-b + +max_workers: 16 + +head_node_type: + name: head_node + instance_type: n1-standard-2 + resources: + custom_resources: + head: 1 + +worker_node_types: + - name: worker_node + instance_type: n1-standard-2 + min_workers: 16 + max_workers: 16 + use_spot: true + +## Required to allow nodes to terminate themselves. +#aws: +# TagSpecifications: +# - ResourceType: "instance" +# Tags: +# - Key: chaos-test-name +# Value: 'tune-chaos-test' \ No newline at end of file diff --git a/release/tune_tests/scalability_tests/app_config.yaml b/release/tune_tests/scalability_tests/app_config.yaml index e552178aa270..7832627aba4d 100755 --- a/release/tune_tests/scalability_tests/app_config.yaml +++ b/release/tune_tests/scalability_tests/app_config.yaml @@ -7,9 +7,11 @@ python: pip_packages: - pytest - awscli + - gcsfs<=2022.7.1 - pyarrow>=6.0.1,<7.0.0 conda_packages: [] post_build_cmds: - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - pip3 install ray[tune] # Installing Tune dependency so we can get protobuf version back. 
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/release/tune_tests/scalability_tests/app_config_data.yaml b/release/tune_tests/scalability_tests/app_config_data.yaml index 6a18a57ba889..788d7a952d5a 100755 --- a/release/tune_tests/scalability_tests/app_config_data.yaml +++ b/release/tune_tests/scalability_tests/app_config_data.yaml @@ -7,13 +7,14 @@ python: pip_packages: - pytest - awscli + - gcsfs<=2022.7.1 - xgboost_ray # this will install protobuf version beyond the upper bound of what Tune allows - pyarrow>=6.0.1,<7.0.0 conda_packages: [] post_build_cmds: - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} - - pip3 install ray[tune] # Needed for Ray Client to work. Installing Tune dependency so we can get protobuf version back. + - pip3 install ray[tune] # Installing Tune dependency so we can get protobuf version back. - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} - sudo mkdir -p /data || true - sudo chown ray:1000 /data || true diff --git a/release/tune_tests/scalability_tests/workloads/test_durable_multifile_checkpoints.py b/release/tune_tests/scalability_tests/workloads/test_durable_multifile_checkpoints.py new file mode 100644 index 000000000000..b3d4ba6bf683 --- /dev/null +++ b/release/tune_tests/scalability_tests/workloads/test_durable_multifile_checkpoints.py @@ -0,0 +1,51 @@ +"""Durable trainable with multi-file checkpoints (16 trials, checkpoint to cloud) + +In this run, we will start 16 trials on a cluster. The trials create 16 files a +1 MB checkpoints every 12 seconds and should only keep 2 checkpoints. This test +ensures that durable checkpoints don't slow down experiment progress too much. + +Cluster: cluster_16x2.yaml + +Test owner: krfricke + +Acceptance criteria: Should run faster than 750 seconds. 
+ +Theoretical minimum time: 300 seconds +""" +import argparse + +import ray + +from ray.tune.utils.release_test_util import timed_tune_run + + +def main(bucket): + ray.init(address="auto") + + num_samples = 16 + results_per_second = 5 / 60 # 5 results per minute = 1 every 12 seconds + trial_length_s = 300 + + max_runtime = 750 + + timed_tune_run( + name="durable multi-file checkpoints", + num_samples=num_samples, + results_per_second=results_per_second, + trial_length_s=trial_length_s, + max_runtime=max_runtime, + checkpoint_freq_s=12, # Once every 12 seconds (once per result) + checkpoint_size_b=int(1 * 1000**2), # 1 MB + checkpoint_num_files=16, + keep_checkpoints_num=2, + resources_per_trial={"cpu": 2}, + storage_path=bucket, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--bucket", type=str, help="Bucket name") + args, _ = parser.parse_known_args() + + main(args.bucket or "ray-tune-scalability-test") diff --git a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py b/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py index 1b293a3fddd0..1a07f6edf651 100644 --- a/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py +++ b/release/tune_tests/scalability_tests/workloads/test_durable_trainable.py @@ -36,15 +36,18 @@ def main(bucket): os.environ[var] = str(y) else: print("No AWS secrets file found. 
Loading from boto.") - from boto3 import Session + try: + from boto3 import Session - session = Session() - credentials = session.get_credentials() - current_credentials = credentials.get_frozen_credentials() + session = Session() + credentials = session.get_credentials() + current_credentials = credentials.get_frozen_credentials() - os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key - os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key - os.environ["AWS_SESSION_TOKEN"] = current_credentials.token + os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key + os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key + os.environ["AWS_SESSION_TOKEN"] = current_credentials.token + except Exception: + print("Cannot setup AWS credentials (is this running on GCE?)") if all( os.getenv(k, "") @@ -76,7 +79,7 @@ def main(bucket): checkpoint_size_b=int(10 * 1000**2), # 10 MB keep_checkpoints_num=2, resources_per_trial={"cpu": 2}, - storage_path=f"s3://{bucket}/durable/", + storage_path=bucket, ) diff --git a/release/util/download_wheels.sh b/release/util/download_wheels.sh index 40bf28320a57..a7113aeeccca 100755 --- a/release/util/download_wheels.sh +++ b/release/util/download_wheels.sh @@ -34,8 +34,8 @@ download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERS download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp311-cp311-manylinux2014_aarch64.whl" # macOS. 
-echo "Downloading Ray core MacOS wheels (intel)" -download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_15_intel.whl" +echo "Downloading Ray core MacOS wheels (x86_64)" +download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp38-cp38-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp39-cp39-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp310-cp310-macosx_10_15_universal2.whl" @@ -71,8 +71,8 @@ download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERS download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp311-cp311-manylinux2014_aarch64.whl" # macOS CPP. 
-echo "Downloading Ray CPP MacOS wheels (intel)" -download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp37-cp37m-macosx_10_15_intel.whl" +echo "Downloading Ray CPP MacOS wheels (x86_64)" +download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp37-cp37m-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp38-cp38-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp39-cp39-macosx_10_15_x86_64.whl" download_wheel "https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray_cpp-$RAY_VERSION-cp310-cp310-macosx_10_15_universal2.whl" diff --git a/rllib/BUILD b/rllib/BUILD index 100df5ec4262..133d14197952 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -171,6 +171,16 @@ py_test( args = ["--dir=tuned_examples/appo"] ) +py_test( + name = "learning_tests_cartpole_appo_w_rl_modules_and_learner", + main = "tests/run_regression_tests.py", + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "no_tf_static_graph"], + size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer + srcs = ["tests/run_regression_tests.py"], + data = ["tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml"], + args = ["--dir=tuned_examples/appo"] +) + # py_test( # name = "learning_tests_cartpole_appo_vtrace", # main = "tests/run_regression_tests.py", @@ -590,27 +600,15 @@ py_test( ) py_test( - name = "learning_tests_pendulum_ppo_with_rl_module_torch", + name = "learning_tests_pendulum_ppo_with_rl_module", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", 
"torch_only"], + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "no_tf_static_graph"], size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml"], args = ["--dir=tuned_examples/ppo"] ) -# TODO (Kourosh): tf2 is way slower than torch, eager mode is no enabled, I wonder if -# it would get faster with eager mode once it is enabled. -# py_test( -# name = "learning_tests_pendulum_ppo_with_rl_module_tf2_eager", -# main = "tests/run_regression_tests.py", -# tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "tf2_only", "no_tf_static_graph"], -# size = "large", # bazel may complain about it being too long sometimes - large is on purpose as some frameworks take longer -# srcs = ["tests/run_regression_tests.py"], -# data = ["tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml"], -# args = ["--dir=tuned_examples/ppo"] -# ) - py_test( name = "learning_tests_multi_agent_pendulum_ppo", main = "tests/run_regression_tests.py", @@ -807,7 +805,7 @@ py_test( py_test( name = "test_algorithm_config", tags = ["team:rllib", "algorithms_dir", "algorithms_dir_generic"], - size = "small", + size = "medium", srcs = ["algorithms/tests/test_algorithm_config.py"], ) @@ -931,7 +929,7 @@ py_test( name = "test_appo_learner", tags = ["team:rllib", "algorithms_dir"], size = "medium", - srcs = ["algorithms/appo/tests/tf/test_appo_learner.py"] + srcs = ["algorithms/appo/tests/test_appo_learner.py"] ) # ARS @@ -1867,6 +1865,13 @@ py_test( ) # Default Models +py_test( + name = "test_base_models", + tags = ["team:rllib", "core"], + size = "small", + srcs = ["core/models/tests/test_base_models.py"] +) + py_test( name = "test_cnn_encoders", tags = ["team:rllib", "core", "models"], @@ -1983,6 +1988,13 @@ py_test( srcs = 
["core/learner/torch/tests/test_torch_learner.py"] ) +py_test( + name ="tests/test_algorithm_save_load_checkpoint_learner", + tags = ["team:rllib", "core"], + size = "medium", + srcs = ["tests/test_algorithm_save_load_checkpoint_learner.py"] +) + py_test( name = "test_bc_algorithm", tags = ["team:rllib", "core"], @@ -2277,8 +2289,8 @@ py_test( py_test( name = "utils/tests/test_torch_utils", - tags = ["team:rllib", "utils"], - size = "small", + tags = ["team:rllib", "utils", "gpu"], + size = "medium", srcs = ["utils/tests/test_torch_utils.py"] ) @@ -2446,6 +2458,16 @@ py_test( args = ["TestCheckpointRestorePG"] ) + +py_test( + name = "tests/test_checkpoint_restore_pg_gpu", + main = "tests/test_algorithm_checkpoint_restore.py", + tags = ["team:rllib", "tests_dir", "gpu"], + size = "large", + srcs = ["tests/test_algorithm_checkpoint_restore.py"], + args = ["TestCheckpointRestorePG"] +) + py_test( name = "tests/test_checkpoint_restore_off_policy", main = "tests/test_algorithm_checkpoint_restore.py", @@ -2455,6 +2477,16 @@ py_test( args = ["TestCheckpointRestoreOffPolicy"] ) + +py_test( + name = "tests/test_checkpoint_restore_off_policy_gpu", + main = "tests/test_algorithm_checkpoint_restore.py", + tags = ["team:rllib", "tests_dir", "gpu"], + size = "large", + srcs = ["tests/test_algorithm_checkpoint_restore.py"], + args = ["TestCheckpointRestoreOffPolicy"] +) + py_test( name = "tests/test_checkpoint_restore_evolution_algos", main = "tests/test_algorithm_checkpoint_restore.py", @@ -2464,6 +2496,15 @@ py_test( args = ["TestCheckpointRestoreEvolutionAlgos"] ) +py_test( + name = "tests/test_checkpoint_restore_evolution_algos_gpu", + main = "tests/test_algorithm_checkpoint_restore.py", + tags = ["team:rllib", "tests_dir", "gpu"], + size = "medium", + srcs = ["tests/test_algorithm_checkpoint_restore.py"], + args = ["TestCheckpointRestoreEvolutionAlgos"] +) + py_test( name = "policy/tests/test_policy_checkpoint_restore", main = 
"policy/tests/test_policy_checkpoint_restore.py", @@ -2715,10 +2756,19 @@ py_test( args = ["TestSupportedSpacesPPO"] ) +py_test( + name="tests/test_supported_spaces_dqn", + main="tests/test_supported_spaces.py", + tags=["team:rllib", "tests_dir"], + size="large", + srcs=["tests/test_supported_spaces.py"], + args=["TestSupportedSpacesDQN"] +) + py_test( name = "tests/test_supported_spaces_ppo_no_preproceesor_gpu", main = "tests/test_supported_spaces.py", - tags = ["team:rllib", "tests_dir", "multi_gpu", "exclusive"], + tags = ["team:rllib", "gpu", "no_cpu"], size = "large", srcs = ["tests/test_supported_spaces.py"], args = ["TestSupportedSpacesPPONoPreprocessorGPU"] diff --git a/rllib/algorithms/a3c/a3c.py b/rllib/algorithms/a3c/a3c.py index 7f5a661cb94d..0ecee6f78134 100644 --- a/rllib/algorithms/a3c/a3c.py +++ b/rllib/algorithms/a3c/a3c.py @@ -67,7 +67,13 @@ def __init__(self, algo_class=None): self.use_critic = True self.use_gae = True self.lambda_ = 1.0 + self.grad_clip = 40.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. 
+ self.grad_clip_by = "global_norm" + self.lr_schedule = None self.vf_loss_coeff = 0.5 self.entropy_coeff = 0.01 diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index 789039f28628..d47704ae7373 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -76,6 +76,7 @@ ) from ray.rllib.utils.checkpoints import ( CHECKPOINT_VERSION, + CHECKPOINT_VERSION_LEARNER, get_checkpoint_info, try_import_msgpack, ) @@ -209,12 +210,14 @@ class Algorithm(Trainable): # List of keys that are always fully overridden if present in any dict or sub-dict _override_all_key_list = ["off_policy_estimation_methods", "policies"] - _progress_metrics = [ - "episode_reward_mean", - "evaluation/episode_reward_mean", + _progress_metrics = ( "num_env_steps_sampled", "num_env_steps_trained", - ] + "episodes_total", + "sampler_results/episode_len_mean", + "sampler_results/episode_reward_mean", + "evaluation/sampler_results/episode_reward_mean", + ) @staticmethod def from_checkpoint( @@ -456,11 +459,17 @@ def default_logger_creator(config): # (although their values may be nan), so that Tune does not complain # when we use these as stopping criteria. self.evaluation_metrics = { + # TODO: Don't dump sampler results into top-level. 
"evaluation": { "episode_reward_max": np.nan, "episode_reward_min": np.nan, "episode_reward_mean": np.nan, - } + "sampler_results": { + "episode_reward_max": np.nan, + "episode_reward_min": np.nan, + "episode_reward_mean": np.nan, + }, + }, } super().__init__( @@ -665,7 +674,9 @@ def setup(self, config: AlgorithmConfig) -> None: parallelism = self.evaluation_config.evaluation_num_workers or 1 batch_size = max(ds.count() // parallelism, 1) self.evaluation_dataset = ds.map_batches( - remove_time_dim, batch_size=batch_size + remove_time_dim, + batch_size=batch_size, + batch_format="pandas", ) logger.info("Evaluation dataset created") @@ -712,10 +723,10 @@ def setup(self, config: AlgorithmConfig) -> None: self.learner_group = None if self.config._enable_learner_api: # TODO (Kourosh): This is an interim solution where policies and modules - # co-exist. In this world we have both policy_map and MARLModule that need - # to be consistent with one another. To make a consistent parity between - # the two we need to loop through the policy modules and create a simple - # MARLModule from the RLModule within each policy. + # co-exist. In this world we have both policy_map and MARLModule that need + # to be consistent with one another. To make a consistent parity between + # the two we need to loop through the policy modules and create a simple + # MARLModule from the RLModule within each policy. local_worker = self.workers.local_worker() module_spec = local_worker.marl_module_spec learner_group_config = self.config.get_learner_group_config(module_spec) @@ -1059,6 +1070,11 @@ def duration_fn(num_units_done): keep_custom_metrics=self.config.keep_per_episode_custom_metrics, timeout_seconds=eval_cfg.metrics_episode_collection_timeout_s, ) + + # TODO: Don't dump sampler results into top-level. 
+ if not self.config.custom_evaluation_function: + metrics = dict({"sampler_results": metrics}, **metrics) + metrics[NUM_AGENT_STEPS_SAMPLED_THIS_ITER] = agent_steps_this_iter metrics[NUM_ENV_STEPS_SAMPLED_THIS_ITER] = env_steps_this_iter # TODO: Remove this key at some point. Here for backward compatibility. @@ -1256,11 +1272,14 @@ def remote_fn(worker): f"{unit} done)" ) - metrics = summarize_episodes( + sampler_results = summarize_episodes( rollout_metrics, keep_custom_metrics=eval_cfg["keep_per_episode_custom_metrics"], ) + # TODO: Don't dump sampler results into top-level. + metrics = dict({"sampler_results": sampler_results}, **sampler_results) + metrics[NUM_AGENT_STEPS_SAMPLED_THIS_ITER] = agent_steps_this_iter metrics[NUM_ENV_STEPS_SAMPLED_THIS_ITER] = env_steps_this_iter # TODO: Remove this key at some point. Here for backward compatibility. @@ -2063,6 +2082,14 @@ def save_checkpoint(self, checkpoint_dir: str) -> str: policy_state.pkl pol_2/ policy_state.pkl + learner/ + learner_state.json + module_state/ + module_1/ + ... + optimizer_state/ + optimizers_module_1/ + ... rllib_checkpoint.json algorithm_state.pkl @@ -2085,7 +2112,10 @@ def save_checkpoint(self, checkpoint_dir: str) -> str: policy_states = state["worker"].pop("policy_states", {}) # Add RLlib checkpoint version. - state["checkpoint_version"] = CHECKPOINT_VERSION + if self.config._enable_learner_api: + state["checkpoint_version"] = CHECKPOINT_VERSION_LEARNER + else: + state["checkpoint_version"] = CHECKPOINT_VERSION # Write state (w/o policies) to disk. 
state_file = os.path.join(checkpoint_dir, "algorithm_state.pkl") @@ -2116,21 +2146,24 @@ def save_checkpoint(self, checkpoint_dir: str) -> str: policy = self.get_policy(pid) policy.export_checkpoint(policy_dir, policy_state=policy_state) + # if we are using the learner API, save the learner group state + if self.config._enable_learner_api: + learner_state_dir = os.path.join(checkpoint_dir, "learner") + self.learner_group.save_state(learner_state_dir) + return checkpoint_dir @override(Trainable) - def load_checkpoint(self, checkpoint: Union[Dict, str]) -> None: + def load_checkpoint(self, checkpoint: str) -> None: # Checkpoint is provided as a directory name. # Restore from the checkpoint file or dir. - if isinstance(checkpoint, str): - checkpoint_info = get_checkpoint_info(checkpoint) - checkpoint_data = Algorithm._checkpoint_info_to_algorithm_state( - checkpoint_info - ) - # Checkpoint is a checkpoint-as-dict -> Restore state from it as-is. - else: - checkpoint_data = checkpoint + + checkpoint_info = get_checkpoint_info(checkpoint) + checkpoint_data = Algorithm._checkpoint_info_to_algorithm_state(checkpoint_info) self.__setstate__(checkpoint_data) + if self.config._enable_learner_api: + learner_state_dir = os.path.join(checkpoint, "learner") + self.learner_group.load_state(learner_state_dir) @override(Trainable) def log_result(self, result: ResultDict) -> None: @@ -2158,7 +2191,7 @@ def default_resource_request( # Default logic for RLlib Algorithms: # Create one bundle per individual worker (local or remote). - # Use `num_cpus_for_driver` and `num_gpus` for the local worker and + # Use `num_cpus_for_local_worker` and `num_gpus` for the local worker and # `num_cpus_per_worker` and `num_gpus_per_worker` for the remote # workers to determine their CPU/GPU resource needs. 
@@ -3023,22 +3056,32 @@ def _compile_iteration_results( NUM_ENV_STEPS_TRAINED, ]: results[c] = self._counters[c] + time_taken_sec = step_ctx.get_time_taken_sec() if self.config.count_steps_by == "agent_steps": results[NUM_AGENT_STEPS_SAMPLED + "_this_iter"] = step_ctx.sampled results[NUM_AGENT_STEPS_TRAINED + "_this_iter"] = step_ctx.trained + results[NUM_AGENT_STEPS_SAMPLED + "_throughput_per_sec"] = ( + step_ctx.sampled / time_taken_sec + ) + results[NUM_AGENT_STEPS_TRAINED + "_throughput_per_sec"] = ( + step_ctx.trained / time_taken_sec + ) # TODO: For CQL and other algos, count by trained steps. results["timesteps_total"] = self._counters[NUM_AGENT_STEPS_SAMPLED] - # TODO: Backward compatibility. - results[STEPS_TRAINED_THIS_ITER_COUNTER] = step_ctx.trained else: results[NUM_ENV_STEPS_SAMPLED + "_this_iter"] = step_ctx.sampled results[NUM_ENV_STEPS_TRAINED + "_this_iter"] = step_ctx.trained + results[NUM_ENV_STEPS_SAMPLED + "_throughput_per_sec"] = ( + step_ctx.sampled / time_taken_sec + ) + results[NUM_ENV_STEPS_TRAINED + "_throughput_per_sec"] = ( + step_ctx.trained / time_taken_sec + ) # TODO: For CQL and other algos, count by trained steps. results["timesteps_total"] = self._counters[NUM_ENV_STEPS_SAMPLED] - # TODO: Backward compatibility. - results[STEPS_TRAINED_THIS_ITER_COUNTER] = step_ctx.trained # TODO: Backward compatibility. + results[STEPS_TRAINED_THIS_ITER_COUNTER] = step_ctx.trained results["agent_timesteps_total"] = self._counters[NUM_AGENT_STEPS_SAMPLED] # Process timer results. 
@@ -3102,6 +3145,8 @@ def _validate_config(config, trainer_or_none): class TrainIterCtx: def __init__(self, algo: Algorithm): self.algo = algo + self.time_start = None + self.time_stop = None def __enter__(self): # Before first call to `step()`, `results` is expected to be None -> @@ -3122,7 +3167,11 @@ def __enter__(self): return self def __exit__(self, *args): - pass + self.time_stop = time.time() + + def get_time_taken_sec(self) -> float: + """Returns the time we spent in the context in seconds.""" + return self.time_stop - self.time_start def should_stop(self, results): diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 249781cda12e..241645f632a7 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -1,5 +1,4 @@ import copy -import dataclasses import logging import math import os @@ -8,6 +7,7 @@ Callable, Container, Dict, + List, Mapping, Optional, Tuple, @@ -18,7 +18,7 @@ import ray from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.core.learner.learner import LearnerHPs +from ray.rllib.core.learner.learner import LearnerHyperparameters from ray.rllib.core.learner.learner_group_config import ( LearnerGroupConfig, ModuleSpec, @@ -52,6 +52,7 @@ try_import_gymnasium_and_gym, ) from ray.rllib.utils.policy import validate_policy_id +from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.serialization import ( deserialize_type, NOT_SERIALIZABLE, @@ -316,16 +317,15 @@ def __init__(self, algo_class=None): # `self.training()` self.gamma = 0.99 self.lr = 0.001 + self.lr_schedule = None + self.grad_clip = None + self.grad_clip_by = "global_norm" self.train_batch_size = 32 self.model = copy.deepcopy(MODEL_DEFAULTS) self.optimizer = {} self.max_requests_in_flight_per_sampler_worker = 2 - self.learner_class = None + self._learner_class = None self._enable_learner_api = False - # experimental: this will contain the hyper-parameters that are 
passed to the - # Learner, for computing loss, etc. New algorithms have to set this to their - # own default. .training() will modify the fields of this object. - self._learner_hps = LearnerHPs() # `self.callbacks()` self.callbacks_class = DefaultCallbacks @@ -419,8 +419,9 @@ def __init__(self, algo_class=None): # `self.rl_module()` self.rl_module_spec = None self._enable_rl_module_api = False - # Whether to error out if exploration config is set when using RLModules. - self._validate_exploration_conf_and_rl_modules = True + # Helper to keep track of the original exploration config when dis-/enabling + # rl modules. + self.__prior_exploration_config = None # `self.experimental()` self._tf_policy_handles_more_than_one_loss = False @@ -466,10 +467,6 @@ def __init__(self, algo_class=None): self.soft_horizon = DEPRECATED_VALUE self.no_done_at_end = DEPRECATED_VALUE - @property - def learner_hps(self) -> LearnerHPs: - return self._learner_hps - def to_dict(self) -> AlgorithmConfigDict: """Converts all settings into a legacy config dict for backward compatibility. @@ -568,6 +565,13 @@ def update_from_dict( """ eval_call = {} + # We deal with this special key before all others because it may influence + # stuff like "exploration_config". + # Namely, we want to re-instantiate the exploration config this config had + # inside `self.rl_module()` before potentially overwriting it in the following. + if "_enable_rl_module_api" in config_dict: + self.rl_module(_enable_rl_module_api=config_dict["_enable_rl_module_api"]) + # Modify our properties one by one. for key, value in config_dict.items(): key = self._translate_special_keys(key, warn_deprecated=False) @@ -577,8 +581,11 @@ def update_from_dict( if key == TRIAL_INFO: continue + if key == "_enable_rl_module_api": + # We've dealt with this above. + continue # Set our multi-agent settings. 
- if key == "multiagent": + elif key == "multiagent": kwargs = { k: value[k] for k in [ @@ -840,7 +847,7 @@ def validate(self) -> None: error=True, ) - # RLModule API only works with connectors. + # RLModule API only works with connectors and with Learner API. if not self.enable_connectors and self._enable_rl_module_api: raise ValueError( "RLModule API only works with connectors. " @@ -849,39 +856,31 @@ def validate(self) -> None: ) # Learner API requires RLModule API. - if self._enable_learner_api and not self._enable_rl_module_api: + if self._enable_learner_api is not self._enable_rl_module_api: raise ValueError( - "Learner API requires RLModule API. " - "Please enable RLModule API via " - "`config.training(_enable_rl_module_api=True)`." + "Learner API requires RLModule API and vice-versa! " + "Enable RLModule API via " + "`config.rl_module(_enable_rl_module_api=True)` and the Learner API " + "via `config.training(_enable_learner_api=True)` (or set both to " + "False)." ) if bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)): - # enable RLModule API and connectors if env variable is set + # Enable RLModule API and connectors if env variable is set # (to be used in unittesting) self.rl_module(_enable_rl_module_api=True) + self.training(_enable_learner_api=True) self.enable_connectors = True - # Explore parameter cannot be False with RLModule API enabled. - # The reason is that the explore is not just a parameter that will get passed - # down to the policy.compute_actions() anymore. It is a phase in which RLModule. - # forward_exploration() will get called during smapling. If user needs to - # really disable the stochasticity during this phase, they need to override the - # RLModule.forward_exploration() method or setup model parameters such that it - # will disable the stocalisticity of this method (e.g. by setting the std to 0 - # or setting temprature to 0 for the Categorical distribution). + # LR-schedule checking. 
+ if self._enable_learner_api: + Scheduler.validate(self.lr_schedule, "lr_schedule", "learning rate") - if self._enable_rl_module_api and not self.explore: + # Validate grad clipping settings. + if self.grad_clip_by not in ["value", "norm", "global_norm"]: raise ValueError( - "When RLModule API is enabled, explore parameter cannot be False. " - "Please set explore=None or disable RLModule API via " - "`config.rl_module(_enable_rl_module_api=False)`." - "If you want to disable the stochasticity during the exploration " - "phase, you can customize your RLModule and override the RLModule." - "forward_exploration() method " - "or setup model parameters such that it will disable the " - "stochasticity of this method (e.g. by setting the std to 0 or " - "setting temperature to 0 for the Categorical distribution)." + f"`grad_clip_by` ({self.grad_clip_by}) must be one of: 'value', " + "'norm', or 'global_norm'!" ) # TODO: Deprecate self.simple_optimizer! @@ -1002,25 +1001,16 @@ def validate(self) -> None: self.rl_module_spec = default_rl_module_spec if self.exploration_config: - if self._validate_exploration_conf_and_rl_modules: - # This is not compatible with RLModules, which have a method - # `forward_exploration` to specify custom exploration behavior. - raise ValueError( - "When RLModule API are enabled, exploration_config can not be " - "set. If you want to implement custom exploration behaviour, " - "please modify the `forward_exploration` method of the " - "RLModule at hand. On configs that have a default exploration " - "config, this must be done with " - "`config.exploration_config={}`." - ) - else: - # RLModules don't support exploration_configs anymore. - # AlgorithmConfig has a default exploration config. - logger.warning( - "When RLModule API are enabled, exploration_config " - "will be ignored. Disable RLModule API make use of an " - "exploration_config." 
- ) + # This is not compatible with RLModules, which have a method + # `forward_exploration` to specify custom exploration behavior. + raise ValueError( + "When RLModule API are enabled, exploration_config can not be " + "set. If you want to implement custom exploration behaviour, " + "please modify the `forward_exploration` method of the " + "RLModule at hand. On configs that have a default exploration " + "config, this must be done with " + "`config.exploration_config={}`." + ) # make sure the resource requirements for learner_group is valid if self.num_learner_workers == 0 and self.num_gpus_per_worker > 1: @@ -1029,11 +1019,6 @@ def validate(self) -> None: "(i.e. num_learner_workers = 0)" ) - # resolve learner class - if self._enable_learner_api and self.learner_class is None: - learner_class_path = self.get_default_learner_class() - self.learner_class = deserialize_type(learner_class_path) - def build( self, env: Optional[Union[str, EnvType]] = None, @@ -1589,8 +1574,12 @@ def rollouts( def training( self, + *, gamma: Optional[float] = NotProvided, lr: Optional[float] = NotProvided, + lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + grad_clip: Optional[float] = NotProvided, + grad_clip_by: Optional[str] = NotProvided, train_batch_size: Optional[int] = NotProvided, model: Optional[dict] = NotProvided, optimizer: Optional[dict] = NotProvided, @@ -1603,6 +1592,34 @@ def training( Args: gamma: Float specifying the discount factor of the Markov Decision process. lr: The default learning rate. + lr_schedule: Learning rate schedule. In the format of + [[timestep, lr-value], [timestep, lr-value], ...] + Intermediary timesteps will be assigned to interpolated learning rate + values. A schedule config's first entry must start with timestep 0, + i.e.: [[0, initial_value], [...]]. + grad_clip: The value to use for gradient clipping. 
Depending on the + `grad_clip_by` setting, gradients will either be clipped by value, + norm, or global_norm (see docstring on `grad_clip_by` below for more + details). If `grad_clip` is None, gradients will be left unclipped. + grad_clip_by: If 'value': Will clip all computed gradients individually + inside the interval [-grad_clip, +grad_clip]. + If 'norm', will compute the L2-norm of each weight/bias + gradient tensor and then clip all gradients such that this L2-norm does + not exceed `grad_clip`. The L2-norm of a tensor is computed via: + `sqrt(SUM(w0^2, w1^2, ..., wn^2))` where w[i] are the elements of the + tensor (no matter what the shape of this tensor is). + If 'global_norm', will compute the square of the L2-norm of each + weight/bias gradient tensor, sum up all these squared L2-norms across + all given gradient tensors (e.g. the entire module to + be updated), square root that overall sum, and then clip all gradients + such that this "global" L2-norm does not exceed the given value. + The global L2-norm over a list of tensors (e.g. W and V) is computed + via: + `sqrt[SUM(w0^2, w1^2, ..., wn^2) + SUM(v0^2, v1^2, ..., vm^2)]`, where + w[i] and v[j] are the elements of the tensors W and V (no matter what + the shapes of these tensors are). + Note that if `grad_clip` is None, the `grad_clip_by` setting has no + effect. train_batch_size: Training batch size, if applicable. model: Arguments passed into the policy model. See models/catalog.py for a full list of the available model options. 
@@ -1631,6 +1648,12 @@ def training( self.gamma = gamma if lr is not NotProvided: self.lr = lr + if lr_schedule is not NotProvided: + self.lr_schedule = lr_schedule + if grad_clip is not NotProvided: + self.grad_clip = grad_clip + if grad_clip_by is not NotProvided: + self.grad_clip_by = grad_clip_by if train_batch_size is not NotProvided: self.train_batch_size = train_batch_size if model is not NotProvided: @@ -1650,7 +1673,7 @@ def training( deprecation_warning( old="AlgorithmConfig.training(_use_default_native_models=True)", help="_use_default_native_models is not supported " - "anymore. To get rid of this error, set `experimental(" + "anymore. To get rid of this error, set `rl_module(" "_enable_rl_module_api` to True. Native models will " "be better supported by the upcoming RLModule API.", # Error out if user tries to enable this @@ -1666,7 +1689,7 @@ def training( if _enable_learner_api is not NotProvided: self._enable_learner_api = _enable_learner_api if learner_class is not NotProvided: - self.learner_class = learner_class + self._learner_class = learner_class return self @@ -2420,7 +2443,26 @@ def rl_module( "config, this must be done with " "`config.exploration_config={}`." ) + self.__prior_exploration_config = self.exploration_config self.exploration_config = {} + elif _enable_rl_module_api is False and not self.exploration_config: + if self.__prior_exploration_config is not None: + logger.warning( + f"Setting `exploration_config=" + f"{self.__prior_exploration_config}` because you set " + f"`_enable_rl_modules=False`. This exploration config was " + f"restored from a prior exploration config that was overriden " + f"when setting `_enable_rl_modules=True`. This occurs because " + f"when RLModule API are enabled, exploration_config can not " + f"be set." 
+ ) + self.exploration_config = self.__prior_exploration_config + self.__prior_exploration_config = None + else: + logger.warning( + "config._enable_rl_module_api was set to False, but no prior " + "exploration config was found to be restored." + ) else: # throw a warning if the user has used this API but not enabled it. logger.warning( @@ -2485,6 +2527,20 @@ def experimental( return self + @property + def learner_class(self) -> Type["Learner"]: + """Returns the Learner sub-class to use by this Algorithm. + + Either + a) User sets a specific learner class via calling `.training(learner_class=...)` + b) User leaves learner class unset (None) and the AlgorithmConfig itself + figures out the actual learner class by calling its own + `.get_default_learner_class()` method. + """ + return self._learner_class or self.get_default_learner_class() + + # TODO: Make rollout_fragment_length as read-only property and replace the current + # self.rollout_fragment_length a private variable. def get_rollout_fragment_length(self, worker_index: int = 0) -> int: """Automatically infers a proper rollout_fragment_length setting if "auto". @@ -2520,6 +2576,8 @@ def get_rollout_fragment_length(self, worker_index: int = 0) -> int: else: return self.rollout_fragment_length + # TODO: Make evaluation_config as read-only property and replace the current + # self.evaluation_config a private variable. def get_evaluation_config_object( self, ) -> Optional["AlgorithmConfig"]: @@ -2813,6 +2871,8 @@ def is_policy_to_train(pid, batch=None): return policies, is_policy_to_train + # TODO: Move this to those algorithms that really need this, which is currently + # only A2C and PG. def validate_train_batch_size_vs_rollout_fragment_length(self) -> None: """Detects mismatches for `train_batch_size` vs `rollout_fragment_length`. 
@@ -3066,10 +3126,15 @@ def get_learner_group_config(self, module_spec: ModuleSpec) -> LearnerGroupConfi .learner( learner_class=self.learner_class, # TODO (Kourosh): optimizer config can now be more complicated. + # TODO (Sven): Shouldn't optimizer config be part of learner HPs? + # E.g. if we have a lr schedule, this will have to be managed by + # the learner, NOT the optimizer directly. optimizer_config={ "lr": self.lr, + "grad_clip": self.grad_clip, + "grad_clip_by": self.grad_clip_by, }, - learner_hps=self.learner_hps, + learner_hyperparameters=self.get_learner_hyperparameters(), ) .resources( num_learner_workers=self.num_learner_workers, @@ -3082,6 +3147,20 @@ def get_learner_group_config(self, module_spec: ModuleSpec) -> LearnerGroupConfi return config + def get_learner_hyperparameters(self) -> LearnerHyperparameters: + """Returns a new LearnerHyperparameters instance for the respective Learner. + + The LearnerHyperparameters is a dataclass containing only those config settings + from AlgorithmConfig that are used by the algorithm's specific Learner + sub-class. They allow distributing only those settings relevant for learning + across a set of learner workers (instead of having to distribute the entire + AlgorithmConfig object). + + Note that LearnerHyperparameters should always be derived directly from a + AlgorithmConfig object's own settings and considered frozen/read-only. + """ + return LearnerHyperparameters(lr_schedule=self.lr_schedule) + def __setattr__(self, key, value): """Gatekeeper in case we are in frozen state and need to error.""" @@ -3186,10 +3265,6 @@ def _serialize_dict(config): config["model"]["custom_model"] ) - # Serialize dataclasses. - if isinstance(config.get("_learner_hps"), LearnerHPs): - config["_learner_hps"] = dataclasses.asdict(config["_learner_hps"]) - # List'ify `policies`, iff a set or tuple (these types are not JSON'able). 
ma_config = config.get("multiagent") if ma_config is not None: diff --git a/rllib/algorithms/alpha_star/alpha_star.py b/rllib/algorithms/alpha_star/alpha_star.py index 02d05fcc4324..09a4d6039289 100644 --- a/rllib/algorithms/alpha_star/alpha_star.py +++ b/rllib/algorithms/alpha_star/alpha_star.py @@ -138,7 +138,6 @@ def __init__(self, algo_class=None): # Override some of APPOConfig's default values with AlphaStar-specific # values. - self.vtrace_drop_last_ts = False self.min_time_s_per_iteration = 2 self.policies = None self.simple_optimizer = True diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 1750457236b5..8b27ef5100cc 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -9,14 +9,17 @@ Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#appo """ +import dataclasses from typing import Optional, Type import logging from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.algorithms.appo.appo_learner import ( + AppoHyperparameters, + LEARNER_RESULTS_KL_KEY, +) from ray.rllib.algorithms.impala.impala import Impala, ImpalaConfig -from ray.rllib.algorithms.appo.tf.appo_tf_learner import AppoHPs, LEARNER_RESULTS_KL_KEY from ray.rllib.algorithms.ppo.ppo import UpdateKL -from ray.rllib.execution.common import _get_shared_metrics, STEPS_SAMPLED_COUNTER from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.policy import Policy from ray.rllib.utils.annotations import override @@ -25,8 +28,6 @@ NUM_AGENT_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED, NUM_TARGET_UPDATES, - NUM_ENV_STEPS_TRAINED, - NUM_AGENT_STEPS_TRAINED, ) from ray.rllib.utils.metrics import ALL_MODULES, LEARNER_STATS_KEY from ray.rllib.utils.typing import ( @@ -78,7 +79,6 @@ def __init__(self, algo_class=None): # __sphinx_doc_begin__ # APPO specific settings: - self._learner_hps = AppoHPs() self.vtrace = True self.use_critic = True self.use_gae = True @@ 
-104,7 +104,13 @@ def __init__(self, algo_class=None): self.learner_queue_timeout = 300 self.max_sample_requests_in_flight_per_worker = 2 self.broadcast_interval = 1 + self.grad_clip = 40.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + self.opt_type = "adam" self.lr = 0.0005 self.lr_schedule = None @@ -190,76 +196,72 @@ def training( self.lambda_ = lambda_ if clip_param is not NotProvided: self.clip_param = clip_param - self._learner_hps.clip_param = clip_param if use_kl_loss is not NotProvided: self.use_kl_loss = use_kl_loss if kl_coeff is not NotProvided: self.kl_coeff = kl_coeff - self._learner_hps.kl_coeff = kl_coeff if kl_target is not NotProvided: self.kl_target = kl_target - self._learner_hps.kl_target = kl_target if tau is not NotProvided: self.tau = tau - self._learner_hps.tau = tau if target_update_frequency is not NotProvided: self.target_update_frequency = target_update_frequency return self - @override(AlgorithmConfig) + @override(ImpalaConfig) def get_default_learner_class(self): - if self.framework_str == "tf2": + if self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_learner import ( + APPOTorchLearner, + ) + + return APPOTorchLearner + elif self.framework_str == "tf2": from ray.rllib.algorithms.appo.tf.appo_tf_learner import APPOTfLearner return APPOTfLearner else: raise ValueError(f"The framework {self.framework_str} is not supported.") - @override(AlgorithmConfig) + @override(ImpalaConfig) def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: - if self.framework_str == "tf2": - from ray.rllib.algorithms.appo.appo_catalog import APPOCatalog - from ray.rllib.algorithms.appo.tf.appo_tf_rl_module import APPOTfRLModule - - return SingleAgentRLModuleSpec( - module_class=APPOTfRLModule, catalog_class=APPOCatalog + if 
self.framework_str == "torch": + from ray.rllib.algorithms.appo.torch.appo_torch_rl_module import ( + APPOTorchRLModule as RLModule, + ) + elif self.framework_str == "tf2": + from ray.rllib.algorithms.appo.tf.appo_tf_rl_module import ( + APPOTfRLModule as RLModule, ) else: raise ValueError(f"The framework {self.framework_str} is not supported.") + from ray.rllib.algorithms.appo.appo_catalog import APPOCatalog + + return SingleAgentRLModuleSpec(module_class=RLModule, catalog_class=APPOCatalog) + @override(ImpalaConfig) - def validate(self) -> None: - super().validate() - self._learner_hps.tau = self.tau - self._learner_hps.kl_target = self.kl_target - self._learner_hps.kl_coeff = self.kl_coeff - self._learner_hps.clip_param = self.clip_param + def get_learner_hyperparameters(self) -> AppoHyperparameters: + base_hps = super().get_learner_hyperparameters() + return AppoHyperparameters( + use_kl_loss=self.use_kl_loss, + kl_target=self.kl_target, + kl_coeff=self.kl_coeff, + clip_param=self.clip_param, + tau=self.tau, + target_update_frequency_ts=( + self.train_batch_size * self.num_sgd_iter * self.target_update_frequency + ), + **dataclasses.asdict(base_hps), + ) +# Still used by one of the old checkpoints in tests. +# Keep a shim version of this around. 
class UpdateTargetAndKL: def __init__(self, workers, config): - self.workers = workers - self.config = config - self.update_kl = UpdateKL(workers) - self.target_update_freq = ( - config["num_sgd_iter"] * config["minibatch_buffer_size"] - ) - - def __call__(self, fetches): - metrics = _get_shared_metrics() - cur_ts = metrics.counters[STEPS_SAMPLED_COUNTER] - last_update = metrics.counters[LAST_TARGET_UPDATE_TS] - if cur_ts - last_update > self.target_update_freq: - metrics.counters[NUM_TARGET_UPDATES] += 1 - metrics.counters[LAST_TARGET_UPDATE_TS] = cur_ts - # Update Target Network - self.workers.local_worker().foreach_policy_to_train( - lambda p, _: p.update_target() - ) - # Also update KL Coeff - if self.config.use_kl_loss: - self.update_kl(fetches) + pass class APPO(Impala): @@ -289,9 +291,8 @@ def setup(self, config: AlgorithmConfig): def after_train_step(self, train_results: ResultDict) -> None: """Updates the target network and the KL coefficient for the APPO-loss. - This method is called from within the `training_iteration` method after each - train update. - + This method is called from within the `training_step` method after each train + update. The target network update frequency is calculated automatically by the product of `num_sgd_iter` setting (usually 1 for APPO) and `minibatch_buffer_size`. @@ -300,43 +301,14 @@ def after_train_step(self, train_results: ResultDict) -> None: training step. """ - last_update = self._counters[LAST_TARGET_UPDATE_TS] - - if self.config._enable_learner_api and train_results: - # using steps trained here instead of sampled ... I'm not sure why the - # other implemenetation uses sampled. - # to be quite frank, im not sure if I understand how their target update - # freq would work. The difference in steps sampled/trained is pretty - # much always going to be larger than self.config.num_sgd_iter * - # self.config.minibatch_buffer_size unless the number of steps collected - # is really small. 
The thing is that the default rollout fragment length - # is 50, so the minibatch buffer size * num_sgd_iter is going to be - # have to be 50 to even meet the threshold of having delayed target - # updates. - # we should instead have the target / kl threshold update be based off - # of the train_batch_size * some target update frequency * num_sgd_iter. - cur_ts = self._counters[ - NUM_ENV_STEPS_TRAINED - if self.config.count_steps_by == "env_steps" - else NUM_AGENT_STEPS_TRAINED - ] - target_update_steps_freq = ( - self.config.train_batch_size - * self.config.num_sgd_iter - * self.config.target_update_frequency - ) - if (cur_ts - last_update) >= target_update_steps_freq: - kls_to_update = {} - for module_id, module_results in train_results.items(): - if module_id != ALL_MODULES: - kls_to_update[module_id] = module_results[LEARNER_STATS_KEY][ - LEARNER_RESULTS_KL_KEY - ] - self._counters[NUM_TARGET_UPDATES] += 1 - self._counters[LAST_TARGET_UPDATE_TS] = cur_ts - self.learner_group.additional_update(sampled_kls=kls_to_update) - + if self.config._enable_learner_api: + if NUM_TARGET_UPDATES in train_results: + self._counters[NUM_TARGET_UPDATES] += train_results[NUM_TARGET_UPDATES] + self._counters[LAST_TARGET_UPDATE_TS] = train_results[ + LAST_TARGET_UPDATE_TS + ] else: + last_update = self._counters[LAST_TARGET_UPDATE_TS] cur_ts = self._counters[ NUM_AGENT_STEPS_SAMPLED if self.config.count_steps_by == "agent_steps" @@ -378,6 +350,17 @@ def update(pi, pi_id): # Worker. 
self.workers.local_worker().foreach_policy_to_train(update) + @override(Impala) + def _get_additional_update_kwargs(self, train_results) -> dict: + return dict( + last_update=self._counters[LAST_TARGET_UPDATE_TS], + mean_kl_loss_per_module={ + mid: r[LEARNER_STATS_KEY][LEARNER_RESULTS_KL_KEY] + for mid, r in train_results.items() + if mid != ALL_MODULES + }, + ) + @override(Impala) def training_step(self) -> ResultDict: train_results = super().training_step() @@ -398,15 +381,9 @@ def get_default_policy_class( cls, config: AlgorithmConfig ) -> Optional[Type[Policy]]: if config["framework"] == "torch": - if config._enable_rl_module_api: - raise ValueError( - "APPO with the torch backend is not yet supported by " - " the RLModule and Learner API." - ) - else: - from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy + from ray.rllib.algorithms.appo.appo_torch_policy import APPOTorchPolicy - return APPOTorchPolicy + return APPOTorchPolicy elif config["framework"] == "tf": if config._enable_rl_module_api: raise ValueError( @@ -418,13 +395,6 @@ def get_default_policy_class( return APPOTF1Policy else: - if config._enable_rl_module_api: - # TODO(avnishn): This policy class doesn't work just yet - from ray.rllib.algorithms.appo.tf.appo_tf_policy_rlm import ( - APPOTfPolicyWithRLModule, - ) - - return APPOTfPolicyWithRLModule from ray.rllib.algorithms.appo.appo_tf_policy import APPOTF2Policy return APPOTF2Policy diff --git a/rllib/algorithms/appo/appo_catalog.py b/rllib/algorithms/appo/appo_catalog.py index b675cba4b9cd..4c9e14244570 100644 --- a/rllib/algorithms/appo/appo_catalog.py +++ b/rllib/algorithms/appo/appo_catalog.py @@ -10,7 +10,7 @@ class APPOCatalog(PPOCatalog): - Value Function Head: The head used to compute the value function. The ActorCriticEncoder is a wrapper around Encoders to produce separate outputs - for the policy and value function. See implementations of PPORLModuleBase for + for the policy and value function. 
See implementations of PPORLModule for more details. Any custom ActorCriticEncoder can be built by overriding the diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py new file mode 100644 index 000000000000..8fa026b2ff61 --- /dev/null +++ b/rllib/algorithms/appo/appo_learner.py @@ -0,0 +1,131 @@ +import abc +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Mapping + +from ray.rllib.algorithms.impala.impala_learner import ( + ImpalaLearner, + ImpalaHyperparameters, +) +from ray.rllib.core.rl_module.marl_module import ModuleID +from ray.rllib.utils.annotations import override +from ray.rllib.utils.metrics import LAST_TARGET_UPDATE_TS, NUM_TARGET_UPDATES + + +LEARNER_RESULTS_KL_KEY = "mean_kl_loss" +LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" +OLD_ACTION_DIST_KEY = "old_action_dist" +OLD_ACTION_DIST_LOGITS_KEY = "old_action_dist_logits" + + +@dataclass +class AppoHyperparameters(ImpalaHyperparameters): + """Hyperparameters for the APPOLearner sub-classes (framework specific). + + These should never be set directly by the user. Instead, use the APPOConfig + class to configure your algorithm. + See `ray.rllib.algorithms.appo.appo::APPOConfig::training()` for more details on the + individual properties. + """ + + use_kl_loss: bool = None + kl_coeff: float = None + kl_target: float = None + clip_param: float = None + tau: float = None + target_update_frequency_ts: int = None + + +class AppoLearner(ImpalaLearner): + """Adds KL coeff updates via `additional_updates_per_module()` to Impala logic. + + Framework-specific sub-classes must override `_update_module_target_networks()` + and `_update_module_kl_coeff()` + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # We need to make sure kl_coeff are available as framework tensors that are + # registered as part of the graph so that upon update the graph can be updated + # (e.g. 
in TF with eager tracing). + self.curr_kl_coeffs_per_module = defaultdict( + lambda: self._get_tensor_variable(self.hps.kl_coeff) + ) + + @override(ImpalaLearner) + def remove_module(self, module_id: str): + super().remove_module(module_id) + self.curr_kl_coeffs_per_module.pop(module_id) + + @override(ImpalaLearner) + def additional_update_per_module( + self, + module_id: ModuleID, + *, + last_update: int, + mean_kl_loss_per_module: dict, + timestep: int, + **kwargs, + ) -> Mapping[str, Any]: + """Updates the target networks and KL loss coefficients (per module). + + Args: + module_id: + """ + # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure + # why the other implementation uses sampled. + # The difference in steps sampled/trained is pretty + # much always going to be larger than self.config.num_sgd_iter * + # self.config.minibatch_buffer_size unless the number of steps collected + # is really small. The thing is that the default rollout fragment length + # is 50, so the minibatch buffer size * num_sgd_iter is going to be + # have to be 50 to even meet the threshold of having delayed target + # updates. + # We should instead have the target / kl threshold update be based off + # of the train_batch_size * some target update frequency * num_sgd_iter. 
+ results = super().additional_update_per_module(module_id, timestep=timestep) + + if (timestep - last_update) >= self.hps.target_update_frequency_ts: + self._update_module_target_networks(module_id) + results[NUM_TARGET_UPDATES] = 1 + results[LAST_TARGET_UPDATE_TS] = timestep + else: + results[NUM_TARGET_UPDATES] = 0 + results[LAST_TARGET_UPDATE_TS] = last_update + + if self.hps.use_kl_loss and module_id in mean_kl_loss_per_module: + results.update( + self._update_module_kl_coeff( + module_id, mean_kl_loss_per_module[module_id] + ) + ) + + return results + + @abc.abstractmethod + def _update_module_target_networks(self, module_id: ModuleID) -> None: + """Update the target policy of each module with the current policy. + + Do that update via polyak averaging. + + Args: + module_id: The module ID, whose target network(s) need to be updated. + """ + + @abc.abstractmethod + def _update_module_kl_coeff( + self, module_id: ModuleID, sampled_kl: float + ) -> Mapping[str, Any]: + """Dynamically update the KL loss coefficients of each module with. + + The update is completed using the mean KL divergence between the action + distributions current policy and old policy of each module. That action + distribution is computed during the most recent update/call to `compute_loss`. + + Args: + module_id: The module whose KL loss coefficient to update. + sampled_kl: The computed KL loss for the given Module + (KL divergence between the action distributions of the current + (most recently updated) module and the old module version). + """ diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 9519cf28cfb8..8441f8032ede 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -48,8 +48,7 @@ logger = logging.getLogger(__name__) -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. 
+# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. def get_appo_tf_policy(name: str, base: type) -> type: """Construct an APPOTFPolicy inheriting either dynamic or eager base policies. @@ -82,10 +81,15 @@ def __init__( # First thing first, enable eager execution if necessary. base.enable_eager_execution_if_necessary() - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not config.get("_enable_learner_api", False): + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) # Initialize base class. base.__init__( @@ -105,7 +109,9 @@ def __init__( ) ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - GradStatsMixin.__init__(self) + + if not config.get("_enable_learner_api", False): + GradStatsMixin.__init__(self) # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 2c65ed8d183c..4a7754830f32 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -54,8 +54,7 @@ logger = logging.getLogger(__name__) -# We need this builder function because we want to share the same -# custom logics between TF1 dynamic and TF2 eager policies. +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. 
class APPOTorchPolicy( VTraceOptimizer, LearningRateSchedule, @@ -70,9 +69,15 @@ class APPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.appo.appo.APPOConfig().to_dict(), **config) - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceOptimizer.__init__(self) + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not config.get("_enable_learner_api", False): + # Although this is a no-op, we call __init__ here to make it clear + # that base.__init__ will use the make_model() call. + VTraceOptimizer.__init__(self) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) TorchPolicyV2.__init__( @@ -89,7 +94,6 @@ def __init__(self, observation_space, action_space, config): ValueNetworkMixin.__init__(self, config) KLCoeffMixin.__init__(self, config) - # TODO: Don't require users to call this manually. self._initialize_loss_from_dummy_batch() # Initiate TargetNetwork ops after loss initialization. 
diff --git a/rllib/algorithms/appo/tests/__init__.py b/rllib/algorithms/appo/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/algorithms/appo/tests/tf/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py similarity index 76% rename from rllib/algorithms/appo/tests/tf/test_appo_learner.py rename to rllib/algorithms/appo/tests/test_appo_learner.py index b0b2989cd8ba..1bc1bd1b0a08 100644 --- a/rllib/algorithms/appo/tests/tf/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -1,6 +1,8 @@ import unittest import numpy as np +import tree # pip install dm_tree + import ray import ray.rllib.algorithms.appo as appo from ray.rllib.algorithms.appo.tf.appo_tf_learner import ( @@ -8,10 +10,10 @@ ) from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch, DEFAULT_POLICY_ID -from ray.rllib.utils.metrics import ALL_MODULES -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.torch_utils import convert_to_torch_tensor tf1, tf, _ = try_import_tf() @@ -66,29 +68,30 @@ def test_appo_loss(self): fcnet_activation="linear", vf_share_layers=False, ), + _enable_learner_api=True, ) .rl_module( _enable_rl_module_api=True, ) ) # We have to set exploration_config here manually because setting it through - # config.exploration() only deepupdates it + # config.exploration() only deep-updates it config.exploration_config = {} - for fw in framework_iterator(config, ("tf2")): - trainer = config.build() - policy = trainer.get_policy() + for fw in framework_iterator(config, frameworks=("torch", "tf2")): + algo = config.build() + policy = algo.get_policy() if fw 
== "tf2": train_batch = SampleBatch( - tf.nest.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) + tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) ) else: - train_batch = SampleBatch(FAKE_BATCH) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) + train_batch = SampleBatch( + tree.map_structure(lambda x: convert_to_torch_tensor(x), FAKE_BATCH) + ) algo_config = config.copy(copy_frozen=False) - algo_config.training(_enable_learner_api=True) algo_config.validate() algo_config.freeze() @@ -103,17 +106,19 @@ def test_appo_loss(self): ) learner_group_config.num_learner_workers = 0 learner_group = learner_group_config.build() - learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) - learner_group_loss = results[ALL_MODULES]["total_loss"] + learner_group.set_weights(algo.get_weights()) + learner_group.update(train_batch.as_multi_agent()) - check(learner_group_loss, policy_loss) + algo.stop() def test_kl_coeff_changes(self): initial_kl_coeff = 0.01 config = ( appo.APPOConfig() .environment("CartPole-v1") + .framework(eager_tracing=True) + # Asynchronous Algo, make sure we have some results after 1 iteration. + .reporting(min_time_s_per_iteration=10) .rollouts( num_rollout_workers=0, rollout_fragment_length=frag_length, @@ -127,6 +132,7 @@ def test_kl_coeff_changes(self): vf_share_layers=False, ), _enable_learner_api=True, + use_kl_loss=True, kl_coeff=initial_kl_coeff, ) .rl_module( @@ -134,17 +140,17 @@ def test_kl_coeff_changes(self): ) .exploration(exploration_config={}) ) - for _ in framework_iterator(config, "tf2", with_eager_tracing=True): + for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() # Call train while results aren't returned because this is # a asynchronous trainer and results are returned asynchronously. 
- while 1: + while True: results = algo.train() - if results and "info" in results and LEARNER_INFO in results["info"]: + if results.get("info", {}).get(LEARNER_INFO, {}).get(DEFAULT_POLICY_ID): break curr_kl_coeff = results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][ - LEARNER_STATS_KEY - ][LEARNER_RESULTS_CURR_KL_COEFF_KEY] + LEARNER_RESULTS_CURR_KL_COEFF_KEY + ] self.assertNotEqual(curr_kl_coeff, initial_kl_coeff) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 0bd99214255b..dbf8d8d418e4 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -1,15 +1,16 @@ -from collections import defaultdict -from dataclasses import dataclass from typing import Any, Dict, Mapping from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.algorithms.appo.tf.appo_tf_rl_module import OLD_ACTION_DIST_KEY +from ray.rllib.algorithms.appo.appo_learner import ( + AppoLearner, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_KL_KEY, + OLD_ACTION_DIST_LOGITS_KEY, +) from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.algorithms.impala.impala_base_learner import ImpalaHPs -from ray.rllib.algorithms.impala.tf.impala_tf_learner import ImpalaTfLearner from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY -from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.core.learner.tf.tf_learner import TfLearner +from ray.rllib.core.rl_module.marl_module import ModuleID from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.typing import TensorType @@ -17,77 +18,21 @@ _, tf, _ = try_import_tf() -LEARNER_RESULTS_KL_KEY = "mean_kl_loss" -LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" - - -@dataclass -class AppoHPs(ImpalaHPs): - """Hyper-parameters for APPO. 
- - Attributes: - rollout_frag_or_episode_len: The length of a rollout fragment or episode. - Used when making SampleBatches time major for computing loss. - recurrent_seq_len: The length of a recurrent sequence. Used when making - SampleBatches time major for computing loss. - discount_factor: The discount factor to use for computing returns. - vtrace_clip_rho_threshold: The rho threshold to use for clipping the - importance weights. - vtrace_clip_pg_rho_threshold: The rho threshold to use for clipping the - importance weights when computing the policy_gradient loss. - vtrace_drop_last_ts: Whether to drop the last timestep when computing the loss. - This is useful for stabilizing the loss. - NOTE: This shouldn't be True when training on environments where the rewards - come at the end of the episode. - vf_loss_coeff: The amount to weight the value function loss by when computing - the total loss. - entropy_coeff: The amount to weight the average entropy of the actions in the - SampleBatch towards the total_loss for module updates. The higher this - coefficient, the more that the policy network will be encouraged to output - distributions with higher entropy/std deviation, which will encourage - greater exploration. - kl_target: The target kl divergence loss coefficient to use for the KL loss. - kl_coeff: The coefficient to weight the KL divergence between the old policy - and the target policy towards the total loss for module updates. - tau: The factor by which to update the target policy network towards - the current policy network. Can range between 0 and 1. - e.g. updated_param = tau * current_param + (1 - tau) * target_param - - """ - - kl_target: float = 0.01 - kl_coeff: float = 0.1 - clip_param = 0.2 - tau = 1.0 - - -class APPOTfLearner(ImpalaTfLearner): - """Implements APPO loss / update logic on top of ImpalaTfLearner. 
- - This class implements the APPO loss under `_compute_loss_per_module()` and - implements the target network and KL coefficient updates under - `additional_updates_per_module()` - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.kl_target = self._hps.kl_target - self.clip_param = self._hps.clip_param - # TODO: (avnishn) Make creating the kl coeff a utility function when we add - # torch APPO as well. - self.kl_coeffs = defaultdict( - lambda: tf.Variable(self._hps.kl_coeff, trainable=False, dtype=tf.float32) - ) - self.tau = self._hps.tau +class APPOTfLearner(AppoLearner, TfLearner): + """Implements APPO loss / update logic on top of ImpalaTfLearner.""" @override(TfLearner) def compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: values = fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] - old_target_policy_dist = fwd_out[OLD_ACTION_DIST_KEY] - + action_dist_cls_train = self._module[module_id].get_train_action_dist_cls() + target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + old_target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[OLD_ACTION_DIST_LOGITS_KEY] + ) old_target_policy_actions_logp = old_target_policy_dist.logp( batch[SampleBatch.ACTIONS] ) @@ -96,34 +41,29 @@ def compute_loss_per_module( behaviour_actions_logp_time_major = make_time_major( behaviour_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) target_actions_logp_time_major = make_time_major( target_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + 
recurrent_seq_len=self.hps.recurrent_seq_len, ) old_actions_logp_time_major = make_time_major( old_target_policy_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) values_time_major = make_time_major( values, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) bootstrap_value = values_time_major[-1] rewards_time_major = make_time_major( batch[SampleBatch.REWARDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) # the discount factor that is used should be gamma except for timesteps where @@ -133,22 +73,23 @@ def compute_loss_per_module( - tf.cast( make_time_major( batch[SampleBatch.TERMINATEDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ), dtype=tf.float32, ) - ) * self.discount_factor + ) * self.hps.discount_factor + + # Note that vtrace will compute the main loop on the CPU for better performance. 
vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( target_action_log_probs=old_actions_logp_time_major, behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, rewards=rewards_time_major, values=values_time_major, bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.vtrace_clip_pg_rho_threshold, - clip_rho_threshold=self.vtrace_clip_rho_threshold, - discounts=discounts_time_major, + clip_pg_rho_threshold=self.hps.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=self.hps.vtrace_clip_rho_threshold, ) # The policy gradients loss. @@ -167,12 +108,17 @@ def compute_loss_per_module( pg_advantages * logp_ratio, ( pg_advantages - * tf.clip_by_value(logp_ratio, 1 - self.clip_param, 1 + self.clip_param) + * tf.clip_by_value( + logp_ratio, 1 - self.hps.clip_param, 1 + self.hps.clip_param + ) ), ) - action_kl = old_target_policy_dist.kl(target_policy_dist) - mean_kl_loss = tf.math.reduce_mean(action_kl) + if self.hps.use_kl_loss: + action_kl = old_target_policy_dist.kl(target_policy_dist) + mean_kl_loss = tf.math.reduce_mean(action_kl) + else: + mean_kl_loss = 0.0 mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) # The baseline loss. @@ -180,39 +126,32 @@ def compute_loss_per_module( mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) # The entropy loss. - mean_entropy_loss = -tf.math.reduce_mean(target_actions_logp_time_major) + mean_entropy_loss = -tf.math.reduce_mean(target_policy_dist.entropy()) # The summed weighted loss. 
total_loss = ( mean_pi_loss - + (mean_vf_loss * self.vf_loss_coeff) - + (mean_entropy_loss * self.entropy_coeff) - + (mean_kl_loss * self.kl_coeffs[module_id]) + + (mean_vf_loss * self.hps.vf_loss_coeff) + + ( + mean_entropy_loss + * self.entropy_coeff_scheduler.get_current_value(module_id) + ) + + (mean_kl_loss * self.curr_kl_coeffs_per_module[module_id]) ) return { self.TOTAL_LOSS_KEY: total_loss, POLICY_LOSS_KEY: mean_pi_loss, VF_LOSS_KEY: mean_vf_loss, - ENTROPY_KEY: mean_entropy_loss, + ENTROPY_KEY: -mean_entropy_loss, LEARNER_RESULTS_KL_KEY: mean_kl_loss, - LEARNER_RESULTS_CURR_KL_COEFF_KEY: self.kl_coeffs[module_id], + LEARNER_RESULTS_CURR_KL_COEFF_KEY: ( + self.curr_kl_coeffs_per_module[module_id] + ), } - @override(ImpalaTfLearner) - def remove_module(self, module_id: str): - super().remove_module(module_id) - self.kl_coeffs.pop(module_id) - + @override(AppoLearner) def _update_module_target_networks(self, module_id: ModuleID): - """Update the target policy of each module with the current policy. - - Do that update via polyak averaging. - - Args: - module_id: The module whose target networks need to be updated. - - """ module = self.module[module_id] target_current_network_pairs = module.get_target_network_pairs() @@ -220,43 +159,24 @@ def _update_module_target_networks(self, module_id: ModuleID): for old_var, current_var in zip( target_network.variables, current_network.variables ): - updated_var = self.tau * current_var + (1.0 - self.tau) * old_var + updated_var = ( + self.hps.tau * current_var + (1.0 - self.hps.tau) * old_var + ) old_var.assign(updated_var) + @override(AppoLearner) def _update_module_kl_coeff( - self, module_id: ModuleID, sampled_kls: Dict[ModuleID, float] - ): - """Dynamically update the KL loss coefficients of each module with. - - The update is completed using the mean KL divergence between the action - distributions current policy and old policy of each module. 
That action - distribution is computed during the most recent update/call to `compute_loss`. - - Args: - module_id: The module whose KL loss coefficient to update. - sampled_kls: The KL divergence between the action distributions of - the current policy and old policy of each module. - - """ - if module_id in sampled_kls: - sampled_kl = sampled_kls[module_id] - # Update the current KL value based on the recently measured value. - # Increase. - if sampled_kl > 2.0 * self.kl_target: - self.kl_coeffs[module_id].assign(self.kl_coeffs[module_id] * 1.5) - # Decrease. - elif sampled_kl < 0.5 * self.kl_target: - self.kl_coeffs[module_id].assign(self.kl_coeffs[module_id] * 0.5) - - @override(ImpalaTfLearner) - def additional_update_per_module( - self, module_id: ModuleID, sampled_kls: Dict[ModuleID, float], **kwargs - ) -> Mapping[str, Any]: - """Update the target networks and KL loss coefficients of each module. - - Args: - - """ - self._update_module_target_networks(module_id) - self._update_module_kl_coeff(module_id, sampled_kls) - return {} + self, module_id: ModuleID, sampled_kl: float + ) -> Dict[str, Any]: + # Update the current KL value based on the recently measured value. + # Increase. + kl_coeff_var = self.curr_kl_coeffs_per_module[module_id] + + if sampled_kl > 2.0 * self.hps.kl_target: + # TODO (Kourosh) why not *2.0? + kl_coeff_var.assign(kl_coeff_var * 1.5) + # Decrease. 
+ elif sampled_kl < 0.5 * self.hps.kl_target: + kl_coeff_var.assign(kl_coeff_var * 0.5) + + return {LEARNER_RESULTS_CURR_KL_COEFF_KEY: kl_coeff_var.numpy()} diff --git a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py b/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py deleted file mode 100644 index f01235834d85..000000000000 --- a/rllib/algorithms/appo/tf/appo_tf_policy_rlm.py +++ /dev/null @@ -1,227 +0,0 @@ -import logging -from typing import Dict, List, Union - -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, - KLCoeffMixin, - GradStatsMixin, - TargetNetworkMixin, -) - -from ray.rllib.algorithms.impala.impala_tf_policy import ( - VTraceClipGradients, - VTraceOptimizer, -) - -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import ( - explained_variance, -) - - -from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -class APPOTfPolicyWithRLModule( - VTraceClipGradients, - VTraceOptimizer, - LearningRateSchedule, - KLCoeffMixin, - EntropyCoeffSchedule, - TargetNetworkMixin, - GradStatsMixin, - EagerTFPolicyV2, -): - def __init__(self, observation_space, action_space, config): - validate_config(config) - EagerTFPolicyV2.enable_eager_execution_if_necessary() - # Initialize MixIns before super().__init__ because base class will call - # self.loss, which requires these MixIns to be initialized. 
- LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - # Although this is a no-op, we call __init__ here to make it clear - # that base.__init__ will use the make_model() call. - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) - self.framework = "tf2" - KLCoeffMixin.__init__(self, config) - GradStatsMixin.__init__(self) - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - # construct the target model and make its weights the same as the model - self.target_model = self.make_rl_module() - self.target_model.set_weights(self.model.get_weights()) - - # Initiate TargetNetwork ops after loss initialization. - self.maybe_initialize_optimizer_and_loss() - TargetNetworkMixin.__init__(self) - - @Deprecated(new="APPOTfLearner.compute_loss_per_module()", error=False) - @override(EagerTFPolicyV2) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - train_batch[SampleBatch.ACTIONS] - train_batch[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.REWARDS] - train_batch[SampleBatch.TERMINATEDS] - - seqs_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seqs_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - target_policy_fwd_out = model.forward_train(train_batch) - values = target_policy_fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = target_policy_fwd_out[SampleBatch.ACTION_DIST] - - old_target_policy_fwd_out = self.target_model.forward_train(train_batch) - old_target_policy_dist = old_target_policy_fwd_out[SampleBatch.ACTION_DIST] - - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - old_target_actions_logp = 
old_target_policy_dist.logp( - train_batch[SampleBatch.ACTIONS] - ) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - old_target_actions_logp_time_major = make_time_major( - old_target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? 
- discounts_time_major = ( - 1.0 - - tf.cast( - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seqs_len, - drop_last=drop_last, - ), - dtype=tf.float32, - ) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( - target_action_log_probs=old_target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - is_ratio = tf.clip_by_value( - tf.math.exp( - behaviour_actions_logp_time_major - target_actions_logp_time_major - ), - 0.0, - 2.0, - ) - logp_ratio = is_ratio * tf.math.exp( - target_actions_logp_time_major - behaviour_actions_logp_time_major - ) - - clip_param = self.config["clip_param"] - surrogate_loss = tf.math.minimum( - pg_advantages * logp_ratio, - ( - pg_advantages - * tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param) - ), - ) - action_kl = old_target_policy_dist.kl(target_policy_dist) - mean_kl_loss = tf.math.reduce_mean(action_kl) - mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) - - # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values - mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) - - # The entropy loss. - mean_entropy_loss = -tf.math.reduce_mean(target_actions_logp_time_major) - - # The summed weighted loss. 
- total_loss = ( - mean_pi_loss - + (mean_vf_loss * self.config["vf_loss_coeff"]) - + (mean_entropy_loss * self.entropy_coeff) - + (mean_kl_loss * self.kl_coeff) - ) - - self.stats = { - "total_loss": total_loss, - "policy_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": mean_entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - "mean_kl": mean_kl_loss, - } - return total_loss - - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.stats["policy_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - tf.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - tf.reshape(self.stats["values"], [-1]), - ), - "mean_kl": self.stats["mean_kl"], - } - - @override(EagerTFPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/appo/tf/appo_tf_rl_module.py b/rllib/algorithms/appo/tf/appo_tf_rl_module.py index 9972291246da..46c9281283b8 100644 --- a/rllib/algorithms/appo/tf/appo_tf_rl_module.py +++ b/rllib/algorithms/appo/tf/appo_tf_rl_module.py @@ -1,8 +1,10 @@ from typing import List - +from ray.rllib.algorithms.appo.appo_learner import ( + OLD_ACTION_DIST_LOGITS_KEY, +) from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule -from ray.rllib.core.models.base import ACTOR +from ray.rllib.core.models.base import ACTOR, CRITIC, STATE_IN from ray.rllib.core.models.tf.encoder import ENCODER_OUT from ray.rllib.core.rl_module.rl_module_with_target_networks_interface import ( RLModuleWithTargetNetworksInterface, @@ -14,14 +16,8 @@ _, tf, _ = try_import_tf() -OLD_ACTION_DIST_KEY = 
"old_action_dist" -OLD_ACTION_DIST_LOGITS_KEY = "old_action_dist_logits" - class APPOTfRLModule(PPOTfRLModule, RLModuleWithTargetNetworksInterface): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - def setup(self): super().setup() catalog = self.config.get_catalog() @@ -41,16 +37,27 @@ def get_target_network_pairs(self): @override(PPOTfRLModule) def output_specs_train(self) -> List[str]: return [ - SampleBatch.ACTION_DIST, + SampleBatch.ACTION_DIST_INPUTS, SampleBatch.VF_PREDS, - OLD_ACTION_DIST_KEY, + OLD_ACTION_DIST_LOGITS_KEY, ] + @override(PPOTfRLModule) def _forward_train(self, batch: NestedDict): outs = super()._forward_train(batch) + + # TODO (Artur): Remove this once Policy supports RNN + batch = batch.copy() + if self.encoder.config.shared: + batch[STATE_IN] = None + else: + batch[STATE_IN] = { + ACTOR: None, + CRITIC: None, + } + batch[SampleBatch.SEQ_LENS] = None old_pi_inputs_encoded = self.old_encoder(batch)[ENCODER_OUT][ACTOR] - old_action_dist_logits = self.old_pi(old_pi_inputs_encoded) - old_action_dist = self.action_dist_cls.from_logits(old_action_dist_logits) - outs[OLD_ACTION_DIST_KEY] = old_action_dist + + old_action_dist_logits = tf.stop_gradient(self.old_pi(old_pi_inputs_encoded)) outs[OLD_ACTION_DIST_LOGITS_KEY] = old_action_dist_logits return outs diff --git a/rllib/algorithms/appo/torch/__init__.py b/rllib/algorithms/appo/torch/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py new file mode 100644 index 000000000000..d1e9b4ed4ec6 --- /dev/null +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -0,0 +1,216 @@ +from typing import Any, Dict, Mapping + +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.algorithms.appo.appo_learner import ( + AppoLearner, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_KL_KEY, + OLD_ACTION_DIST_LOGITS_KEY, +) +from 
ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( + make_time_major, + vtrace_torch, +) +from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY +from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.core.rl_module.marl_module import ModuleID, MultiAgentRLModule +from ray.rllib.core.rl_module.torch.torch_rl_module import ( + TorchDDPRLModuleWithTargetNetworksInterface, + TorchRLModule, +) +from ray.rllib.core.rl_module.rl_module_with_target_networks_interface import ( + RLModuleWithTargetNetworksInterface, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() + + +class APPOTorchLearner(AppoLearner, TorchLearner): + """Implements APPO loss / update logic on top of ImpalaTorchLearner.""" + + @override(TorchLearner) + def compute_loss_per_module( + self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] + ) -> TensorType: + + values = fwd_out[SampleBatch.VF_PREDS] + action_dist_cls_train = ( + self.module[module_id].unwrapped().get_train_action_dist_cls() + ) + target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + old_target_policy_dist = action_dist_cls_train.from_logits( + fwd_out[OLD_ACTION_DIST_LOGITS_KEY] + ) + old_target_policy_actions_logp = old_target_policy_dist.logp( + batch[SampleBatch.ACTIONS] + ) + behaviour_actions_logp = batch[SampleBatch.ACTION_LOGP] + target_actions_logp = target_policy_dist.logp(batch[SampleBatch.ACTIONS]) + + behaviour_actions_logp_time_major = make_time_major( + behaviour_actions_logp, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, + ) + target_actions_logp_time_major = make_time_major( + target_actions_logp, + trajectory_len=self.hps.rollout_frag_or_episode_len, + 
recurrent_seq_len=self.hps.recurrent_seq_len, + ) + old_actions_logp_time_major = make_time_major( + old_target_policy_actions_logp, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, + ) + values_time_major = make_time_major( + values, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, + ) + bootstrap_value = values_time_major[-1] + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, + ) + + # the discount factor that is used should be gamma except for timesteps where + # the episode is terminated. In that case, the discount factor should be 0. + discounts_time_major = ( + 1.0 + - make_time_major( + batch[SampleBatch.TERMINATEDS], + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, + ).float() + ) * self.hps.discount_factor + + # Note that vtrace will compute the main loop on the CPU for better performance. + vtrace_adjusted_target_values, pg_advantages = vtrace_torch( + target_action_log_probs=old_actions_logp_time_major, + behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, + rewards=rewards_time_major, + values=values_time_major, + bootstrap_value=bootstrap_value, + clip_pg_rho_threshold=self.hps.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=self.hps.vtrace_clip_rho_threshold, + ) + + # The policy gradients loss. 
+ is_ratio = torch.clip( + torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major), + 0.0, + 2.0, + ) + logp_ratio = is_ratio * torch.exp( + target_actions_logp_time_major - behaviour_actions_logp_time_major + ) + + surrogate_loss = torch.minimum( + pg_advantages * logp_ratio, + pg_advantages + * torch.clip(logp_ratio, 1 - self.hps.clip_param, 1 + self.hps.clip_param), + ) + + if self.hps.use_kl_loss: + action_kl = old_target_policy_dist.kl(target_policy_dist) + mean_kl_loss = torch.mean(action_kl) + else: + mean_kl_loss = 0.0 + mean_pi_loss = -torch.mean(surrogate_loss) + + # The baseline loss. + delta = values_time_major - vtrace_adjusted_target_values + mean_vf_loss = 0.5 * torch.mean(delta**2) + + # The entropy loss. + mean_entropy_loss = -torch.mean(target_policy_dist.entropy()) + + # The summed weighted loss. + total_loss = ( + mean_pi_loss + + (mean_vf_loss * self.hps.vf_loss_coeff) + + ( + mean_entropy_loss + * self.entropy_coeff_scheduler.get_current_value(module_id) + ) + + (mean_kl_loss * self.curr_kl_coeffs_per_module[module_id]) + ) + + return { + self.TOTAL_LOSS_KEY: total_loss, + POLICY_LOSS_KEY: mean_pi_loss, + VF_LOSS_KEY: mean_vf_loss, + ENTROPY_KEY: -mean_entropy_loss, + LEARNER_RESULTS_KL_KEY: mean_kl_loss, + } + + @override(TorchLearner) + def _make_modules_ddp_if_necessary(self) -> None: + """Logic for (maybe) making all Modules within self._module DDP. + + This implementation differs from the super's default one in using the special + TorchDDPRLModuleWithTargetNetworksInterface wrapper, instead of the default + TorchDDPRLModule one. + """ + + # If the module is a MultiAgentRLModule and nn.Module we can simply assume + # all the submodules are registered. Otherwise, we need to loop through + # each submodule and move it to the correct device. + # TODO (Kourosh): This can result in missing modules if the user does not + # register them in the MultiAgentRLModule. We should find a better way to + # handle this. 
+ if self._distributed: + # Single agent module: Convert to + # `TorchDDPRLModuleWithTargetNetworksInterface`. + if isinstance(self._module, RLModuleWithTargetNetworksInterface): + self._module = TorchDDPRLModuleWithTargetNetworksInterface(self._module) + # Multi agent module: Convert each submodule to + # `TorchDDPRLModuleWithTargetNetworksInterface`. + else: + assert isinstance(self._module, MultiAgentRLModule) + for key in self._module.keys(): + sub_module = self._module[key] + if isinstance(sub_module, TorchRLModule): + # Wrap and override the module ID key in self._module. + self._module.add_module( + key, + TorchDDPRLModuleWithTargetNetworksInterface(sub_module), + override=True, + ) + + @override(AppoLearner) + def _update_module_target_networks(self, module_id: ModuleID): + module = self.module[module_id] + + target_current_network_pairs = module.get_target_network_pairs() + for target_network, current_network in target_current_network_pairs: + current_state_dict = current_network.state_dict() + new_state_dict = { + k: self.hps.tau * current_state_dict[k] + (1 - self.hps.tau) * v + for k, v in target_network.state_dict().items() + } + target_network.load_state_dict(new_state_dict) + + @override(AppoLearner) + def _update_module_kl_coeff( + self, module_id: ModuleID, sampled_kl: float + ) -> Dict[str, Any]: + # Update the current KL value based on the recently measured value. + # Increase. + kl_coeff_var = self.curr_kl_coeffs_per_module[module_id] + + if sampled_kl > 2.0 * self.hps.kl_target: + # TODO (Kourosh) why not *2.0? + kl_coeff_var.data *= 1.5 + # Decrease. 
+ elif sampled_kl < 0.5 * self.hps.kl_target: + kl_coeff_var.data *= 0.5 + + return {LEARNER_RESULTS_CURR_KL_COEFF_KEY: kl_coeff_var.item()} diff --git a/rllib/algorithms/appo/torch/appo_torch_rl_module.py b/rllib/algorithms/appo/torch/appo_torch_rl_module.py new file mode 100644 index 000000000000..83710ca35b9b --- /dev/null +++ b/rllib/algorithms/appo/torch/appo_torch_rl_module.py @@ -0,0 +1,48 @@ +from typing import List + +from ray.rllib.algorithms.appo.appo_learner import ( + OLD_ACTION_DIST_LOGITS_KEY, +) +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from ray.rllib.core.models.base import ACTOR +from ray.rllib.core.models.tf.encoder import ENCODER_OUT +from ray.rllib.core.rl_module.rl_module_with_target_networks_interface import ( + RLModuleWithTargetNetworksInterface, +) +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.nested_dict import NestedDict + + +class APPOTorchRLModule(PPOTorchRLModule, RLModuleWithTargetNetworksInterface): + def setup(self): + super().setup() + catalog = self.config.get_catalog() + # Old pi and old encoder are the "target networks" that are used for + # the stabilization of the updates of the current pi and encoder. 
+ self.old_pi = catalog.build_pi_head(framework=self.framework) + self.old_encoder = catalog.build_actor_critic_encoder(framework=self.framework) + self.old_pi.load_state_dict(self.pi.state_dict()) + self.old_encoder.load_state_dict(self.encoder.state_dict()) + self.old_pi.trainable = False + self.old_encoder.trainable = False + + @override(RLModuleWithTargetNetworksInterface) + def get_target_network_pairs(self): + return [(self.old_pi, self.pi), (self.old_encoder, self.encoder)] + + @override(PPOTorchRLModule) + def output_specs_train(self) -> List[str]: + return [ + SampleBatch.ACTION_DIST_INPUTS, + OLD_ACTION_DIST_LOGITS_KEY, + SampleBatch.VF_PREDS, + ] + + @override(PPOTorchRLModule) + def _forward_train(self, batch: NestedDict): + outs = super()._forward_train(batch) + old_pi_inputs_encoded = self.old_encoder(batch)[ENCODER_OUT][ACTOR] + old_action_dist_logits = self.old_pi(old_pi_inputs_encoded) + outs[OLD_ACTION_DIST_LOGITS_KEY] = old_action_dist_logits + return outs diff --git a/rllib/algorithms/appo/utils.py b/rllib/algorithms/appo/utils.py index f0bae3e5113a..cbd2efe82161 100644 --- a/rllib/algorithms/appo/utils.py +++ b/rllib/algorithms/appo/utils.py @@ -6,6 +6,7 @@ TARGET_POLICY_SCOPE = "target_func" +# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs. def make_appo_models(policy) -> ModelV2: """Builds model and target model for APPO. 
diff --git a/rllib/algorithms/ars/ars.py b/rllib/algorithms/ars/ars.py index 168d45efdb97..61547b33f25c 100644 --- a/rllib/algorithms/ars/ars.py +++ b/rllib/algorithms/ars/ars.py @@ -357,6 +357,10 @@ def do_rollouts(self, params, timestep_limit=None): eval_lengths=eval_lengths, ) + def stop(self): + """Releases all resources used by this RolloutWorker.""" + pass + def get_policy_class(config: AlgorithmConfig): if config.framework_str == "torch": diff --git a/rllib/algorithms/dreamer/dreamer.py b/rllib/algorithms/dreamer/dreamer.py index 808f28202bf8..f0c438d47cb7 100644 --- a/rllib/algorithms/dreamer/dreamer.py +++ b/rllib/algorithms/dreamer/dreamer.py @@ -77,7 +77,13 @@ def __init__(self): self.td_model_lr = 6e-4 self.actor_lr = 8e-5 self.critic_lr = 8e-5 + self.grad_clip = 100.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + self.lambda_ = 0.95 self.dreamer_train_iters = 100 self.batch_size = 50 diff --git a/rllib/algorithms/es/es.py b/rllib/algorithms/es/es.py index 6c378b10258c..c5dc6b51e840 100644 --- a/rllib/algorithms/es/es.py +++ b/rllib/algorithms/es/es.py @@ -357,6 +357,10 @@ def do_rollouts(self, params, timestep_limit=None): eval_lengths=eval_lengths, ) + def stop(self): + """Releases all resources used by this RolloutWorker.""" + pass + def get_policy_class(config: AlgorithmConfig): if config.framework_str == "torch": diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 6f4c23c352da..128275c6083c 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -1,24 +1,25 @@ import copy +import dataclasses +from functools import partial import logging import platform import queue import random from typing import Callable, List, Optional, Set, Tuple, Type, Union +import numpy as np +import tree # 
pip install dm_tree + import ray from ray import ObjectRef from ray.rllib import SampleBatch from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided -from ray.rllib.algorithms.impala.impala_base_learner import ( - ImpalaHPs, +from ray.rllib.algorithms.impala.impala_learner import ( + ImpalaHyperparameters, _reduce_impala_results, ) from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog -from ray.rllib.core.learner.learner_group_config import ( - LearnerGroupConfig, - ModuleSpec, -) from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.evaluation.worker_set import handle_remote_call_result_errors from ray.rllib.execution.buffers.mixin_replay_buffer import MixInMultiAgentReplayBuffer @@ -48,10 +49,10 @@ SYNCH_WORKER_WEIGHTS_TIMER, SAMPLE_TIMER, ) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder from ray.rllib.utils.replay_buffers.multi_agent_replay_buffer import ReplayMode from ray.rllib.utils.replay_buffers.replay_buffer import _ALL_POLICIES - -from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.schedules.scheduler import Scheduler from ray.rllib.utils.typing import ( PartialAlgorithmConfigDict, PolicyID, @@ -108,11 +109,14 @@ def __init__(self, algo_class=None): # __sphinx_doc_begin__ # IMPALA specific settings: - self._learner_hps = ImpalaHPs() self.vtrace = True self.vtrace_clip_rho_threshold = 1.0 self.vtrace_clip_pg_rho_threshold = 1.0 - self.vtrace_drop_last_ts = True + # TODO (sven): Deprecate this setting. It makes no sense to drop the last ts. + # It's actually dangerous if there are important rewards "hiding" in that ts. + # This setting is already ignored (always False) on the new Learner API + # (if _enable_learner_api=True). 
+ self.vtrace_drop_last_ts = False self.num_multi_gpu_tower_stacks = 1 self.minibatch_buffer_size = 1 self.num_sgd_iter = 1 @@ -125,7 +129,13 @@ def __init__(self, algo_class=None): self.timeout_s_aggregator_manager = 0.0 self.broadcast_interval = 1 self.num_aggregation_workers = 0 + self.grad_clip = 40.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + self.opt_type = "adam" self.lr_schedule = None self.decay = 0.99 @@ -138,10 +148,10 @@ def __init__(self, algo_class=None): self._lr_vf = 0.0005 self.after_train_step = None - # Override some of AlgorithmConfig's default values with ARS-specific values. + # Override some of AlgorithmConfig's default values with IMPALA-specific values. self.rollout_fragment_length = 50 self.train_batch_size = 500 - self.minibatch_size = self.train_batch_size + self._minibatch_size = "auto" self.num_rollout_workers = 2 self.num_gpus = 1 self.lr = 0.0005 @@ -173,7 +183,7 @@ def training( gamma: Optional[float] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, - minibatch_size: Optional[int] = NotProvided, + minibatch_size: Optional[Union[int, str]] = NotProvided, num_sgd_iter: Optional[int] = NotProvided, replay_proportion: Optional[float] = NotProvided, replay_buffer_num_slots: Optional[int] = NotProvided, @@ -227,10 +237,11 @@ def training( minibatch_buffer_size: How many train batches should be retained for minibatching. This conf only has an effect if `num_sgd_iter > 1`. minibatch_size: The size of minibatches that are trained over during - each SGD iteration. Note this only has an effect if - `_enable_learner_api` == True. - Note: minibatch_size must be a multiple of rollout_fragment_length or - sequence_length and smaller than or equal to train_batch_size. 
+ each SGD iteration. If "auto", will use the same value as + `train_batch_size`. + Note that this setting only has an effect if `_enable_learner_api=True` + and it must be a multiple of `rollout_fragment_length` or + `sequence_length` and smaller than or equal to `train_batch_size`. num_sgd_iter: Number of passes to make over each train batch. replay_proportion: Set >0 to enable experience replay. Saved samples will be replayed with a p:1 proportion to new data samples. @@ -346,7 +357,7 @@ def training( if gamma is not NotProvided: self.gamma = gamma if minibatch_size is not NotProvided: - self.minibatch_size = minibatch_size + self._minibatch_size = minibatch_size return self @@ -363,6 +374,13 @@ def validate(self) -> None: # Check `entropy_coeff` for correctness. if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0!") + # Entropy coeff schedule checking. + if self._enable_learner_api: + Scheduler.validate( + self.entropy_coeff_schedule, + "entropy_coeff_schedule", + "entropy coefficient", + ) # Check whether worker to aggregation-worker ratio makes sense. if self.num_aggregation_workers > self.num_rollout_workers: @@ -395,43 +413,44 @@ def validate(self) -> None: "term/optimizer! Try setting config.training(" "_tf_policy_handles_more_than_one_loss=True)." ) + # Learner API specific checks. if self._enable_learner_api: if not ( (self.minibatch_size % self.rollout_fragment_length == 0) and self.minibatch_size <= self.train_batch_size ): raise ValueError( - "minibatch_size must be a multiple of rollout_fragment_length and " - "must be smaller than or equal to train_batch_size. 
Got" - f" minibatch_size={self.minibatch_size}, train_batch_size=" - f"{self.train_batch_size}, and rollout_fragment_length=" - f"{self.get_rollout_fragment_length()}" + f"`minibatch_size` ({self._minibatch_size}) must either be 'auto' " + "or a multiple of `rollout_fragment_length` " + f"({self.rollout_fragment_length}) while at the same time smaller " + f"than or equal to `train_batch_size` ({self.train_batch_size})!" ) - # learner hps need to be updated inside of config.validate in order to have - # the correct values for when a user starts an experiment from a dict. This is - # as oppposed to assigning the values inthe builder functions such as `training` - self._learner_hps.rollout_frag_or_episode_len = ( - self.get_rollout_fragment_length() - ) - self._learner_hps.discount_factor = self.gamma - self._learner_hps.entropy_coeff = self.entropy_coeff - self._learner_hps.vf_loss_coeff = self.vf_loss_coeff - self._learner_hps.vtrace_drop_last_ts = self.vtrace_drop_last_ts - self._learner_hps.vtrace_clip_rho_threshold = self.vtrace_clip_rho_threshold - self._learner_hps.vtrace_clip_pg_rho_threshold = ( - self.vtrace_clip_pg_rho_threshold - ) @override(AlgorithmConfig) - def get_learner_group_config(self, module_spec: ModuleSpec) -> LearnerGroupConfig: - lg_config = super().get_learner_group_config(module_spec) - optim_config = lg_config.optimizer_config - # TODO(avnishn): Make grad_clip a default parameter in algorithm_config's base - # class - optim_config.update({"grad_clip": self.grad_clip}) - lg_config = lg_config.learner(optimizer_config=optim_config) - return lg_config + def get_learner_hyperparameters(self) -> ImpalaHyperparameters: + base_hps = super().get_learner_hyperparameters() + learner_hps = ImpalaHyperparameters( + rollout_frag_or_episode_len=self.get_rollout_fragment_length(), + discount_factor=self.gamma, + entropy_coeff=self.entropy_coeff, + vf_loss_coeff=self.vf_loss_coeff, + vtrace_clip_rho_threshold=self.vtrace_clip_rho_threshold, + 
vtrace_clip_pg_rho_threshold=self.vtrace_clip_pg_rho_threshold, + **dataclasses.asdict(base_hps), + ) + # TODO: We currently do not use the `recurrent_seq_len` property anyways. + # We should re-think the handling of RNN/SEQ_LENs/etc.. once we start + # supporting them in RLModules and then revisit this check here. + # Also, such a check should be moved into `IMPALAConfig.validate()`. + assert (learner_hps.rollout_frag_or_episode_len is None) != ( + learner_hps.recurrent_seq_len is None + ), ( + "One of `rollout_frag_or_episode_len` or `recurrent_seq_len` must be not " + "None in ImpalaHyperparameters!" + ) + return learner_hps + # TODO (sven): Make these get_... methods all read-only @properties instead. def get_replay_ratio(self) -> float: """Returns replay ratio (between 0.0 and 1.0) based off self.replay_proportion. @@ -439,18 +458,28 @@ def get_replay_ratio(self) -> float: """ return (1 / self.replay_proportion) if self.replay_proportion > 0 else 0.0 + @property + def minibatch_size(self): + # If 'auto', use the train_batch_size (meaning each SGD iter is a single pass + # through the entire train batch). Otherwise, use user provided setting. 
+ return ( + self.train_batch_size + if self._minibatch_size == "auto" + else self._minibatch_size + ) + @override(AlgorithmConfig) def get_default_learner_class(self): - if self.framework_str == "tf2": - from ray.rllib.algorithms.impala.tf.impala_tf_learner import ImpalaTfLearner - - return ImpalaTfLearner - elif self.framework_str == "torch": + if self.framework_str == "torch": from ray.rllib.algorithms.impala.torch.impala_torch_learner import ( ImpalaTorchLearner, ) return ImpalaTorchLearner + elif self.framework_str == "tf2": + from ray.rllib.algorithms.impala.tf.impala_tf_learner import ImpalaTfLearner + + return ImpalaTfLearner else: raise ValueError(f"The framework {self.framework_str} is not supported.") @@ -542,66 +571,49 @@ def get_default_policy_class( if not config["vtrace"]: raise ValueError("IMPALA with the learner API does not support non-VTrace ") - if config._enable_rl_module_api: - if config["framework"] == "tf2": - from ray.rllib.algorithms.impala.tf.impala_tf_policy_rlm import ( - ImpalaTfPolicyWithRLModule, - ) - - return ImpalaTfPolicyWithRLModule - if config["framework"] == "torch": - from ray.rllib.algorithms.impala.torch.impala_torch_policy_rlm import ( - ImpalaTorchPolicyWithRLModule, + if config["framework"] == "torch": + if config["vtrace"]: + from ray.rllib.algorithms.impala.impala_torch_policy import ( + ImpalaTorchPolicy, ) - return ImpalaTorchPolicyWithRLModule + return ImpalaTorchPolicy else: - raise ValueError( - f"IMPALA with the learner API does not support framework " - f"{config['framework']} " - ) - else: - if config["framework"] == "torch": - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_torch_policy import ( - ImpalaTorchPolicy, - ) + from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy - return ImpalaTorchPolicy - else: - from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy + return A3CTorchPolicy + elif config["framework"] == "tf": + if config["vtrace"]: + from 
ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF1Policy, + ) - return A3CTorchPolicy - elif config["framework"] == "tf": - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF1Policy, - ) + return ImpalaTF1Policy + else: + from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy - return ImpalaTF1Policy - else: - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy + return A3CTFPolicy + else: + if config["vtrace"]: + from ray.rllib.algorithms.impala.impala_tf_policy import ( + ImpalaTF2Policy, + ) - return A3CTFPolicy + return ImpalaTF2Policy else: - if config["vtrace"]: - from ray.rllib.algorithms.impala.impala_tf_policy import ( - ImpalaTF2Policy, - ) + from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy - return ImpalaTF2Policy - else: - from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTFPolicy - - return A3CTFPolicy + return A3CTFPolicy @override(Algorithm) def setup(self, config: AlgorithmConfig): super().setup(config) + # Queue of batches to be sent to the Learner. + self.batches_to_place_on_learner = [] + # Create extra aggregation workers and assign each rollout worker to # one of them. - self.batches_to_place_on_learner = [] self.batch_being_built = [] if self.config.num_aggregation_workers > 0: # This spawns `num_aggregation_workers` actors that aggregate @@ -675,7 +687,8 @@ def training_step(self) -> ResultDict: and self._aggregator_actor_manager.num_healthy_actors() > 0 ) - # Get references to sampled SampleBatches from our workers. + # Get sampled SampleBatches from our workers (by ray references if we use + # tree-aggregation). unprocessed_sample_batches = self.get_samples_from_workers( return_object_refs=use_tree_aggregation, ) @@ -701,8 +714,27 @@ def training_step(self) -> ResultDict: self._counters[NUM_AGENT_STEPS_SAMPLED] += batch.agent_steps() # Concatenate single batches into batches of size `train_batch_size`. 
self.concatenate_batches_and_pre_queue(batches) + # Using the Learner API. Call `update()` on our LearnerGroup object with + # all collected batches. if self.config._enable_learner_api: train_results = self.learn_on_processed_samples() + additional_results = self.learner_group.additional_update( + module_ids_to_update=set(train_results.keys()) - {ALL_MODULES}, + timestep=self._counters[ + NUM_ENV_STEPS_TRAINED + if self.config.count_steps_by == "env_steps" + else NUM_AGENT_STEPS_TRAINED + ], + # TODO (sven): Feels hacked, but solves the problem of algos inheriting + # from IMPALA (like APPO). In the old stack, we didn't have this + # problem b/c IMPALA didn't need to call any additional update methods + # as the entropy- and lr-schedules were handled by + # `Policy.on_global_var_update()`. + **self._get_additional_update_kwargs(train_results), + ) + for key, res in additional_results.items(): + if key in train_results: + train_results[key].update(res) else: # Move train batches (of size `train_batch_size`) onto learner queue. self.place_processed_samples_on_learner_thread_queue() @@ -714,12 +746,10 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: if train_results: pids = list(set(train_results.keys()) - {ALL_MODULES}) - else: - pids = [] - self.update_workers_from_learner_group( - workers_that_need_updates=workers_that_need_updates, - policy_ids=pids, - ) + self.update_workers_from_learner_group( + workers_that_need_updates=workers_that_need_updates, + policy_ids=pids, + ) else: pids = list(train_results.keys()) self.update_workers_if_necessary( @@ -739,7 +769,7 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: if train_results: - # store the most recent result and return it if no new result is + # Store the most recent result and return it if no new result is # available. This keeps backwards compatibility with the old # training stack / results reporting stack. 
This is necessary # any time we develop an asynchronous algorithm. @@ -911,34 +941,39 @@ def learn_on_processed_samples(self) -> ResultDict: Aggregated results from the learner group after an update is completed. """ - result = {} + # There are batches on the queue -> Send them all to the learner group. if self.batches_to_place_on_learner: - batch = self.batches_to_place_on_learner.pop(0) + batches = self.batches_to_place_on_learner[:] + self.batches_to_place_on_learner.clear() # If there are no learner workers and learning is directly on the driver # Then we can't do async updates, so we need to block. blocking = self.config.num_learner_workers == 0 - lg_results = self.learner_group.update( - batch, - reduce_fn=_reduce_impala_results, - block=blocking, - num_iters=self.config.num_sgd_iter, - minibatch_size=self.config.minibatch_size, - ) - else: - lg_results = None - - if lg_results: - self._counters[NUM_ENV_STEPS_TRAINED] += lg_results[ALL_MODULES][ - NUM_ENV_STEPS_TRAINED - ] - self._counters[NUM_AGENT_STEPS_TRAINED] += lg_results[ALL_MODULES][ - NUM_AGENT_STEPS_TRAINED - ] - del lg_results[ALL_MODULES][NUM_ENV_STEPS_TRAINED] - del lg_results[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] - result = lg_results + results = [] + for batch in batches: + result = self.learner_group.update( + batch, + reduce_fn=_reduce_impala_results, + block=blocking, + num_iters=self.config.num_sgd_iter, + minibatch_size=self.config.minibatch_size, + ) + if result: + self._counters[NUM_ENV_STEPS_TRAINED] += result[ALL_MODULES].pop( + NUM_ENV_STEPS_TRAINED + ) + self._counters[NUM_AGENT_STEPS_TRAINED] += result[ALL_MODULES].pop( + NUM_AGENT_STEPS_TRAINED + ) + results.append(result) + self._counters.update(self.learner_group.get_in_queue_stats()) + # If there are results, reduce-mean over each individual value and return. 
+ if results: + return tree.map_structure(lambda *x: np.mean(x), *results) - return result + # Nothing on the queue -> Don't send requests to learner group + # or no results ready (from previous `self.learner_group.update()` calls) for + # reducing. + return {} def place_processed_samples_on_learner_thread_queue(self) -> None: """Place processed samples on the learner queue for training. @@ -1017,10 +1052,9 @@ def process_experiences_directly( Batches that have been processed by the mixin buffer. """ - processed_batches = [] batches = [b for _, b in worker_to_sample_batches] - if not batches: - return processed_batches + processed_batches = [] + for batch in batches: assert not isinstance( batch, ObjectRef @@ -1051,6 +1085,10 @@ def process_experiences_tree_aggregation( workers. """ + + def _process_episodes(actor, batch): + return actor.process_episodes(ray.get(batch)) + for _, batch in worker_to_sample_batches_refs: assert isinstance(batch, ObjectRef), ( "For efficiency, process_experiences_tree_aggregation should " @@ -1061,7 +1099,7 @@ def process_experiences_tree_aggregation( self._aggregator_actor_manager.healthy_actor_ids() ) calls_placed = self._aggregator_actor_manager.foreach_actor_async( - lambda actor: actor.process_episodes(ray.get(batch)), + partial(_process_episodes, batch=batch), remote_actor_ids=[aggregator_id], ) if calls_placed <= 0: @@ -1179,14 +1217,18 @@ def update_workers_if_necessary( timeout_seconds=0, # Don't wait for the workers to finish. ) + def _get_additional_update_kwargs(self, train_results: dict) -> dict: + """Returns the kwargs to `LearnerGroup.additional_update()`. + + Should be overridden by subclasses to specify wanted/needed kwargs for + their own implementation of `Learner.additional_update_per_module()`. 
+ """ + return {} + @override(Algorithm) def _compile_iteration_results(self, *args, **kwargs): result = super()._compile_iteration_results(*args, **kwargs) - if self.config._enable_learner_api: - result["custom_metrics"] = { - "learner_group_queue_size": self.learner_group.in_queue_size - } - else: + if not self.config._enable_learner_api: result = self._learner_thread.add_learner_metrics( result, overwrite_learner_info=False ) diff --git a/rllib/algorithms/impala/impala_base_learner.py b/rllib/algorithms/impala/impala_base_learner.py deleted file mode 100644 index e57bf8ecabae..000000000000 --- a/rllib/algorithms/impala/impala_base_learner.py +++ /dev/null @@ -1,111 +0,0 @@ -from dataclasses import dataclass -import numpy as np -from typing import Any, List, Mapping -import tree - -from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.core.learner.learner import LearnerHPs -from ray.rllib.utils.annotations import override -from ray.rllib.core.learner.learner import Learner -from ray.rllib.utils.metrics import ( - ALL_MODULES, - NUM_AGENT_STEPS_TRAINED, - NUM_ENV_STEPS_TRAINED, -) -from ray.rllib.utils.typing import ResultDict - - -@dataclass -class ImpalaHPs(LearnerHPs): - """Hyper-parameters for IMPALA. - - Attributes: - rollout_frag_or_episode_len: The length of a rollout fragment or episode. - Used when making SampleBatches time major for computing loss. - recurrent_seq_len: The length of a recurrent sequence. Used when making - SampleBatches time major for computing loss. - discount_factor: The discount factor to use for computing returns. - vtrace_clip_rho_threshold: The rho threshold to use for clipping the - importance weights. - vtrace_clip_pg_rho_threshold: The rho threshold to use for clipping the - importance weights when computing the policy_gradient loss. - vtrace_drop_last_ts: Whether to drop the last timestep when computing the loss. - This is useful for stabilizing the loss. 
- NOTE: This shouldn't be True when training on environments where the rewards - come at the end of the episode. - vf_loss_coeff: The amount to weight the value function loss by when computing - the total loss. - entropy_coeff: The amount to weight the average entropy of the actions in the - SampleBatch towards the total_loss for module updates. The higher this - coefficient, the more that the policy network will be encouraged to output - distributions with higher entropy/std deviation, which will encourage - greater exploration. - - """ - - rollout_frag_or_episode_len: int = None - recurrent_seq_len: int = None - discount_factor: float = 0.99 - vtrace_clip_rho_threshold: float = 1.0 - vtrace_clip_pg_rho_threshold: float = 1.0 - vtrace_drop_last_ts: bool = True - vf_loss_coeff: float = 0.5 - entropy_coeff: float = 0.01 - - -class ImpalaBaseLearner(Learner): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.vtrace_clip_rho_threshold = self._hps.vtrace_clip_rho_threshold - self.vtrace_clip_pg_rho_threshold = self._hps.vtrace_clip_pg_rho_threshold - self.vtrace_drop_last_ts = self._hps.vtrace_drop_last_ts - self.vf_loss_coeff = self._hps.vf_loss_coeff - self.entropy_coeff = self._hps.entropy_coeff - self.rollout_frag_or_episode_len = self._hps.rollout_frag_or_episode_len - self.recurrent_seq_len = self._hps.recurrent_seq_len - self.discount_factor = self._hps.discount_factor - assert ( - self.rollout_frag_or_episode_len is not None - or self.recurrent_seq_len is not None - ) and not (self.rollout_frag_or_episode_len and self.recurrent_seq_len), ( - "Either rollout_frag_or_episode_len or recurrent_seq_len" - " must be set in the IMPALA HParams. 
" - ) - - @override(Learner) - def compile_results( - self, - batch: MultiAgentBatch, - fwd_out: Mapping[str, Any], - postprocessed_loss: Mapping[str, Any], - postprocessed_gradients: Mapping[str, Any], - ) -> Mapping[str, Any]: - results = super().compile_results( - batch, fwd_out, postprocessed_loss, postprocessed_gradients - ) - results[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] = batch.agent_steps() - results[ALL_MODULES][NUM_ENV_STEPS_TRAINED] = batch.env_steps() - return results - - -def _reduce_impala_results(results: List[ResultDict]) -> ResultDict: - """Reduce/Aggregate a list of results from Impala Learners. - - Average the values of the result dicts. Add keys for the number of agent and env - steps trained. - - Args: - results: result dicts to reduce. - - Returns: - A reduced result dict. - """ - result = tree.map_structure(lambda *x: np.mean(x), *results) - agent_steps_trained = sum( - [r[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] for r in results] - ) - env_steps_trained = sum([r[ALL_MODULES][NUM_ENV_STEPS_TRAINED] for r in results]) - result[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] = agent_steps_trained - result[ALL_MODULES][NUM_ENV_STEPS_TRAINED] = env_steps_trained - return result diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py new file mode 100644 index 000000000000..568f40f6c24b --- /dev/null +++ b/rllib/algorithms/impala/impala_learner.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Mapping, Optional, Union + +import numpy as np +import tree # pip install dm_tree + +from ray.rllib.core.learner.learner import Learner, LearnerHyperparameters +from ray.rllib.core.rl_module.rl_module import ModuleID +from ray.rllib.policy.sample_batch import MultiAgentBatch +from ray.rllib.utils.annotations import override +from ray.rllib.utils.metrics import ( + ALL_MODULES, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_TRAINED, +) +from ray.rllib.utils.schedules.scheduler import 
Scheduler +from ray.rllib.utils.typing import ResultDict + + +LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff" + + +@dataclass +class ImpalaHyperparameters(LearnerHyperparameters): + """Hyperparameters for the ImpalaLearner sub-classes (framework specific). + + These should never be set directly by the user. Instead, use the IMPALAConfig + class to configure your algorithm. + See `ray.rllib.algorithms.impala.impala::IMPALAConfig::training()` for more details + on the individual properties. + + Attributes: + rollout_frag_or_episode_len: The length of a rollout fragment or episode. + Used when making SampleBatches time major for computing loss. + recurrent_seq_len: The length of a recurrent sequence. Used when making + SampleBatches time major for computing loss. + """ + + rollout_frag_or_episode_len: int = None + recurrent_seq_len: int = None + discount_factor: float = None + vtrace_clip_rho_threshold: float = None + vtrace_clip_pg_rho_threshold: float = None + vf_loss_coeff: float = None + entropy_coeff: float = None + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None + + +class ImpalaLearner(Learner): + @override(Learner) + def build(self) -> None: + super().build() + + # Build entropy coeff scheduling tools. + self.entropy_coeff_scheduler = Scheduler( + fixed_value=self.hps.entropy_coeff, + schedule=self.hps.entropy_coeff_schedule, + framework=self.framework, + device=self._device, + ) + + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, timestep: int + ) -> Dict[str, Any]: + results = super().additional_update_per_module(module_id, timestep=timestep) + + # Update entropy coefficient via our Scheduler. 
+ new_entropy_coeff = self.entropy_coeff_scheduler.update( + module_id, timestep=timestep + ) + results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) + + return results + + @override(Learner) + def compile_results( + self, + batch: MultiAgentBatch, + fwd_out: Mapping[str, Any], + postprocessed_loss: Mapping[str, Any], + postprocessed_gradients: Mapping[str, Any], + ) -> Mapping[str, Any]: + results = super().compile_results( + batch, fwd_out, postprocessed_loss, postprocessed_gradients + ) + results[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] = batch.agent_steps() + results[ALL_MODULES][NUM_ENV_STEPS_TRAINED] = batch.env_steps() + return results + + +def _reduce_impala_results(results: List[ResultDict]) -> ResultDict: + """Reduce/Aggregate a list of results from Impala Learners. + + Average the values of the result dicts. Add keys for the number of agent and env + steps trained (on all modules). + + Args: + results: result dicts to reduce. + + Returns: + A reduced result dict. + """ + result = tree.map_structure(lambda *x: np.mean(x), *results) + agent_steps_trained = sum(r[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] for r in results) + env_steps_trained = sum(r[ALL_MODULES][NUM_ENV_STEPS_TRAINED] for r in results) + result[ALL_MODULES][NUM_AGENT_STEPS_TRAINED] = agent_steps_trained + result[ALL_MODULES][NUM_ENV_STEPS_TRAINED] = env_steps_trained + return result diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index e0e005da69a2..d8b830ef7653 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -85,7 +85,7 @@ def __init__( config: Algorithm config dict. """ - # Compute vtrace on the CPU for better perf. + # Compute vtrace on the CPU for better performance. 
with tf.device("/cpu:0"): self.vtrace_returns = vtrace.multi_from_logits( behaviour_action_log_probs=behaviour_action_logp, @@ -297,13 +297,18 @@ def __init__( existing_model=existing_model, ) - GradStatsMixin.__init__(self) - VTraceClipGradients.__init__(self) - VTraceOptimizer.__init__(self) - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) + # If Learner API is used, we don't need any loss-specific mixins. + # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not self.config.get("_enable_learner_api"): + GradStatsMixin.__init__(self) + VTraceClipGradients.__init__(self) + VTraceOptimizer.__init__(self) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 73d4b3c7bd12..71aed0320601 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -201,13 +201,18 @@ def __init__(self, observation_space, action_space, config): ray.rllib.algorithms.impala.impala.ImpalaConfig().to_dict(), **config ) - VTraceOptimizer.__init__(self) - # Need to initialize learning rate variable before calling - # TorchPolicyV2.__init__. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) + # If Learner API is used, we don't need any loss-specific mixins. 
+ # However, we also would like to avoid creating special Policy-subclasses + # for this as the entire Policy concept will soon not be used anymore with + # the new Learner- and RLModule APIs. + if not config.get("_enable_learner_api"): + VTraceOptimizer.__init__(self) + # Need to initialize learning rate variable before calling + # TorchPolicyV2.__init__. + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) TorchPolicyV2.__init__( self, @@ -217,7 +222,6 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) - # TODO: Don't require users to call this manually. self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 5358816b8195..8725434b9751 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -1,14 +1,13 @@ import unittest import numpy as np +import tree # pip install dm_tree import ray from ray.rllib.algorithms.impala import ImpalaConfig from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch, try_import_tf -from ray.rllib.utils.metrics import ALL_MODULES -from ray.rllib.utils.test_utils import check from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.utils.torch_utils import convert_to_torch_tensor @@ -81,19 +80,16 @@ def test_impala_loss(self): config.exploration_config = {} for fw in framework_iterator(config, frameworks=["tf2", "torch"]): - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() if fw == "tf2": - train_batch = tf.nest.map_structure( - lambda x: 
tf.convert_to_tensor(x), FAKE_BATCH + train_batch = SampleBatch( + tree.map_structure(lambda x: tf.convert_to_tensor(x), FAKE_BATCH) ) elif fw == "torch": train_batch = convert_to_torch_tensor(SampleBatch(FAKE_BATCH)) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - - train_batch = SampleBatch(FAKE_BATCH) algo_config = config.copy(copy_frozen=False) algo_config.validate() algo_config.freeze() @@ -109,12 +105,10 @@ def test_impala_loss(self): ) learner_group_config.num_learner_workers = 0 learner_group = learner_group_config.build() - learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) - - learner_group_loss = results[ALL_MODULES]["total_loss"] + learner_group.set_weights(algo.get_weights()) + learner_group.update(train_batch.as_multi_agent()) - check(learner_group_loss, policy_loss) + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/impala/tests/test_impala_off_policyness.py b/rllib/algorithms/impala/tests/test_impala_off_policyness.py index 09600ff3f046..82a92916172f 100644 --- a/rllib/algorithms/impala/tests/test_impala_off_policyness.py +++ b/rllib/algorithms/impala/tests/test_impala_off_policyness.py @@ -1,4 +1,3 @@ -import itertools import unittest import ray @@ -6,7 +5,6 @@ from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.test_utils import ( check_compute_single_action, - check_off_policyness, framework_iterator, ) @@ -28,43 +26,29 @@ def test_impala_off_policyness(self): .environment("CartPole-v1") .resources(num_gpus=0) .rollouts(num_rollout_workers=4) + .training(_enable_learner_api=True) + .rl_module(_enable_rl_module_api=True) ) num_iterations = 3 num_aggregation_workers_options = [0, 1] - enable_rlm_learner_group_options = [True, False] - - default_exploration_config = config.exploration_config.copy() - - for permutation in itertools.product( - num_aggregation_workers_options, enable_rlm_learner_group_options - ): - 
num_aggregation_workers, enable_learner_api = permutation - for fw in framework_iterator( - config, with_eager_tracing=True, frameworks=["tf2"] + for num_aggregation_workers in num_aggregation_workers_options: + for _ in framework_iterator( + config, frameworks=("tf2", "torch"), with_eager_tracing=True ): - # TODO(avnishn): Enable this for torch when we merge the torch learner. - if enable_learner_api and fw != "tf2": - continue - config.training(_enable_learner_api=enable_learner_api) - config.rl_module(_enable_rl_module_api=enable_learner_api) - if enable_learner_api: - # We have to set exploration_config here manually because setting - # it through config.exploration() only deepupdates it - config.exploration_config = {} - else: - config.exploration_config = default_exploration_config + # We have to set exploration_config here manually because setting + # it through config.exploration() only deepupdates it + config.exploration_config = {} config.num_aggregation_workers = num_aggregation_workers print("aggregation-workers={}".format(config.num_aggregation_workers)) algo = config.build() for i in range(num_iterations): - results = algo.train() + algo.train() # TODO (Avnish): Add off-policiness check when the metrics are - # added back to the IMPALA Learner - if not enable_learner_api: - off_policy_ness = check_off_policyness(results, upper_limit=2.0) - print(f"off-policy'ness={off_policy_ness}") + # added back to the IMPALA Learner. 
+ # off_policy_ness = check_off_policyness(results, upper_limit=2.0) + # print(f"off-policy'ness={off_policy_ness}") check_compute_single_action( algo, diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 8bb9ce099a69..d397eb268fdc 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -1,7 +1,8 @@ from typing import Mapping -from ray.rllib.algorithms.impala.impala_base_learner import ImpalaBaseLearner +from ray.rllib.algorithms.impala.impala_learner import ImpalaLearner from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 +from ray.rllib.core.learner.learner import ENTROPY_KEY from ray.rllib.core.learner.tf.tf_learner import TfLearner from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override @@ -11,18 +12,17 @@ _, tf, _ = try_import_tf() -class ImpalaTfLearner(TfLearner, ImpalaBaseLearner): +class ImpalaTfLearner(ImpalaLearner, TfLearner): """Implements the IMPALA loss function in tensorflow.""" - def __init__(self, *args, **kwargs): - TfLearner.__init__(self, *args, **kwargs) - ImpalaBaseLearner.__init__(self, *args, **kwargs) - @override(TfLearner) def compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] + action_dist_class_train = self.module[module_id].get_train_action_dist_cls() + target_policy_dist = action_dist_class_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) values = fwd_out[SampleBatch.VF_PREDS] behaviour_actions_logp = batch[SampleBatch.ACTION_LOGP] @@ -30,28 +30,24 @@ def compute_loss_per_module( behaviour_actions_logp_time_major = make_time_major( behaviour_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + 
trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) target_actions_logp_time_major = make_time_major( target_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) values_time_major = make_time_major( values, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) bootstrap_value = values_time_major[-1] rewards_time_major = make_time_major( batch[SampleBatch.REWARDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) # the discount factor that is used should be gamma except for timesteps where @@ -61,23 +57,23 @@ def compute_loss_per_module( - tf.cast( make_time_major( batch[SampleBatch.TERMINATEDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ), dtype=tf.float32, ) - ) * self.discount_factor - # TODO(Artur): See if we should compute v-trace corrected targets on CPU + ) * self.hps.discount_factor + + # Note that vtrace will compute the main loop on the CPU for better performance. 
vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( target_action_log_probs=target_actions_logp_time_major, behaviour_action_log_probs=behaviour_actions_logp_time_major, + discounts=discounts_time_major, rewards=rewards_time_major, values=values_time_major, bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.vtrace_clip_pg_rho_threshold, - clip_rho_threshold=self.vtrace_clip_rho_threshold, - discounts=discounts_time_major, + clip_pg_rho_threshold=self.hps.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=self.hps.vtrace_clip_rho_threshold, ) # Sample size is T x B, where T is the trajectory length and B is the batch size @@ -93,14 +89,18 @@ def compute_loss_per_module( mean_vf_loss = vf_loss / batch_size # The entropy loss. - entropy_loss = -tf.reduce_sum(target_actions_logp_time_major) + mean_entropy_loss = -tf.reduce_mean(target_policy_dist.entropy()) # The summed weighted loss. total_loss = ( - pi_loss + vf_loss * self.vf_loss_coeff + entropy_loss * self.entropy_coeff + pi_loss + + vf_loss * self.hps.vf_loss_coeff + + mean_entropy_loss + * (self.entropy_coeff_scheduler.get_current_value(module_id)) ) return { self.TOTAL_LOSS_KEY: total_loss, "pi_loss": mean_pi_loss, "vf_loss": mean_vf_loss, + ENTROPY_KEY: -mean_entropy_loss, } diff --git a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py b/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py deleted file mode 100644 index 0244a96c0ac2..000000000000 --- a/rllib/algorithms/impala/tf/impala_tf_policy_rlm.py +++ /dev/null @@ -1,163 +0,0 @@ -import logging -from typing import Dict, List, Union - -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - LearningRateSchedule, -) -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.deprecation import Deprecated -from 
ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_utils import ( - explained_variance, -) -from ray.rllib.algorithms.impala.tf.vtrace_tf_v2 import make_time_major, vtrace_tf2 -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -class ImpalaTfPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - EagerTFPolicyV2, -): - def __init__(self, observation_space, action_space, config): - validate_config(config) - EagerTFPolicyV2.enable_eager_execution_if_necessary() - # Initialize MixIns before super().__init__ because base class will call - # self.loss, which requires these MixIns to be initialized. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - - self.maybe_initialize_optimizer_and_loss() - - @Deprecated(new="ImpalaTfLearner.compute_loss_per_module()", error=False) - @override(EagerTFPolicyV2) - def loss( - self, - model: PPOTfRLModule, - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - seq_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seq_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - fwd_out = model.forward_train(train_batch) - - values = fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] - - # this is probably a horribly inefficient way to do this. 
I should be able to - # compute this in a batch fashion - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? - discounts_time_major = ( - 1.0 - - tf.cast( - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ), - dtype=tf.float32, - ) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_tf2( - target_action_log_probs=target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - # The policy gradients loss. - pi_loss = -tf.reduce_sum(target_actions_logp_time_major * pg_advantages) - mean_pi_loss = -tf.reduce_mean(target_actions_logp_time_major * pg_advantages) - - # The baseline loss. 
- delta = values_time_major - vtrace_adjusted_target_values - vf_loss = 0.5 * tf.reduce_sum(tf.math.pow(delta, 2.0)) - mean_vf_loss = 0.5 * tf.reduce_mean(tf.math.pow(delta, 2.0)) - - # The entropy loss. - entropy_loss = -tf.reduce_sum(target_actions_logp_time_major) - - # The summed weighted loss. - total_loss = ( - pi_loss - + vf_loss * self.config["vf_loss_coeff"] - + entropy_loss * self.entropy_coeff - ) - self.stats = { - "total_loss": total_loss, - "pi_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - } - return total_loss - - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "policy_loss": self.stats["pi_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "var_gnorm": tf.linalg.global_norm(self.model.trainable_variables), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - tf.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - tf.reshape(self.stats["values"], [-1]), - ), - } - - @override(EagerTFPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/impala/tf/vtrace_tf_v2.py b/rllib/algorithms/impala/tf/vtrace_tf_v2.py index 5712a191d811..5f878ddbc1c1 100644 --- a/rllib/algorithms/impala/tf/vtrace_tf_v2.py +++ b/rllib/algorithms/impala/tf/vtrace_tf_v2.py @@ -1,11 +1,8 @@ -from typing import List, Union, TYPE_CHECKING +from typing import List, Union from ray.rllib.utils.framework import try_import_tf _, tf, _ = try_import_tf() -if TYPE_CHECKING: - _, tf, _ = try_import_tf() - def make_time_major( tensor: Union["tf.Tensor", List["tf.Tensor"]], @@ -115,28 +112,6 @@ def vtrace_tf2( """ log_rhos = target_action_log_probs - behaviour_action_log_probs - 
discounts = tf.convert_to_tensor(discounts, dtype=tf.float32) - rewards = tf.convert_to_tensor(rewards, dtype=tf.float32) - values = tf.convert_to_tensor(values, dtype=tf.float32) - bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32) - if clip_rho_threshold is not None: - clip_rho_threshold = tf.convert_to_tensor(clip_rho_threshold, dtype=tf.float32) - if clip_pg_rho_threshold is not None: - clip_pg_rho_threshold = tf.convert_to_tensor( - clip_pg_rho_threshold, dtype=tf.float32 - ) - - # Make sure tensor ranks are consistent. - rho_rank = log_rhos.shape.ndims # Usually 2. - values.shape.assert_has_rank(rho_rank) - bootstrap_value.shape.assert_has_rank(rho_rank - 1) - discounts.shape.assert_has_rank(rho_rank) - rewards.shape.assert_has_rank(rho_rank) - if clip_rho_threshold is not None: - clip_rho_threshold.shape.assert_has_rank(0) - if clip_pg_rho_threshold is not None: - clip_pg_rho_threshold.shape.assert_has_rank(0) - rhos = tf.math.exp(log_rhos) if clip_rho_threshold is not None: clipped_rhos = tf.minimum(clip_rho_threshold, rhos, name="clipped_rhos") @@ -164,17 +139,18 @@ def scanfunc(acc, sequence_item): discount_t, c_t, delta_t = sequence_item return delta_t + discount_t * c_t * acc - initial_values = tf.zeros_like(bootstrap_value) - vs_minus_v_xs = tf.nest.map_structure( - tf.stop_gradient, - tf.scan( - fn=scanfunc, - elems=sequences, - initializer=initial_values, - parallel_iterations=1, - name="scan", - ), - ) + with tf.device("/cpu:0"): + initial_values = tf.zeros_like(bootstrap_value) + vs_minus_v_xs = tf.nest.map_structure( + tf.stop_gradient, + tf.scan( + fn=scanfunc, + elems=sequences, + initializer=initial_values, + parallel_iterations=1, + name="scan", + ), + ) # Reverse the results back to original order. 
vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0]) diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index 6809027d8df3..bd1c4c37f3d4 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -1,10 +1,11 @@ from typing import Mapping -from ray.rllib.algorithms.impala.impala_base_learner import ImpalaBaseLearner +from ray.rllib.algorithms.impala.impala_learner import ImpalaLearner from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( vtrace_torch, make_time_major, ) +from ray.rllib.core.learner.learner import ENTROPY_KEY from ray.rllib.core.learner.torch.torch_learner import TorchLearner from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override @@ -15,18 +16,19 @@ torch, nn = try_import_torch() -class ImpalaTorchLearner(TorchLearner, ImpalaBaseLearner): +class ImpalaTorchLearner(ImpalaLearner, TorchLearner): """Implements the IMPALA loss function in torch.""" - def __init__(self, *args, **kwargs): - TorchLearner.__init__(self, *args, **kwargs) - ImpalaBaseLearner.__init__(self, *args, **kwargs) - @override(TorchLearner) def compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> TensorType: - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] + action_dist_class_train = ( + self.module[module_id].unwrapped().get_train_action_dist_cls() + ) + target_policy_dist = action_dist_class_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) values = fwd_out[SampleBatch.VF_PREDS] behaviour_actions_logp = batch[SampleBatch.ACTION_LOGP] @@ -38,28 +40,24 @@ def compute_loss_per_module( target_actions_logp_time_major = make_time_major( target_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + 
trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) behaviour_actions_logp_time_major = make_time_major( behaviour_actions_logp, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) values_time_major = make_time_major( values, - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) bootstrap_value = values_time_major[-1] rewards_time_major = make_time_major( batch[SampleBatch.REWARDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ) # the discount factor that is used should be gamma except for timesteps where @@ -68,17 +66,16 @@ def compute_loss_per_module( 1.0 - make_time_major( batch[SampleBatch.TERMINATEDS], - trajectory_len=self.rollout_frag_or_episode_len, - recurrent_seq_len=self.recurrent_seq_len, - drop_last=self.vtrace_drop_last_ts, + trajectory_len=self.hps.rollout_frag_or_episode_len, + recurrent_seq_len=self.hps.recurrent_seq_len, ).type(dtype=torch.float32) - ) * self.discount_factor + ) * self.hps.discount_factor # TODO(Artur) Why was there `TorchCategorical if is_multidiscrete else # dist_class` in the old code torch impala policy? device = behaviour_actions_logp_time_major[0].device - # TODO(Artur): See if we should compute v-trace corrected targets on CPU + # Note that vtrace will compute the main loop on the CPU for better performance. 
vtrace_adjusted_target_values, pg_advantages = vtrace_torch( target_action_log_probs=target_actions_logp_time_major, behaviour_action_log_probs=behaviour_actions_logp_time_major, @@ -86,8 +83,8 @@ def compute_loss_per_module( rewards=rewards_time_major, values=values_time_major, bootstrap_value=bootstrap_value, - clip_rho_threshold=self.vtrace_clip_rho_threshold, - clip_pg_rho_threshold=self.vtrace_clip_pg_rho_threshold, + clip_rho_threshold=self.hps.vtrace_clip_rho_threshold, + clip_pg_rho_threshold=self.hps.vtrace_clip_pg_rho_threshold, ) # Sample size is T x B, where T is the trajectory length and B is the batch size @@ -110,14 +107,18 @@ def compute_loss_per_module( mean_vf_loss = vf_loss / batch_size # The entropy loss. - entropy_loss = -torch.sum(target_actions_logp_time_major) + mean_entropy_loss = -torch.mean(target_policy_dist.entropy()) # The summed weighted loss. total_loss = ( - pi_loss + vf_loss * self.vf_loss_coeff + entropy_loss * self.entropy_coeff + pi_loss + + vf_loss * self.hps.vf_loss_coeff + + mean_entropy_loss + * (self.entropy_coeff_scheduler.get_current_value(module_id)) ) return { self.TOTAL_LOSS_KEY: total_loss, "pi_loss": mean_pi_loss, "vf_loss": mean_vf_loss, + ENTROPY_KEY: -mean_entropy_loss, } diff --git a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py b/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py deleted file mode 100644 index 751b441098a8..000000000000 --- a/rllib/algorithms/impala/torch/impala_torch_policy_rlm.py +++ /dev/null @@ -1,165 +0,0 @@ -import logging -from typing import Dict, List, Union - -from ray.rllib.algorithms.impala.torch.vtrace_torch_v2 import ( - make_time_major, - vtrace_torch, -) -from ray.rllib.algorithms.ppo.ppo_torch_policy import validate_config -from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor -from ray.rllib.policy.torch_mixins import 
( - EntropyCoeffSchedule, - LearningRateSchedule, -) -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override, Deprecated -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import ( - explained_variance, - global_norm, -) -from ray.rllib.utils.typing import TensorType - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class ImpalaTorchPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - TorchPolicyV2, -): - def __init__(self, observation_space, action_space, config): - validate_config(config) - TorchPolicyV2.__init__(self, observation_space, action_space, config) - # Initialize MixIns. - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - @Deprecated(new="ImpalaTorchLearner.compute_loss_per_module()", error=False) - @override(TorchPolicyV2) - def loss( - self, - model: PPOTorchRLModule, - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - seq_len = train_batch.get(SampleBatch.SEQ_LENS) - rollout_frag_or_episode_len = ( - self.config["rollout_fragment_length"] if not seq_len else None - ) - drop_last = self.config["vtrace_drop_last_ts"] - - fwd_out = model.forward_train(train_batch) - - values = fwd_out[SampleBatch.VF_PREDS] - target_policy_dist = fwd_out[SampleBatch.ACTION_DIST] - - # this is probably a horribly inefficient way to do this. 
I should be able to - # compute this in a batch fashion - behaviour_actions_logp = train_batch[SampleBatch.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(train_batch[SampleBatch.ACTIONS]) - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - target_actions_logp_time_major = make_time_major( - target_actions_logp, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - values_time_major = make_time_major( - values, - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - train_batch[SampleBatch.REWARDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ) - - # how to compute discouts? - # should they be pre computed? - discounts_time_major = ( - 1.0 - - make_time_major( - train_batch[SampleBatch.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=seq_len, - drop_last=drop_last, - ).type(dtype=torch.float32) - ) * self.config["gamma"] - vtrace_adjusted_target_values, pg_advantages = vtrace_torch( - target_action_log_probs=target_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, - rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, - clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], - clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], - discounts=discounts_time_major, - ) - - # The policy gradients loss. - pi_loss = -torch.sum(target_actions_logp_time_major * pg_advantages) - mean_pi_loss = -torch.mean(target_actions_logp_time_major * pg_advantages) - - # The baseline loss. 
- delta = values_time_major - vtrace_adjusted_target_values - vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0)) - mean_vf_loss = 0.5 * torch.mean(torch.pow(delta, 2.0)) - - # The entropy loss. - entropy_loss = -torch.sum(target_actions_logp_time_major) - - # The summed weighted loss. - total_loss = ( - pi_loss - + vf_loss * self.config["vf_loss_coeff"] - + entropy_loss * self.entropy_coeff - ) - self.stats = { - "total_loss": total_loss, - "pi_loss": mean_pi_loss, - "vf_loss": mean_vf_loss, - "values": values_time_major, - "entropy_loss": entropy_loss, - "vtrace_adjusted_target_values": vtrace_adjusted_target_values, - } - return total_loss - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_lr": convert_to_torch_tensor(self.cur_lr).type(torch.float64), - "policy_loss": self.stats["pi_loss"], - "entropy": self.stats["entropy_loss"], - "entropy_coeff": convert_to_torch_tensor(self.entropy_coeff).type( - torch.float64 - ), - "var_gnorm": global_norm(self.model.parameters()), - "vf_loss": self.stats["vf_loss"], - "vf_explained_var": explained_variance( - torch.reshape(self.stats["vtrace_adjusted_target_values"], [-1]), - torch.reshape(self.stats["values"], [-1]), - ), - } - - @override(TorchPolicyV2) - def get_batch_divisibility_req(self) -> int: - return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/rllib/algorithms/impala/torch/vtrace_torch_v2.py index 404904109e9b..1f4b6f9411fa 100644 --- a/rllib/algorithms/impala/torch/vtrace_torch_v2.py +++ b/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -1,6 +1,5 @@ from typing import List, Union from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_utils import convert_to_torch_tensor torch, nn = try_import_torch() @@ -70,7 +69,7 @@ def vtrace_torch( clip_rho_threshold: Union[float, "torch.Tensor"] = 1.0, clip_pg_rho_threshold: Union[float, "torch.Tensor"] = 1.0, ): - 
r"""V-trace for softmax policies implemented with torch. + """V-trace for softmax policies implemented with torch. Calculates V-trace actor critic targets for softmax polices as described in "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner @@ -113,25 +112,6 @@ def vtrace_torch( on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). """ log_rhos = target_action_log_probs - behaviour_action_log_probs - discounts = convert_to_torch_tensor(discounts) - rewards = convert_to_torch_tensor(rewards) - values = convert_to_torch_tensor(values) - bootstrap_value = convert_to_torch_tensor(bootstrap_value) - if clip_rho_threshold is not None: - clip_rho_threshold = convert_to_torch_tensor(clip_rho_threshold) - if clip_pg_rho_threshold is not None: - clip_pg_rho_threshold = convert_to_torch_tensor(clip_pg_rho_threshold) - - # Make sure tensor ranks are consistent. - rho_rank = log_rhos.dim() # Usually 2. - assert values.dim() == rho_rank - assert bootstrap_value.dim() == rho_rank - 1 - assert discounts.dim() == rho_rank - assert rewards.dim() == rho_rank - if clip_rho_threshold is not None: - assert clip_rho_threshold.dim() == 0 - if clip_pg_rho_threshold is not None: - assert clip_pg_rho_threshold.dim() == 0 rhos = torch.exp(log_rhos) if clip_rho_threshold is not None: @@ -147,11 +127,17 @@ def vtrace_torch( deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values) - vs_minus_v_xs = [torch.zeros_like(bootstrap_value)] - for i in reversed(range(len(discounts))): - discount_t, c_t, delta_t = discounts[i], cs[i], deltas[i] - vs_minus_v_xs.append(delta_t + discount_t * c_t * vs_minus_v_xs[-1]) - vs_minus_v_xs = torch.stack(vs_minus_v_xs[1:]) + # Only move the for-loop to CPU. 
+ discounts_cpu = discounts.to("cpu") + cs_cpu = cs.to("cpu") + deltas_cpu = deltas.to("cpu") + vs_minus_v_xs_cpu = [torch.zeros_like(bootstrap_value, device="cpu")] + for i in reversed(range(len(discounts_cpu))): + discount_t, c_t, delta_t = discounts_cpu[i], cs_cpu[i], deltas_cpu[i] + vs_minus_v_xs_cpu.append(delta_t + discount_t * c_t * vs_minus_v_xs_cpu[-1]) + vs_minus_v_xs_cpu = torch.stack(vs_minus_v_xs_cpu[1:]) + # Move results back to GPU - if applicable. + vs_minus_v_xs = vs_minus_v_xs_cpu.to(deltas.device) # Reverse the results back to original order. vs_minus_v_xs = torch.flip(vs_minus_v_xs, dims=[0]) diff --git a/rllib/algorithms/mock.py b/rllib/algorithms/mock.py index abc4de8b3f13..ae885d96679b 100644 --- a/rllib/algorithms/mock.py +++ b/rllib/algorithms/mock.py @@ -1,5 +1,7 @@ import os import pickle +import time + import numpy as np from ray.tune import result as tune_result @@ -22,6 +24,7 @@ def get_default_config(cls) -> AlgorithmConfig: "persistent_error": False, "test_variable": 1, "user_checkpoint_freq": 0, + "sleep": 0, } ) ) @@ -46,6 +49,8 @@ def step(self): and (self.config.persistent_error or not self.restored) ): raise Exception("mock error") + if self.config.sleep: + time.sleep(self.config.sleep) result = dict( episode_reward_mean=10, episode_len_mean=10, timesteps_this_iter=10, info={} ) diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 4ce1347cf67a..05f531117901 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -9,33 +9,34 @@ Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#ppo """ +import dataclasses import logging from typing import List, Optional, Type, Union, TYPE_CHECKING -from ray.util.debug import log_once from ray.rllib.algorithms.algorithm import Algorithm from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided from ray.rllib.algorithms.pg import PGConfig -from ray.rllib.algorithms.ppo.ppo_learner_config import PPOLearnerHPs 
from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog +from ray.rllib.algorithms.ppo.ppo_learner import ( + PPOLearnerHyperparameters, + LEARNER_RESULTS_KL_KEY, +) from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.execution.rollout_ops import ( standardize_fields, + synchronous_parallel_sample, ) from ray.rllib.execution.train_ops import ( train_one_step, multi_gpu_train_one_step, ) -from ray.rllib.utils.annotations import ExperimentalAPI from ray.rllib.policy.policy import Policy -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import ExperimentalAPI, override from ray.rllib.utils.deprecation import ( DEPRECATED_VALUE, deprecation_warning, ) from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY -from ray.rllib.utils.typing import ResultDict -from ray.rllib.execution.rollout_ops import synchronous_parallel_sample from ray.rllib.utils.metrics import ( NUM_AGENT_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED, @@ -43,6 +44,9 @@ SAMPLE_TIMER, ALL_MODULES, ) +from ray.rllib.utils.schedules.scheduler import Scheduler +from ray.rllib.utils.typing import ResultDict +from ray.util.debug import log_once if TYPE_CHECKING: from ray.rllib.core.learner.learner import Learner @@ -94,7 +98,6 @@ def __init__(self, algo_class=None): # fmt: off # __sphinx_doc_begin__ # PPO specific settings: - self._learner_hps = PPOLearnerHPs() self.use_critic = True self.use_gae = True self.lambda_ = 1.0 @@ -166,6 +169,21 @@ def get_default_learner_class(self) -> Union[Type["Learner"], str]: else: raise ValueError(f"The framework {self.framework_str} is not supported.") + @override(AlgorithmConfig) + def get_learner_hyperparameters(self) -> PPOLearnerHyperparameters: + base_hps = super().get_learner_hyperparameters() + return PPOLearnerHyperparameters( + use_critic=self.use_critic, + kl_coeff=self.kl_coeff, + vf_loss_coeff=self.vf_loss_coeff, + entropy_coeff=self.entropy_coeff, + 
entropy_coeff_schedule=self.entropy_coeff_schedule, + clip_param=self.clip_param, + vf_clip_param=self.vf_clip_param, + kl_target=self.kl_target, + **dataclasses.asdict(base_hps), + ) + @override(AlgorithmConfig) def training( self, @@ -212,7 +230,7 @@ def training( tune this if you set vf_share_layers=True inside your model's config. entropy_coeff: Coefficient of the entropy regularizer. entropy_coeff_schedule: Decay schedule for the entropy regularizer. - clip_param: PPO clip parameter. + clip_param: The PPO clip parameter. vf_clip_param: Clip param for the value function. Note that this is sensitive to the scale of the rewards. If your expected V is large, increase this. @@ -232,6 +250,7 @@ def training( # Pass kwargs onto super's `training()` method. super().training(**kwargs) + # TODO (sven): Move to generic AlgorithmConfig. if lr_schedule is not NotProvided: self.lr_schedule = lr_schedule if use_critic is not NotProvided: @@ -306,17 +325,13 @@ def validate(self) -> None: # Check `entropy_coeff` for correctness. if self.entropy_coeff < 0.0: raise ValueError("`entropy_coeff` must be >= 0.0") - # learner hps need to be updated inside of config.validate in order to have - # the correct values for when a user starts an experiment from a dict. This is - # as oppposed to assigning the values inthe builder functions such as `training` - self._learner_hps.use_critic = self.use_critic - self._learner_hps.kl_coeff = self.kl_coeff - self._learner_hps.vf_loss_coeff = self.vf_loss_coeff - self._learner_hps.entropy_coeff = self.entropy_coeff - self._learner_hps.entropy_coeff_schedule = self.entropy_coeff_schedule - self._learner_hps.clip_param = self.clip_param - self._learner_hps.vf_clip_param = self.vf_clip_param - self._learner_hps.kl_target = self.kl_target + # Entropy coeff schedule checking. 
+ if self._enable_learner_api: + Scheduler.validate( + self.entropy_coeff_schedule, + "entropy_coeff_schedule", + "entropy coefficient", + ) class UpdateKL: @@ -363,32 +378,17 @@ def get_default_policy_class( ) -> Optional[Type[Policy]]: if config["framework"] == "torch": - if config._enable_rl_module_api: - from ray.rllib.algorithms.ppo.torch.ppo_torch_policy_rlm import ( - PPOTorchPolicyWithRLModule, - ) - - return PPOTorchPolicyWithRLModule - else: - from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy + from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy - return PPOTorchPolicy + return PPOTorchPolicy elif config["framework"] == "tf": from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF1Policy return PPOTF1Policy else: - if config._enable_rl_module_api: - from ray.rllib.algorithms.ppo.tf.ppo_tf_policy_rlm import ( - PPOTfPolicyWithRLModule, - ) - - return PPOTfPolicyWithRLModule - else: - - from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy + from ray.rllib.algorithms.ppo.ppo_tf_policy import PPOTF2Policy - return PPOTF2Policy + return PPOTF2Policy @ExperimentalAPI def training_step(self) -> ResultDict: @@ -432,12 +432,12 @@ def training_step(self) -> ResultDict: train_results = multi_gpu_train_one_step(self, train_batch) if self.config._enable_learner_api: - # the train results's loss keys are pids to their loss values. But we also + # The train results's loss keys are pids to their loss values. But we also # return a total_loss key at the same level as the pid keys. So we need to # subtract that to get the total set of pids to update. # TODO (Kourosh): We should also not be using train_results as a message - # passing medium to infer whcih policies to update. We could use - # policies_to_train variable that is given by the user to infer this. + # passing medium to infer which policies to update. We could use + # policies_to_train variable that is given by the user to infer this. 
policies_to_update = set(train_results.keys()) - {ALL_MODULES} else: policies_to_update = list(train_results.keys()) @@ -471,18 +471,17 @@ def training_step(self) -> ResultDict: if self.config._enable_learner_api: kl_dict = { - # TODO (Kourosh): Train results don't match the old format. The thing - # that used to be under `kl` is now under `mean_kl_loss`. Fix this. Do - # we need get here? - pid: train_results[pid][LEARNER_STATS_KEY].get("kl") + pid: train_results[pid][LEARNER_STATS_KEY][LEARNER_RESULTS_KL_KEY] for pid in policies_to_update } # triggers a special update method on RLOptimizer to update the KL values. - self.learner_group.additional_update( + additional_results = self.learner_group.additional_update( module_ids_to_update=policies_to_update, sampled_kl_values=kl_dict, timestep=self._counters[NUM_AGENT_STEPS_SAMPLED], ) + for pid, res in additional_results.items(): + train_results[pid].update(res) return train_results diff --git a/rllib/algorithms/ppo/ppo_base_learner.py b/rllib/algorithms/ppo/ppo_base_learner.py deleted file mode 100644 index 94a933ea5e3b..000000000000 --- a/rllib/algorithms/ppo/ppo_base_learner.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Mapping, Any - -import abc -from ray.rllib.core.rl_module.rl_module import ModuleID -from ray.rllib.core.learner.learner import Learner -from ray.rllib.utils.annotations import override - - -class PPOBaseLearner(Learner): - def build(self) -> None: - super().build() - - # TODO (Kourosh): Move these failures to config.validate() or support them. - self.entropy_coeff_scheduler = None - if self.hps.entropy_coeff_schedule: - raise ValueError("entropy_coeff_schedule is not supported in Learner yet") - - # TODO (Kourosh): This needs to be native tensor variable to be traced. - self.entropy_coeff = self.hps.entropy_coeff - - # TODO (Kourosh): Create a way on the base class for users to define arbitrary - # schedulers for learning rates. 
- self.lr_scheduler = None - if self.hps.lr_schedule: - raise ValueError("lr_schedule is not supported in Learner yet") - - # TODO (Kourosh): We can still use mix-ins in the new design. Do we want that? - # Most likely not. I rather be specific about everything. kl_coeff is a - # none-gradient based update which we can define here and add as update with - # additional_update() method. - - # We need to make sure that the kl_coeff is a framework tensor that is - # registered as part of the graph so that upon update the graph can be updated - # (e.g. in TF with eager tracing) - self.kl_coeff_val = self.hps.kl_coeff - self.kl_coeff = self._create_kl_variable(self.hps.kl_coeff) - - self.kl_target = self.hps.kl_target - - @override(Learner) - def additional_update_per_module( - self, module_id: ModuleID, sampled_kl_values: dict, timestep: int - ) -> Mapping[str, Any]: - assert sampled_kl_values, "Sampled KL values are empty." - - sampled_kl = sampled_kl_values[module_id] - if sampled_kl > 2.0 * self.kl_target: - # TODO (Kourosh) why not 2? - self.kl_coeff_val *= 1.5 - elif sampled_kl < 0.5 * self.kl_target: - self.kl_coeff_val *= 0.5 - - self._set_kl_coeff(self.kl_coeff_val) - results = {"kl_coeff": self.kl_coeff_val} - - # TODO (Kourosh): We may want to index into the schedulers to get the right one - # for this module - if self.entropy_coeff_scheduler is not None: - self.entropy_coeff_scheduler.update(timestep) - - if self.lr_scheduler is not None: - self.lr_scheduler.update(timestep) - - return results - - @abc.abstractmethod - def _create_kl_variable(self, value: float) -> Any: - """Creates the kl_coeff tensor variable. - - This is a framework specific method that should be implemented by the - framework specific sub-class. - - Args: - value: The initial value for the kl_coeff variable. - """ - - @abc.abstractmethod - def _set_kl_coeff(self, value: float) -> None: - """Sets the value of the kl_coeff variable. 
- - This is a framework specific method that should be implemented by the - framework specific sub-class. - - Args: - value: The new value for the kl_coeff variable. - """ diff --git a/rllib/algorithms/ppo/ppo_base_rl_module.py b/rllib/algorithms/ppo/ppo_base_rl_module.py deleted file mode 100644 index 2f48a61c6dee..000000000000 --- a/rllib/algorithms/ppo/ppo_base_rl_module.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -This file holds framework-agnostic components for PPO's RLModules. -""" - -import abc - -from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig -from ray.rllib.utils.annotations import ExperimentalAPI -from ray.rllib.core.models.base import ActorCriticEncoder - - -@ExperimentalAPI -class PPORLModuleBase(RLModule, abc.ABC): - def __init__(self, config: RLModuleConfig): - super().__init__(config) - - def setup(self): - # __sphinx_doc_begin__ - catalog = self.config.get_catalog() - - # Build models from catalog - self.encoder = catalog.build_actor_critic_encoder(framework=self.framework) - self.pi = catalog.build_pi_head(framework=self.framework) - self.vf = catalog.build_vf_head(framework=self.framework) - - self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) - # __sphinx_doc_end__ - - assert isinstance(self.encoder, ActorCriticEncoder) diff --git a/rllib/algorithms/ppo/ppo_catalog.py b/rllib/algorithms/ppo/ppo_catalog.py index 186953b57aba..c9c53aa514ff 100644 --- a/rllib/algorithms/ppo/ppo_catalog.py +++ b/rllib/algorithms/ppo/ppo_catalog.py @@ -38,7 +38,7 @@ class PPOCatalog(Catalog): - Value Function Head: The head used to compute the value function. The ActorCriticEncoder is a wrapper around Encoders to produce separate outputs - for the policy and value function. See implementations of PPORLModuleBase for + for the policy and value function. See implementations of PPORLModule for more details. 
Any custom ActorCriticEncoder can be built by overriding the @@ -89,8 +89,9 @@ def __init__( hidden_layer_dims=post_fcnet_hiddens, hidden_layer_activation=post_fcnet_activation, output_activation="linear", - output_dims=None, # We don't know the output dimension yet, because it - # depends on the action distribution input dimension + # We don't know the output dimension yet, because it depends on the + # action distribution input dimension. + output_dims=None, ) self.vf_head_config = MLPHeadConfig( @@ -106,7 +107,7 @@ def build_actor_critic_encoder(self, framework: str) -> ActorCriticEncoder: The default behavior is to build the encoder from the encoder_config. This can be overridden to build a custom ActorCriticEncoder as a means of - configuring the behavior of a PPORLModuleBase implementation. + configuring the behavior of a PPORLModule implementation. Args: framework: The framework to use. Either "torch" or "tf2". @@ -131,7 +132,7 @@ def build_pi_head(self, framework: str) -> Model: The default behavior is to build the head from the pi_head_config. This can be overridden to build a custom policy head as a means of configuring - the behavior of a PPORLModuleBase implementation. + the behavior of a PPORLModule implementation. Args: framework: The framework to use. Either "torch" or "tf2". @@ -156,7 +157,7 @@ def build_vf_head(self, framework: str) -> Model: The default behavior is to build the head from the vf_head_config. This can be overridden to build a custom value function head as a means of - configuring the behavior of a PPORLModuleBase implementation. + configuring the behavior of a PPORLModule implementation. Args: framework: The framework to use. Either "torch" or "tf2". 
diff --git a/rllib/algorithms/ppo/ppo_learner.py b/rllib/algorithms/ppo/ppo_learner.py new file mode 100644 index 000000000000..be16bbb53112 --- /dev/null +++ b/rllib/algorithms/ppo/ppo_learner.py @@ -0,0 +1,75 @@ +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +from ray.rllib.core.learner.learner import LearnerHyperparameters +from ray.rllib.core.learner.learner import Learner +from ray.rllib.core.rl_module.rl_module import ModuleID +from ray.rllib.utils.annotations import override +from ray.rllib.utils.schedules.scheduler import Scheduler + + +LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY = "vf_loss_unclipped" +LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY = "vf_explained_var" +LEARNER_RESULTS_KL_KEY = "mean_kl_loss" +LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" +LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY = "curr_entropy_coeff" + + +@dataclass +class PPOLearnerHyperparameters(LearnerHyperparameters): + """Hyperparameters for the PPOLearner sub-classes (framework specific). + + These should never be set directly by the user. Instead, use the PPOConfig + class to configure your algorithm. + See `ray.rllib.algorithms.ppo.ppo::PPOConfig::training()` for more details on the + individual properties. + """ + + kl_coeff: float = None + kl_target: float = None + use_critic: bool = None + clip_param: float = None + vf_clip_param: float = None + entropy_coeff: float = None + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None + vf_loss_coeff: float = None + + +class PPOLearner(Learner): + @override(Learner) + def build(self) -> None: + super().build() + + # Build entropy coeff scheduling tools. + self.entropy_coeff_scheduler = Scheduler( + fixed_value=self.hps.entropy_coeff, + schedule=self.hps.entropy_coeff_schedule, + framework=self.framework, + device=self._device, + ) + + # Set up KL coefficient variables (per module). 
+ # Note that the KL coeff is not controlled by a schedule, but seeks + to stay close to a given kl_target value. + self.curr_kl_coeffs_per_module = defaultdict( + lambda: self._get_tensor_variable(self.hps.kl_coeff) + ) + + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, sampled_kl_values: dict, timestep: int + ) -> Dict[str, Any]: + results = super().additional_update_per_module( + module_id, + sampled_kl_values=sampled_kl_values, + timestep=timestep, + ) + + # Update entropy coefficient via our Scheduler. + new_entropy_coeff = self.entropy_coeff_scheduler.update( + module_id, timestep=timestep + ) + results.update({LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY: new_entropy_coeff}) + + return results diff --git a/rllib/algorithms/ppo/ppo_learner_config.py b/rllib/algorithms/ppo/ppo_learner_config.py deleted file mode 100644 index e6850efa6b6a..000000000000 --- a/rllib/algorithms/ppo/ppo_learner_config.py +++ /dev/null @@ -1,21 +0,0 @@ -from dataclasses import dataclass -from typing import List, Optional, Union - -from ray.rllib.core.learner.learner import LearnerHPs - - -@dataclass -class PPOLearnerHPs(LearnerHPs): - """Hyperparameters for the PPO RL Trainer""" - - kl_coeff: float = 0.2 - kl_target: float = 0.01 - use_critic: bool = True - clip_param: float = 0.3 - vf_clip_param: float = 10.0 - entropy_coeff: float = 0.0 - vf_loss_coeff: float = 1.0 - - # experimental placeholder for things that could be part of the base LearnerHPs - lr_schedule: Optional[List[List[Union[int, float]]]] = None - entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = None diff --git a/rllib/algorithms/ppo/ppo_rl_module.py b/rllib/algorithms/ppo/ppo_rl_module.py new file mode 100644 index 000000000000..cc14de1bf2d4 --- /dev/null +++ b/rllib/algorithms/ppo/ppo_rl_module.py @@ -0,0 +1,77 @@ +""" +This file holds framework-agnostic components for PPO's RLModules.
+""" + +import abc +from typing import Type + +from ray.rllib.core.models.base import ActorCriticEncoder +from ray.rllib.core.models.specs.specs_dict import SpecDict +from ray.rllib.core.rl_module.rl_module import RLModule +from ray.rllib.models.distributions import Distribution +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import ExperimentalAPI +from ray.rllib.utils.annotations import override + + +@ExperimentalAPI +class PPORLModule(RLModule, abc.ABC): + def setup(self): + # __sphinx_doc_begin__ + catalog = self.config.get_catalog() + + # Build models from catalog + self.encoder = catalog.build_actor_critic_encoder(framework=self.framework) + self.pi = catalog.build_pi_head(framework=self.framework) + self.vf = catalog.build_vf_head(framework=self.framework) + + self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) + # __sphinx_doc_end__ + + assert isinstance(self.encoder, ActorCriticEncoder) + + def get_train_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + def get_exploration_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + def get_inference_action_dist_cls(self) -> Type[Distribution]: + return self.action_dist_cls + + @override(RLModule) + def get_initial_state(self) -> dict: + if hasattr(self.encoder, "get_initial_state"): + return self.encoder.get_initial_state() + else: + return {} + + @override(RLModule) + def input_specs_inference(self) -> SpecDict: + return self.input_specs_exploration() + + @override(RLModule) + def output_specs_inference(self) -> SpecDict: + return [SampleBatch.ACTION_DIST_INPUTS] + + @override(RLModule) + def input_specs_exploration(self): + return [SampleBatch.OBS] + + @override(RLModule) + def output_specs_exploration(self) -> SpecDict: + return [ + SampleBatch.VF_PREDS, + SampleBatch.ACTION_DIST_INPUTS, + ] + + @override(RLModule) + def input_specs_train(self) -> SpecDict: + return 
self.input_specs_exploration() + + @override(RLModule) + def output_specs_train(self) -> SpecDict: + return [ + SampleBatch.VF_PREDS, + SampleBatch.ACTION_DIST_INPUTS, + ] diff --git a/rllib/algorithms/ppo/ppo_tf_policy.py b/rllib/algorithms/ppo/ppo_tf_policy.py index a00f8c037eb6..76e8d0161689 100644 --- a/rllib/algorithms/ppo/ppo_tf_policy.py +++ b/rllib/algorithms/ppo/ppo_tf_policy.py @@ -89,11 +89,11 @@ def __init__( # Initialize MixIns. ValueNetworkMixin.__init__(self, config) - KLCoeffMixin.__init__(self, config) EntropyCoeffSchedule.__init__( self, config["entropy_coeff"], config["entropy_coeff_schedule"] ) LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + KLCoeffMixin.__init__(self, config) # Note: this is a bit ugly, but loss and optimizer initialization must # happen after all the MixIns are initialized. diff --git a/rllib/algorithms/ppo/ppo_torch_policy.py b/rllib/algorithms/ppo/ppo_torch_policy.py index df45eefeb14c..26a52dbe4d2b 100644 --- a/rllib/algorithms/ppo/ppo_torch_policy.py +++ b/rllib/algorithms/ppo/ppo_torch_policy.py @@ -44,8 +44,6 @@ class PPOTorchPolicy( def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.algorithms.ppo.ppo.PPOConfig().to_dict(), **config) - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. validate_config(config) TorchPolicyV2.__init__( @@ -63,7 +61,6 @@ def __init__(self, observation_space, action_space, config): ) KLCoeffMixin.__init__(self, config) - # TODO: Don't require users to call this manually. 
self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index 12e910ed8599..e16ea35641e6 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -2,6 +2,7 @@ import unittest import numpy as np import torch +import tempfile import tensorflow as tf import tree # pip install dm-tree @@ -11,7 +12,6 @@ from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.test_utils import check, framework_iterator -from ray.rllib.utils.metrics import ALL_MODULES from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, @@ -67,6 +67,7 @@ def test_loss(self): fcnet_activation="linear", vf_share_layers=False, ), + _enable_learner_api=True, ) .rl_module( _enable_rl_module_api=True, @@ -74,8 +75,8 @@ def test_loss(self): ) for fw in framework_iterator(config, ("tf2", "torch"), with_eager_tracing=True): - trainer = config.build() - policy = trainer.get_policy() + algo = config.build() + policy = algo.get_policy() train_batch = SampleBatch(FAKE_BATCH) train_batch = compute_gae_for_sample_batch(policy, train_batch) @@ -86,15 +87,11 @@ def test_loss(self): lambda x: torch.as_tensor(x).float(), train_batch ) else: - # tf train_batch = tree.map_structure( lambda x: tf.convert_to_tensor(x), train_batch ) - policy_loss = policy.loss(policy.model, policy.dist_class, train_batch) - algo_config = config.copy(copy_frozen=False) - algo_config.training(_enable_learner_api=True) algo_config.validate() algo_config.freeze() @@ -109,13 +106,55 @@ def test_loss(self): ) learner_group = learner_group_config.build() - # load the trainer weights onto the learner_group - learner_group.set_weights(trainer.get_weights()) - results = learner_group.update(train_batch.as_multi_agent()) + # Load the algo weights onto the 
learner_group. + learner_group.set_weights(algo.get_weights()) + learner_group.update(train_batch.as_multi_agent()) + + algo.stop() - learner_group_loss = results[ALL_MODULES]["total_loss"] + def test_save_load_state(self): + """Tests saving and loading the state of the PPO Learner Group.""" + config = ( + ppo.PPOConfig() + .environment("CartPole-v1") + .rollouts( + num_rollout_workers=0, + ) + .training( + gamma=0.99, + model=dict( + fcnet_hiddens=[10, 10], + fcnet_activation="linear", + vf_share_layers=False, + ), + _enable_learner_api=True, + ) + .rl_module( + _enable_rl_module_api=True, + ) + ) + algo = config.build() + policy = algo.get_policy() - check(learner_group_loss, policy_loss) + for _ in framework_iterator(config, ("tf2", "torch"), with_eager_tracing=True): + algo_config = config.copy(copy_frozen=False) + algo_config.validate() + algo_config.freeze() + learner_group_config = algo_config.get_learner_group_config( + SingleAgentRLModuleSpec( + module_class=algo_config.rl_module_spec.module_class, + observation_space=policy.observation_space, + action_space=policy.action_space, + model_config_dict=policy.config["model"], + catalog_class=PPOCatalog, + ) + ) + learner_group1 = learner_group_config.build() + learner_group2 = learner_group_config.build() + with tempfile.TemporaryDirectory() as tmpdir: + learner_group1.save_state(tmpdir) + learner_group2.load_state(tmpdir) + check(learner_group1.get_state(), learner_group2.get_state()) if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py index 3c861d6aa8ed..938e285b1eb3 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py @@ -15,6 +15,7 @@ from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import ( PPOTorchRLModule, ) +from ray.rllib.core.models.base import STATE_IN from ray.rllib.core.rl_module.rl_module import RLModuleConfig from 
ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.utils.numpy import convert_to_numpy @@ -46,7 +47,7 @@ def get_expected_module_config( return config -def dummy_torch_ppo_loss(batch, fwd_out): +def dummy_torch_ppo_loss(module, batch, fwd_out): """Dummy PPO loss function for testing purposes. Will eventually use the actual PPO loss function implemented in the PPOTfTrainer. @@ -63,19 +64,24 @@ def dummy_torch_ppo_loss(batch, fwd_out): # this is not exactly a ppo loss, just something to show that the # forward train works adv = batch[SampleBatch.REWARDS] - fwd_out[SampleBatch.VF_PREDS] - actor_loss = -(fwd_out[SampleBatch.ACTION_LOGP] * adv).mean() + action_dist_class = module.get_train_action_dist_cls() + action_probs = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ).logp(batch[SampleBatch.ACTIONS]) + actor_loss = -(action_probs * adv).mean() critic_loss = (adv**2).mean() loss = actor_loss + critic_loss return loss -def dummy_tf_ppo_loss(batch, fwd_out): +def dummy_tf_ppo_loss(module, batch, fwd_out): """Dummy PPO loss function for testing purposes. Will eventually use the actual PPO loss function implemented in the PPOTfTrainer. Args: + module: PPOTfRLModule batch: SampleBatch used for training. fwd_out: Forward output of the model. 
@@ -83,7 +89,10 @@ def dummy_tf_ppo_loss(batch, fwd_out): Loss tensor """ adv = batch[SampleBatch.REWARDS] - fwd_out[SampleBatch.VF_PREDS] - action_probs = fwd_out[SampleBatch.ACTION_DIST].logp(batch[SampleBatch.ACTIONS]) + action_dist_class = module.get_train_action_dist_cls() + action_probs = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ).logp(batch[SampleBatch.ACTIONS]) actor_loss = -tf.reduce_mean(action_probs * adv) critic_loss = tf.reduce_mean(tf.square(adv)) return actor_loss + critic_loss @@ -105,9 +114,13 @@ def _get_input_batch_from_obs(framework, obs): if framework == "torch": batch = { SampleBatch.OBS: convert_to_torch_tensor(obs)[None], + STATE_IN: None, } else: - batch = {SampleBatch.OBS: tf.convert_to_tensor([obs])} + batch = { + SampleBatch.OBS: tf.convert_to_tensor([obs]), + STATE_IN: None, + } return batch @@ -133,9 +146,6 @@ def test_rollouts(self): if lstm and fw == "tf2": # LSTM not implemented in TF2 yet continue - if env_name == "ALE/Breakout-v5" and fw == "tf2": - # TODO(Artur): Implement CNN in TF2. - continue print(f"[FW={fw} | [ENV={env_name}] | [FWD={fwd_fn}] | LSTM" f"={lstm}") if env_name.startswith("ALE/"): env = gym.make("GymV26Environment-v0", env_id=env_name) @@ -181,9 +191,6 @@ def test_forward_train(self): if lstm and fw == "tf2": # LSTM not implemented in TF2 yet continue - if env_name == "ALE/Breakout-v5" and fw == "tf2": - # TODO(Artur): Implement CNN in TF2. - continue print(f"[FW={fw} | [ENV={env_name}] | LSTM={lstm}") # TODO(Artur): Figure out why this is needed and fix it. 
if env_name.startswith("ALE/"): @@ -220,9 +227,13 @@ def test_forward_train(self): # input_batch[SampleBatch.SEQ_LENS] = np.array([1]) fwd_out = module.forward_exploration(input_batch) - _action = fwd_out["action_dist"].sample() + action_dist_cls = module.get_exploration_action_dist_cls() + action_dist = action_dist_cls.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + _action = action_dist.sample() action = convert_to_numpy(_action[0]) - action_logp = convert_to_numpy(fwd_out["action_dist"].logp(_action)[0]) + action_logp = convert_to_numpy(action_dist.logp(_action)[0]) new_obs, reward, terminated, truncated, _ = env.step(action) new_obs = preprocessor.transform(new_obs) output_batch = { @@ -233,6 +244,7 @@ def test_forward_train(self): SampleBatch.REWARDS: np.array(reward), SampleBatch.TERMINATEDS: np.array(terminated), SampleBatch.TRUNCATEDS: np.array(truncated), + STATE_IN: None, } # TODO (Artur): Un-uncomment once Policy supports RNN @@ -259,7 +271,7 @@ def test_forward_train(self): module.to("cpu") module.train() fwd_out = module.forward_train(fwd_in) - loss = dummy_torch_ppo_loss(fwd_in, fwd_out) + loss = dummy_torch_ppo_loss(module, fwd_in, fwd_out) loss.backward() # check that all neural net parameters have gradients @@ -274,7 +286,7 @@ def test_forward_train(self): # fwd_in[SampleBatch.SEQ_LENS] = torch.Tensor([10]) with tf.GradientTape() as tape: fwd_out = module.forward_train(fwd_in) - loss = dummy_tf_ppo_loss(fwd_in, fwd_out) + loss = dummy_tf_ppo_loss(module, fwd_in, fwd_out) grads = tape.gradient(loss, module.trainable_variables) for grad in grads: self.assertIsNotNone(grad) diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index 62853861b65a..b365f25a1043 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -4,13 +4,19 @@ import ray import ray.rllib.algorithms.ppo as ppo +from 
ray.rllib.algorithms.ppo.ppo_learner import ( + LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY, +) from ray.rllib.algorithms.callbacks import DefaultCallbacks from ray.rllib.algorithms.ppo.tests.test_ppo import PENDULUM_FAKE_BATCH +from ray.rllib.core.learner.learner import ( + LEARNER_RESULTS_CURR_LR_KEY, +) from ray.rllib.evaluation.postprocessing import ( compute_gae_for_sample_batch, ) from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO from ray.rllib.utils.test_utils import ( check, check_compute_single_action, @@ -47,36 +53,27 @@ def get_model_config(framework, lstm=False): class MyCallbacks(DefaultCallbacks): - @staticmethod - def _check_lr_torch(policy, policy_id): - for j, opt in enumerate(policy._optimizers): - for p in opt.param_groups: - assert p["lr"] == policy.cur_lr, "LR scheduling error!" - - @staticmethod - def _check_lr_tf(policy, policy_id): - lr = policy.cur_lr - sess = policy.get_session() - if sess: - lr = sess.run(lr) - optim_lr = sess.run(policy._optimizer._lr) - else: - lr = lr.numpy() - optim_lr = policy._optimizer.lr.numpy() - assert lr == optim_lr, "LR scheduling error!" - def on_train_result(self, *, algorithm, result: dict, **kwargs): - stats = result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY] - # Learning rate should go to 0 after 1 iter. - check(stats["cur_lr"], 5e-5 if algorithm.iteration == 1 else 0.0) + stats = result["info"][LEARNER_INFO][DEFAULT_POLICY_ID] # Entropy coeff goes to 0.05, then 0.0 (per iter). - check(stats["entropy_coeff"], 0.1 if algorithm.iteration == 1 else 0.05) + check( + stats[LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY], + 0.05 if algorithm.iteration == 1 else 0.0, + ) - algorithm.workers.foreach_policy( - self._check_lr_torch + # Learning rate should decrease by 0.0001/4 per iteration. 
+ check( + stats[LEARNER_RESULTS_CURR_LR_KEY], + 0.0000075 if algorithm.iteration == 1 else 0.000005, + ) + # Compare reported curr lr vs the actual lr found in the optimizer object. + optim = algorithm.learner_group._learner._named_optimizers[DEFAULT_POLICY_ID] + actual_optimizer_lr = ( + optim.param_groups[0]["lr"] if algorithm.config.framework_str == "torch" - else self._check_lr_tf + else optim.lr ) + check(stats[LEARNER_RESULTS_CURR_LR_KEY], actual_optimizer_lr) class TestPPO(unittest.TestCase): @@ -96,26 +93,24 @@ def test_ppo_compilation_and_schedule_mixins(self): ppo.PPOConfig() .training( num_sgd_iter=2, - # Setup lr schedule for testing. - lr_schedule=[[0, 5e-5], [128, 0.0]], + # Setup lr schedule for testing lr-scheduling correctness. + lr_schedule=[[0, 0.00001], [512, 0.0]], # 512=4x128 # Set entropy_coeff to a faulty value to proof that it'll get # overridden by the schedule below (which is expected). entropy_coeff=100.0, - entropy_coeff_schedule=[[0, 0.1], [256, 0.0]], + entropy_coeff_schedule=[[0, 0.1], [256, 0.0]], # 256=2x128 train_batch_size=128, - # TODO (Kourosh): Enable when the scheduler is supported in the new - # Learner API stack. - _enable_learner_api=False, + _enable_learner_api=True, ) .rollouts( num_rollout_workers=1, # Test with compression. - compress_observations=True, + # compress_observations=True, enable_connectors=True, ) .callbacks(MyCallbacks) .rl_module(_enable_rl_module_api=True) - ) # For checking lr-schedule correctness. + ) num_iterations = 2 @@ -124,9 +119,6 @@ def test_ppo_compilation_and_schedule_mixins(self): ): # TODO (Kourosh) Bring back "FrozenLake-v1" for env in ["CartPole-v1", "Pendulum-v1", "ALE/Breakout-v5"]: - if env == "ALE/Breakout-v5" and fw == "tf2": - # TODO(Artur): Implement CNN in TF2. 
- continue print("Env={}".format(env)) # TODO (Kourosh, Avnishn): for now just do lstm=False for lstm in [False]: @@ -134,11 +126,22 @@ def test_ppo_compilation_and_schedule_mixins(self): config.training(model=get_model_config(fw, lstm=lstm)) algo = config.build(env=env) - policy = algo.get_policy() - entropy_coeff = algo.get_policy().entropy_coeff - lr = policy.cur_lr + # TODO: Maybe add an API to get the Learner(s) instances within + # a learner group, remote or not. + learner = algo.learner_group._learner + optim = algo.learner_group._learner._named_optimizers[ + DEFAULT_POLICY_ID + ] + # Check initial LR directly set in optimizer vs the first (ts=0) + # value from the schedule. + lr = optim.param_groups[0]["lr"] if fw == "torch" else optim.lr + check(lr, config.lr_schedule[0][1]) + + # Check current entropy coeff value using the respective Scheduler. + entropy_coeff = learner.entropy_coeff_scheduler.get_current_value( + DEFAULT_POLICY_ID + ) check(entropy_coeff, 0.1) - check(lr, config.lr) for i in range(num_iterations): results = algo.train() @@ -164,21 +167,22 @@ def test_ppo_exploration_setup(self): enable_connectors=True, ) .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) obs = np.array(0) - for fw in framework_iterator( + for _ in framework_iterator( config, frameworks=("torch", "tf2"), with_eager_tracing=True ): # Default Agent should be setup with StochasticSampling. - trainer = config.build() + algo = config.build() # explore=False, always expect the same (deterministic) action. 
- a_ = trainer.compute_single_action( + a_ = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0) ) for _ in range(50): - a = trainer.compute_single_action( + a = algo.compute_single_action( obs, explore=False, prev_action=np.array(2), @@ -190,12 +194,12 @@ def test_ppo_exploration_setup(self): actions = [] for _ in range(300): actions.append( - trainer.compute_single_action( + algo.compute_single_action( obs, prev_action=np.array(2), prev_reward=np.array(1.0) ) ) check(np.mean(actions), 1.5, atol=0.2) - trainer.stop() + algo.stop() def test_ppo_free_log_std_with_rl_modules(self): """Tests the free log std option works.""" @@ -220,27 +224,23 @@ def test_ppo_free_log_std_with_rl_modules(self): .training(_enable_learner_api=True) ) - # TODO(Artur): Enable this test for tf2 once we support CNNs - for fw in framework_iterator(config, frameworks=["tf2", "torch"]): - trainer = config.build() - policy = trainer.get_policy() + for fw in framework_iterator(config, frameworks=("torch", "tf2")): + algo = config.build() + policy = algo.get_policy() + learner = algo.learner_group._learner + module = learner.module[DEFAULT_POLICY_ID] # Check the free log std var is created. if fw == "torch": - matching = [ - v for (n, v) in policy.model.named_parameters() if "log_std" in n - ] + matching = [v for (n, v) in module.named_parameters() if "log_std" in n] else: matching = [ - v for v in policy.model.trainable_variables if "log_std" in str(v) + v for v in module.trainable_variables if "log_std" in str(v) ] assert len(matching) == 1, matching log_std_var = matching[0] - # linter yells at you if you don't pass in the parameters. 
- # reason: https://docs.python-guide.org/writing/gotchas/ - # #late-binding-closures - def get_value(fw=fw, policy=policy, log_std_var=log_std_var): + def get_value(): if fw == "torch": return log_std_var.detach().cpu().numpy()[0] else: @@ -250,14 +250,13 @@ def get_value(fw=fw, policy=policy, log_std_var=log_std_var): init_std = get_value() assert init_std == 0.0, init_std batch = compute_gae_for_sample_batch(policy, PENDULUM_FAKE_BATCH.copy()) - if fw == "torch": - batch = policy._lazy_tensor_dict(batch) - policy.learn_on_batch(batch) + batch = policy._lazy_tensor_dict(batch) + algo.learner_group.update(batch.as_multi_agent()) # Check the variable is updated. post_std = get_value() assert post_std != 0.0, post_std - trainer.stop() + algo.stop() if __name__ == "__main__": diff --git a/rllib/algorithms/ppo/tf/ppo_tf_learner.py b/rllib/algorithms/ppo/tf/ppo_tf_learner.py index f3db723894f1..1794990713f6 100644 --- a/rllib/algorithms/ppo/tf/ppo_tf_learner.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_learner.py @@ -1,8 +1,16 @@ import logging -from typing import Mapping, Any - -from ray.rllib.algorithms.ppo.ppo_base_learner import PPOBaseLearner +from typing import Any, Dict, Mapping + +from ray.rllib.algorithms.ppo.ppo_learner import ( + LEARNER_RESULTS_KL_KEY, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY, + LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY, + PPOLearner, +) +from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY from ray.rllib.core.learner.tf.tf_learner import TfLearner +from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_tf @@ -15,8 +23,8 @@ logger = logging.getLogger(__name__) -class PPOTfLearner(PPOBaseLearner, TfLearner): - """Implements tf-specific PPO loss logic on top of PPOBaseLearner. 
+class PPOTfLearner(PPOLearner, TfLearner): + """Implements tf-specific PPO loss logic on top of PPOLearner. This class implements the ppo loss under `_compute_loss_per_module()`. """ @@ -31,14 +39,20 @@ def compute_loss_per_module( # learning rate for that agent. # TODO (Kourosh): come back to RNNs later - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( + action_dist_class_train = self.module[module_id].get_train_action_dist_cls() + action_dist_class_exploration = self.module[ + module_id + ].get_exploration_action_dist_cls() + curr_action_dist = action_dist_class_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + prev_action_dist = action_dist_class_exploration.from_logits( batch[SampleBatch.ACTION_DIST_INPUTS] ) logp_ratio = tf.exp( - fwd_out[SampleBatch.ACTION_LOGP] - batch[SampleBatch.ACTION_LOGP] + curr_action_dist.logp(batch[SampleBatch.ACTIONS]) + - batch[SampleBatch.ACTION_LOGP] ) # Only calculate kl loss if necessary (kl-coeff > 0.0). @@ -53,13 +67,13 @@ def compute_loss_per_module( "This can happen naturally in deterministic " "environments where the optimal policy has zero mass " "for a specific action. To fix this issue, consider " - "setting the coefficient for the KL loss term to " - "zero or increasing policy entropy." + "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your " + "config." 
) else: mean_kl_loss = tf.constant(0.0, dtype=logp_ratio.dtype) - curr_entropy = fwd_out["entropy"] + curr_entropy = curr_action_dist.entropy() mean_entropy = tf.reduce_mean(curr_entropy) surrogate_loss = tf.minimum( @@ -88,32 +102,46 @@ def compute_loss_per_module( total_loss = tf.reduce_mean( -surrogate_loss + self.hps.vf_loss_coeff * vf_loss_clipped - - self.entropy_coeff * curr_entropy + - self.entropy_coeff_scheduler.get_current_value(module_id) * curr_entropy ) # Add mean_kl_loss (already processed through `reduce_mean_valid`), # if necessary. if self.hps.kl_coeff > 0.0: - total_loss += self.kl_coeff * mean_kl_loss + total_loss += self.curr_kl_coeffs_per_module[module_id] * mean_kl_loss return { self.TOTAL_LOSS_KEY: total_loss, - "policy_loss": -tf.reduce_mean(surrogate_loss), - "vf_loss": mean_vf_loss, - "unclipped_vf_loss": mean_vf_unclipped_loss, - "vf_explained_var": explained_variance( + POLICY_LOSS_KEY: -tf.reduce_mean(surrogate_loss), + VF_LOSS_KEY: mean_vf_loss, + LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY: mean_vf_unclipped_loss, + LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY: explained_variance( batch[Postprocessing.VALUE_TARGETS], value_fn_out ), - "entropy": mean_entropy, - "kl": mean_kl_loss, - "entropy_coeff": self.entropy_coeff, - "cur_kl_coeff": self.kl_coeff, + ENTROPY_KEY: mean_entropy, + LEARNER_RESULTS_KL_KEY: mean_kl_loss, } - @override(PPOBaseLearner) - def _create_kl_variable(self, value: float) -> Any: - return tf.Variable(value, trainable=False, dtype=tf.float32) + @override(PPOLearner) + def additional_update_per_module( + self, module_id: ModuleID, sampled_kl_values: dict, timestep: int + ) -> Dict[str, Any]: + assert sampled_kl_values, "Sampled KL values are empty." + + results = super().additional_update_per_module( + module_id, + sampled_kl_values=sampled_kl_values, + timestep=timestep, + ) - @override(PPOBaseLearner) - def _set_kl_coeff(self, value: float) -> None: - self.kl_coeff.assign(value) + # Update KL coefficient. 
+ sampled_kl = sampled_kl_values[module_id] + curr_var = self.curr_kl_coeffs_per_module[module_id] + if sampled_kl > 2.0 * self.hps.kl_target: + # TODO (Kourosh) why not 2? + curr_var.assign(curr_var * 1.5) + elif sampled_kl < 0.5 * self.hps.kl_target: + curr_var.assign(curr_var * 0.5) + results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.numpy()}) + + return results diff --git a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py b/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py deleted file mode 100644 index 66aa0e408d3a..000000000000 --- a/rllib/algorithms/ppo/tf/ppo_tf_policy_rlm.py +++ /dev/null @@ -1,183 +0,0 @@ -import logging -from typing import Dict, List, Union - -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tf_mixins import ( - EntropyCoeffSchedule, - KLCoeffMixin, - LearningRateSchedule, -) -from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_tf - -from ray.rllib.utils.tf_utils import ( - explained_variance, - warn_if_infinite_kl_divergence, -) - -from ray.rllib.utils.typing import TensorType - -tf1, tf, tfv = try_import_tf() - -logger = logging.getLogger(__name__) - - -class PPOTfPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - KLCoeffMixin, - EagerTFPolicyV2, -): - """PyTorch policy class used with PPO. - - This class is copied from PPOTFPolicy and is modified to support RLModules. - Some subtle differences: - - if config._enable_rl_module api is true make_rl_module should be implemented by - the policy the policy is assumed to be compatible with rl_modules (i.e. self.model - would be an RLModule) - - Tower stats no longer belongs to the model (i.e. 
RLModule) instead it belongs to - the policy itself. - - Connectors should be enabled to use this policy - - So far it only works for vectorized obs and action spaces (Fully connected neural - networks). we need model catalog to work for other obs and action spaces. - - # TODO: In the future we will deprecate doing all phases of training, exploration, - # and inference via one policy abstraction. Instead, we will use separate - # abstractions for each phase. For training (i.e. gradient updates, given the - # sample that have been collected) we will use Learner which will own one or - # possibly many RLModules, and RLOptimizer. For exploration, we will use RLSampler - # which will own RLModule, and RLTrajectoryProcessor. The exploration and inference - # phase details are TBD but the whole point is to make rllib extremely modular. - """ - - def __init__(self, observation_space, action_space, config): - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. - self.framework = "tf2" - EagerTFPolicyV2.enable_eager_execution_if_necessary() - validate_config(config) - # Initialize MixIns. 
- LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) - EagerTFPolicyV2.__init__(self, observation_space, action_space, config) - - self.maybe_initialize_optimizer_and_loss() - - @override(EagerTFPolicyV2) - def loss( - self, - model: Union[ModelV2, "tf.keras.Model"], - dist_class, - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - - fwd_out = model.forward_train(train_batch) - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( - train_batch[SampleBatch.ACTION_DIST_INPUTS] - ) - - logp_ratio = tf.exp( - fwd_out[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTION_LOGP] - ) - - # Only calculate kl loss if necessary (kl-coeff > 0.0). - if self.config["kl_coeff"] > 0.0: - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl_loss = tf.reduce_mean(action_kl) - warn_if_infinite_kl_divergence(self, mean_kl_loss) - else: - mean_kl_loss = tf.constant(0.0) - - curr_entropy = fwd_out["entropy"] - mean_entropy = tf.reduce_mean(curr_entropy) - - surrogate_loss = tf.minimum( - train_batch[Postprocessing.ADVANTAGES] * logp_ratio, - train_batch[Postprocessing.ADVANTAGES] - * tf.clip_by_value( - logp_ratio, - 1 - self.config["clip_param"], - 1 + self.config["clip_param"], - ), - ) - - # Compute a value function loss. - if self.config["use_critic"]: - value_fn_out = fwd_out[SampleBatch.VF_PREDS] - vf_loss = tf.math.square( - value_fn_out - train_batch[Postprocessing.VALUE_TARGETS] - ) - vf_loss_clipped = tf.clip_by_value( - vf_loss, - 0, - self.config["vf_clip_param"], - ) - mean_vf_loss = tf.reduce_mean(vf_loss_clipped) - mean_vf_unclipped_loss = tf.reduce_mean(vf_loss) - # Ignore the value function. 
- else: - mean_vf_unclipped_loss = tf.constant(0.0) - value_fn_out = vf_loss_clipped = mean_vf_loss = tf.constant(0.0) - - total_loss = tf.reduce_mean( - -surrogate_loss - + self.config["vf_loss_coeff"] * vf_loss_clipped - - self.entropy_coeff * curr_entropy - ) - # Add mean_kl_loss (already processed through `reduce_mean_valid`), - # if necessary. - if self.config["kl_coeff"] > 0.0: - total_loss += self.kl_coeff * mean_kl_loss - - # Store stats in policy for stats_fn. - self._total_loss = total_loss - self._mean_policy_loss = tf.reduce_mean(-surrogate_loss) - self._mean_vf_loss = mean_vf_loss - self._unclipped_mean_vf_loss = mean_vf_unclipped_loss - self._mean_entropy = mean_entropy - # Backward compatibility: Deprecate self._mean_kl. - self._mean_kl_loss = self._mean_kl = mean_kl_loss - self._value_fn_out = value_fn_out - self._value_mean = tf.reduce_mean(value_fn_out) - - return total_loss - - @override(EagerTFPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return { - "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64), - "cur_lr": tf.cast(self.cur_lr, tf.float64), - "total_loss": self._total_loss, - "policy_loss": self._mean_policy_loss, - "vf_loss": self._mean_vf_loss, - "unclipped_vf_loss": self._unclipped_mean_vf_loss, - "vf_explained_var": explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], self._value_fn_out - ), - "kl": self._mean_kl_loss, - "entropy": self._mean_entropy, - "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), - "value_mean": tf.cast(self._value_mean, tf.float64), - } - - @override(EagerTFPolicyV2) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - sample_batch = super().postprocess_trajectory(sample_batch) - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) diff --git a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py index e84ee2bec90b..63b51892f01d 100644 
--- a/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py +++ b/rllib/algorithms/ppo/tf/ppo_tf_rl_module.py @@ -1,12 +1,10 @@ -from typing import Mapping, Any, List +from typing import Mapping, Any -from ray.rllib.algorithms.ppo.ppo_base_rl_module import PPORLModuleBase +from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule from ray.rllib.core.models.base import ACTOR, CRITIC -from ray.rllib.core.models.tf.encoder import ENCODER_OUT -from ray.rllib.models.distributions import Distribution +from ray.rllib.core.models.tf.encoder import ENCODER_OUT, STATE_OUT from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule -from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf @@ -15,103 +13,41 @@ tf1, tf, _ = try_import_tf() -class PPOTfRLModule(PPORLModuleBase, TfRLModule): +class PPOTfRLModule(PPORLModule, TfRLModule): framework: str = "tf2" def __init__(self, *args, **kwargs): TfRLModule.__init__(self, *args, **kwargs) - PPORLModuleBase.__init__(self, *args, **kwargs) - - # TODO(Artur): Comment in as soon as we support RNNs from Polciy side - # @override(RLModule) - # def get_initial_state(self) -> NestedDict: - # if hasattr(self.encoder, "get_initial_state"): - # return self.encoder.get_initial_state() - # else: - # return NestedDict({}) - - @override(RLModule) - def input_specs_train(self) -> List[str]: - return [SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.ACTION_LOGP] - - @override(RLModule) - def output_specs_train(self) -> List[str]: - return [ - SampleBatch.ACTION_DIST_INPUTS, - SampleBatch.ACTION_DIST, - SampleBatch.ACTION_LOGP, - SampleBatch.VF_PREDS, - "entropy", - ] - - @override(RLModule) - def input_specs_exploration(self): - return [] - - @override(RLModule) - def output_specs_exploration(self) -> List[str]: - return [ - 
SampleBatch.ACTION_DIST, - SampleBatch.VF_PREDS, - SampleBatch.ACTION_DIST_INPUTS, - ] - - @override(RLModule) - def input_specs_inference(self) -> SpecDict: - return self.input_specs_exploration() - - @override(RLModule) - def output_specs_inference(self) -> SpecDict: - return SpecDict({SampleBatch.ACTION_DIST: Distribution}) + PPORLModule.__init__(self, *args, **kwargs) @override(RLModule) def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]: output = {} - # TODO (Artur): Remove this once Policy supports RNN - # if self.encoder.config.shared: - # batch[STATE_IN] = None - # else: - # batch[STATE_IN] = { - # ACTOR: None, - # CRITIC: None, - # } - # batch[SampleBatch.SEQ_LENS] = None - encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Actions action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - action_dist = self.action_dist_cls.from_logits(action_logits) - output[SampleBatch.ACTION_DIST] = action_dist.to_deterministic() + output[SampleBatch.ACTION_DIST_INPUTS] = action_logits return output @override(RLModule) def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: """PPO forward pass during exploration. + Besides the action distribution, this method also returns the parameters of the policy distribution to be used for computing KL divergence between the old policy and the new policy during training. 
""" output = {} - # TODO (Artur): Remove this once Policy supports RNN - # if self.encoder.config.shared: - # batch[STATE_IN] = None - # else: - # batch[STATE_IN] = { - # ACTOR: None, - # CRITIC: None, - # } - # batch[SampleBatch.SEQ_LENS] = None - # Shared encoder encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Value head vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) @@ -121,9 +57,6 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits - output[SampleBatch.ACTION_DIST] = self.action_dist_cls.from_logits( - logits=action_logits - ) return output @@ -131,35 +64,17 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: def _forward_train(self, batch: NestedDict): output = {} - # TODO (Artur): Remove this once Policy supports RNN - # if self.encoder.config.shared: - # batch[STATE_IN] = None - # else: - # batch[STATE_IN] = { - # ACTOR: None, - # CRITIC: None, - # } - # batch[SampleBatch.SEQ_LENS] = None - # Shared encoder encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Value head vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) output[SampleBatch.VF_PREDS] = tf.squeeze(vf_out, axis=-1) # Policy head - pi_out = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - action_logits = pi_out - action_dist = self.action_dist_cls.from_logits(logits=action_logits) - logp = action_dist.logp(batch[SampleBatch.ACTIONS]) - entropy = action_dist.entropy() - + action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits - 
output[SampleBatch.ACTION_DIST] = action_dist - output[SampleBatch.ACTION_LOGP] = logp - output["entropy"] = entropy return output diff --git a/rllib/algorithms/ppo/torch/ppo_torch_learner.py b/rllib/algorithms/ppo/torch/ppo_torch_learner.py index f605b06eb970..0b3ba822a066 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_learner.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_learner.py @@ -1,8 +1,16 @@ import logging -from typing import Mapping, Any - -from ray.rllib.algorithms.ppo.ppo_base_learner import PPOBaseLearner +from typing import Any, Dict, Mapping + +from ray.rllib.algorithms.ppo.ppo_learner import ( + LEARNER_RESULTS_KL_KEY, + LEARNER_RESULTS_CURR_KL_COEFF_KEY, + LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY, + LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY, + PPOLearner, +) +from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY, ENTROPY_KEY from ray.rllib.core.learner.torch.torch_learner import TorchLearner +from ray.rllib.core.rl_module.rl_module import ModuleID from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch @@ -15,8 +23,8 @@ logger = logging.getLogger(__name__) -class PPOTorchLearner(PPOBaseLearner, TorchLearner): - """Implements torch-specific PPO loss logic on top of PPOBaseLearner. +class PPOTorchLearner(PPOLearner, TorchLearner): + """Implements torch-specific PPO loss logic on top of PPOLearner. This class implements the ppo loss under `_compute_loss_per_module()`. """ @@ -31,18 +39,23 @@ def compute_loss_per_module( # learning rate for that agent. 
# TODO (Kourosh): come back to RNNs later - # make sure all the coefficients are on the same device as the model - if self.kl_coeff.device != self._device: - self.kl_coeff = self.kl_coeff.to(self._device) + action_dist_class_train = ( + self.module[module_id].unwrapped().get_train_action_dist_cls() + ) + action_dist_class_exploration = ( + self.module[module_id].unwrapped().get_exploration_action_dist_cls() + ) - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( + curr_action_dist = action_dist_class_train.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + prev_action_dist = action_dist_class_exploration.from_logits( batch[SampleBatch.ACTION_DIST_INPUTS] ) logp_ratio = torch.exp( - fwd_out[SampleBatch.ACTION_LOGP] - batch[SampleBatch.ACTION_LOGP] + curr_action_dist.logp(batch[SampleBatch.ACTIONS]) + - batch[SampleBatch.ACTION_LOGP] ) # Only calculate kl loss if necessary (kl-coeff > 0.0). @@ -57,13 +70,13 @@ def compute_loss_per_module( "This can happen naturally in deterministic " "environments where the optimal policy has zero mass " "for a specific action. To fix this issue, consider " - "setting the coefficient for the KL loss term to " - "zero or increasing policy entropy." + "setting `kl_coeff` to 0.0 or increasing `entropy_coeff` in your " + "config." ) else: mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) - curr_entropy = fwd_out["entropy"] + curr_entropy = curr_action_dist.entropy() mean_entropy = torch.mean(curr_entropy) surrogate_loss = torch.min( @@ -88,32 +101,46 @@ def compute_loss_per_module( total_loss = torch.mean( -surrogate_loss + self.hps.vf_loss_coeff * vf_loss_clipped - - self.entropy_coeff * curr_entropy + - self.entropy_coeff_scheduler.get_current_value(module_id) * curr_entropy ) # Add mean_kl_loss (already processed through `reduce_mean_valid`), # if necessary. 
if self.hps.kl_coeff > 0.0: - total_loss += self.kl_coeff * mean_kl_loss + total_loss += self.curr_kl_coeffs_per_module[module_id] * mean_kl_loss return { self.TOTAL_LOSS_KEY: total_loss, - "policy_loss": -torch.mean(surrogate_loss), - "vf_loss": mean_vf_loss, - "unclipped_vf_loss": mean_vf_unclipped_loss, - "vf_explained_var": explained_variance( + POLICY_LOSS_KEY: -torch.mean(surrogate_loss), + VF_LOSS_KEY: mean_vf_loss, + LEARNER_RESULTS_VF_LOSS_UNCLIPPED_KEY: mean_vf_unclipped_loss, + LEARNER_RESULTS_VF_EXPLAINED_VAR_KEY: explained_variance( batch[Postprocessing.VALUE_TARGETS], value_fn_out ), - "entropy": mean_entropy, - "kl": mean_kl_loss, - "entropy_coeff": self.entropy_coeff, - "cur_kl_coeff": self.kl_coeff, + ENTROPY_KEY: mean_entropy, + LEARNER_RESULTS_KL_KEY: mean_kl_loss, } - @override(PPOBaseLearner) - def _create_kl_variable(self, value: float) -> Any: - return torch.tensor(value) + @override(PPOLearner) + def additional_update_per_module( + self, module_id: ModuleID, sampled_kl_values: dict, timestep: int + ) -> Dict[str, Any]: + assert sampled_kl_values, "Sampled KL values are empty." + + results = super().additional_update_per_module( + module_id, + sampled_kl_values=sampled_kl_values, + timestep=timestep, + ) - @override(PPOBaseLearner) - def _set_kl_coeff(self, value: float): - self.kl_coeff.data = torch.tensor(value, device=self.kl_coeff.device) + # Update KL coefficient. + sampled_kl = sampled_kl_values[module_id] + curr_var = self.curr_kl_coeffs_per_module[module_id] + if sampled_kl > 2.0 * self.hps.kl_target: + # TODO (Kourosh) why not 2? 
+ curr_var.data *= 1.5 + elif sampled_kl < 0.5 * self.hps.kl_target: + curr_var.data *= 0.5 + results.update({LEARNER_RESULTS_CURR_KL_COEFF_KEY: curr_var.item()}) + + return results diff --git a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py b/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py deleted file mode 100644 index 04ac92fc2ba5..000000000000 --- a/rllib/algorithms/ppo/torch/ppo_torch_policy_rlm.py +++ /dev/null @@ -1,246 +0,0 @@ -import logging -from typing import Dict, List, Type, Union - -from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config -from ray.rllib.evaluation.postprocessing import ( - Postprocessing, - compute_gae_for_sample_batch, -) -from ray.rllib.models.action_dist import ActionDistribution -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_mixins import ( - EntropyCoeffSchedule, - KLCoeffMixin, - LearningRateSchedule, -) -from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.numpy import convert_to_numpy -from ray.rllib.utils.torch_utils import ( - apply_grad_clipping, - explained_variance, - sequence_mask, - warn_if_infinite_kl_divergence, -) -from ray.rllib.utils.typing import TensorType - -torch, nn = try_import_torch() - -logger = logging.getLogger(__name__) - - -class PPOTorchPolicyWithRLModule( - LearningRateSchedule, - EntropyCoeffSchedule, - KLCoeffMixin, - TorchPolicyV2, -): - """PyTorch policy class used with PPO. - - This class is copied from PPOTorchPolicyV2 and is modified to support RLModules. - Some subtle differences: - - if config._enable_rl_module api is true make_rl_module should be implemented by - the policy the policy is assumed to be compatible with rl_modules (i.e. self.model - would be an RLModule) - - Tower stats no longer belongs to the model (i.e. 
RLModule) instead it belongs to - the policy itself. - - Connectors should be enabled to use this policy - - So far it only works for vectorized obs and action spaces (Fully connected neural - networks). we need model catalog to work for other obs and action spaces. - - # TODO: In the future we will deprecate doing all phases of training, exploration, - # and inference via one policy abstraction. Instead, we will use separate - # abstractions for each phase. For training (i.e. gradient updates, given the - # sample that have been collected) we will use Learner which will own one or - # possibly many RLModules, and RLOptimizer. For exploration, we will use RLSampler - # which will own RLModule, and RLTrajectoryProcessor. The exploration and inference - # phase details are TBD but the whole point is to make rllib extremely modular. - """ - - def __init__(self, observation_space, action_space, config): - # TODO: Move into Policy API, if needed at all here. Why not move this into - # `PPOConfig`?. - validate_config(config) - - TorchPolicyV2.__init__( - self, - observation_space, - action_space, - config, - max_seq_len=config["model"]["max_seq_len"], - ) - - LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) - EntropyCoeffSchedule.__init__( - self, config["entropy_coeff"], config["entropy_coeff_schedule"] - ) - KLCoeffMixin.__init__(self, config) - - # TODO: Don't require users to call this manually. - self._initialize_loss_from_dummy_batch() - - @override(TorchPolicyV2) - def loss( - self, - model: ModelV2, - dist_class: Type[ActionDistribution], - train_batch: SampleBatch, - ) -> Union[TensorType, List[TensorType]]: - """Compute loss for Proximal Policy Objective. - - Args: - model: The Model to calculate the loss for. - dist_class: The action distr. class. - train_batch: The training data. - - Returns: - The PPO loss tensor given the input batch. 
- """ - - fwd_out = model.forward_train(train_batch) - curr_action_dist = fwd_out[SampleBatch.ACTION_DIST] - state = fwd_out.get("state_out", {}) - - # TODO (Kourosh): come back to RNNs later - # RNN case: Mask away 0-padded chunks at end of time axis. - if state: - B = len(train_batch[SampleBatch.SEQ_LENS]) - max_seq_len = train_batch[SampleBatch.OBS].shape[0] // B - mask = sequence_mask( - train_batch[SampleBatch.SEQ_LENS], - max_seq_len, - time_major=self.config["model"]["_time_major"], - ) - mask = torch.reshape(mask, [-1]) - num_valid = torch.sum(mask) - - def reduce_mean_valid(t): - return torch.sum(t[mask]) / num_valid - - # non-RNN case: No masking. - else: - mask = None - reduce_mean_valid = torch.mean - - action_dist_class = type(fwd_out[SampleBatch.ACTION_DIST]) - prev_action_dist = action_dist_class.from_logits( - train_batch[SampleBatch.ACTION_DIST_INPUTS] - ) - - logp_ratio = torch.exp( - fwd_out[SampleBatch.ACTION_LOGP] - train_batch[SampleBatch.ACTION_LOGP] - ) - - # Only calculate kl loss if necessary (kl-coeff > 0.0). - if self.config["kl_coeff"] > 0.0: - action_kl = prev_action_dist.kl(curr_action_dist) - mean_kl_loss = reduce_mean_valid(action_kl) - # TODO smorad: should we do anything besides warn? Could discard KL term - # for this update - warn_if_infinite_kl_divergence(self, mean_kl_loss) - else: - mean_kl_loss = torch.tensor(0.0, device=logp_ratio.device) - - curr_entropy = fwd_out["entropy"] - mean_entropy = reduce_mean_valid(curr_entropy) - - surrogate_loss = torch.min( - train_batch[Postprocessing.ADVANTAGES] * logp_ratio, - train_batch[Postprocessing.ADVANTAGES] - * torch.clamp( - logp_ratio, 1 - self.config["clip_param"], 1 + self.config["clip_param"] - ), - ) - - # Compute a value function loss. 
- if self.config["use_critic"]: - value_fn_out = fwd_out[SampleBatch.VF_PREDS] - vf_loss = torch.pow( - value_fn_out - train_batch[Postprocessing.VALUE_TARGETS], 2.0 - ) - vf_loss_clipped = torch.clamp(vf_loss, 0, self.config["vf_clip_param"]) - mean_vf_loss = reduce_mean_valid(vf_loss_clipped) - mean_vf_unclipped_loss = reduce_mean_valid(vf_loss) - # Ignore the value function. - else: - value_fn_out = torch.tensor(0.0).to(surrogate_loss.device) - mean_vf_unclipped_loss = vf_loss_clipped = mean_vf_loss = torch.tensor( - 0.0 - ).to(surrogate_loss.device) - - total_loss = reduce_mean_valid( - -surrogate_loss - + self.config["vf_loss_coeff"] * vf_loss_clipped - - self.entropy_coeff * curr_entropy - ) - - # Add mean_kl_loss (already processed through `reduce_mean_valid`), - # if necessary. - if self.config["kl_coeff"] > 0.0: - total_loss += self.kl_coeff * mean_kl_loss - - # TODO (Kourosh) Where would tower_stats go? How should stats_fn be implemented - # here? - # Store values for stats function in model (tower), such that for - # multi-GPU, we do not override them during the parallel loss phase. - self.tower_stats[model]["total_loss"] = total_loss - self.tower_stats[model]["mean_policy_loss"] = reduce_mean_valid(-surrogate_loss) - self.tower_stats[model]["mean_vf_loss"] = mean_vf_loss - self.tower_stats[model]["unclipped_vf_loss"] = mean_vf_unclipped_loss - self.tower_stats[model]["vf_explained_var"] = explained_variance( - train_batch[Postprocessing.VALUE_TARGETS], value_fn_out - ) - self.tower_stats[model]["mean_entropy"] = mean_entropy - self.tower_stats[model]["mean_kl_loss"] = mean_kl_loss - - return total_loss - - # TODO: Make this an event-style subscription (e.g.: - # "after_gradients_computed"). 
- @override(TorchPolicyV2) - def extra_grad_process(self, local_optimizer, loss): - return apply_grad_clipping(self, local_optimizer, loss) - - @override(TorchPolicyV2) - def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - return convert_to_numpy( - { - "cur_kl_coeff": self.kl_coeff, - "cur_lr": self.cur_lr, - "total_loss": torch.mean( - torch.stack(self.get_tower_stats("total_loss")) - ), - "policy_loss": torch.mean( - torch.stack(self.get_tower_stats("mean_policy_loss")) - ), - "vf_loss": torch.mean( - torch.stack(self.get_tower_stats("mean_vf_loss")) - ), - "vf_explained_var": torch.mean( - torch.stack(self.get_tower_stats("vf_explained_var")) - ), - "kl": torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))), - "entropy": torch.mean( - torch.stack(self.get_tower_stats("mean_entropy")) - ), - "entropy_coeff": self.entropy_coeff, - "unclipped_vf_loss": torch.mean( - torch.stack(self.get_tower_stats("unclipped_vf_loss")) - ), - } - ) - - @override(TorchPolicyV2) - def postprocess_trajectory( - self, sample_batch, other_agent_batches=None, episode=None - ): - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak - # in torch (issue #6962). - # TODO: no_grad still necessary? 
- with torch.no_grad(): - return compute_gae_for_sample_batch( - self, sample_batch, other_agent_batches, episode - ) diff --git a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py index 0cef8d2a404a..33461f58e3ea 100644 --- a/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py +++ b/rllib/algorithms/ppo/torch/ppo_torch_rl_module.py @@ -1,13 +1,10 @@ from typing import Mapping, Any -from ray.rllib.algorithms.ppo.ppo_base_rl_module import PPORLModuleBase +from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule -from ray.rllib.core.models.base import ACTOR, CRITIC, ENCODER_OUT, STATE_IN +from ray.rllib.core.models.base import ACTOR, CRITIC, ENCODER_OUT, STATE_OUT from ray.rllib.core.rl_module.rl_module import RLModule from ray.rllib.core.rl_module.torch import TorchRLModule -from ray.rllib.core.models.specs.specs_dict import SpecDict -from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec -from ray.rllib.models.distributions import Distribution from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch @@ -16,71 +13,27 @@ torch, nn = try_import_torch() -def get_ppo_loss(fwd_in, fwd_out): - # TODO: we should replace these components later with real ppo components when - # RLOptimizer and RLModule are integrated together. 
- # this is not exactly a ppo loss, just something to show that the - # forward train works - adv = fwd_in[SampleBatch.REWARDS] - fwd_out[SampleBatch.VF_PREDS] - actor_loss = -(fwd_out[SampleBatch.ACTION_LOGP] * adv).mean() - critic_loss = (adv**2).mean() - loss = actor_loss + critic_loss - - return loss - - -class PPOTorchRLModule(PPORLModuleBase, TorchRLModule): +class PPOTorchRLModule(PPORLModule, TorchRLModule): framework: str = "torch" def __init__(self, *args, **kwargs): TorchRLModule.__init__(self, *args, **kwargs) - PPORLModuleBase.__init__(self, *args, **kwargs) - - @override(RLModule) - def input_specs_inference(self) -> SpecDict: - return self.input_specs_exploration() - - @override(RLModule) - def output_specs_inference(self) -> SpecDict: - return SpecDict({SampleBatch.ACTION_DIST: Distribution}) + PPORLModule.__init__(self, *args, **kwargs) @override(RLModule) def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]: output = {} - # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None - encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Actions action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - action_dist = self.action_dist_cls.from_logits(action_logits) - output[SampleBatch.ACTION_DIST] = action_dist.to_deterministic() + output[SampleBatch.ACTION_DIST_INPUTS] = action_logits return output - @override(RLModule) - def input_specs_exploration(self): - return [] - - @override(RLModule) - def output_specs_exploration(self) -> SpecDict: - return [ - SampleBatch.VF_PREDS, - SampleBatch.ACTION_DIST, - SampleBatch.ACTION_DIST_INPUTS, - ] - @override(RLModule) def _forward_exploration(self, batch: NestedDict) -> 
Mapping[str, Any]: """PPO forward pass during exploration. @@ -90,20 +43,10 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: """ output = {} - # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None - # Shared encoder encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Value head vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) @@ -111,65 +54,25 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: # Policy head action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - output[SampleBatch.ACTION_DIST_INPUTS] = action_logits - output[SampleBatch.ACTION_DIST] = self.action_dist_cls.from_logits( - logits=action_logits - ) - return output - @override(RLModule) - def input_specs_train(self) -> SpecDict: - specs = self.input_specs_exploration() - specs.append(SampleBatch.ACTIONS) - if SampleBatch.OBS in specs: - specs.append(SampleBatch.NEXT_OBS) - return specs + return output @override(RLModule) - def output_specs_train(self) -> SpecDict: - spec = SpecDict( - { - SampleBatch.ACTION_DIST: Distribution, - SampleBatch.ACTION_LOGP: TorchTensorSpec("b", dtype=torch.float32), - SampleBatch.VF_PREDS: TorchTensorSpec("b", dtype=torch.float32), - "entropy": TorchTensorSpec("b", dtype=torch.float32), - } - ) - return spec - def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]: output = {} - # TODO (Artur): Remove this once Policy supports RNN - if self.encoder.config.shared: - batch[STATE_IN] = None - else: - batch[STATE_IN] = { - ACTOR: None, - CRITIC: None, - } - batch[SampleBatch.SEQ_LENS] = None - # Shared encoder encoder_outs = self.encoder(batch) - # TODO (Artur): Un-uncomment once Policy 
supports RNN - # output[STATE_OUT] = encoder_outs[STATE_OUT] + if STATE_OUT in encoder_outs: + output[STATE_OUT] = encoder_outs[STATE_OUT] # Value head vf_out = self.vf(encoder_outs[ENCODER_OUT][CRITIC]) output[SampleBatch.VF_PREDS] = vf_out.squeeze(-1) # Policy head - pi_out = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) - action_logits = pi_out - action_dist = self.action_dist_cls.from_logits(logits=action_logits) - logp = action_dist.logp(batch[SampleBatch.ACTIONS]) - entropy = action_dist.entropy() - + action_logits = self.pi(encoder_outs[ENCODER_OUT][ACTOR]) output[SampleBatch.ACTION_DIST_INPUTS] = action_logits - output[SampleBatch.ACTION_DIST] = action_dist - output[SampleBatch.ACTION_LOGP] = logp - output["entropy"] = entropy return output diff --git a/rllib/algorithms/qmix/qmix.py b/rllib/algorithms/qmix/qmix.py index 5c00a6a4ac9d..66025a359ca1 100644 --- a/rllib/algorithms/qmix/qmix.py +++ b/rllib/algorithms/qmix/qmix.py @@ -77,7 +77,12 @@ def __init__(self): self.double_q = True self.optim_alpha = 0.99 self.optim_eps = 0.00001 - self.grad_clip = 10 + + self.grad_clip = 10.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" # QMix-torch overrides the TorchPolicy's learn_on_batch w/o specifying a # alternative `learn_on_loaded_batch` alternative for the GPU. diff --git a/rllib/algorithms/simple_q/simple_q.py b/rllib/algorithms/simple_q/simple_q.py index 2f9a8e60bd12..ff7fa6966b98 100644 --- a/rllib/algorithms/simple_q/simple_q.py +++ b/rllib/algorithms/simple_q/simple_q.py @@ -117,7 +117,13 @@ def __init__(self, algo_class=None): self.store_buffer_in_checkpoints = False self.lr_schedule = None self.adam_epsilon = 1e-8 - self.grad_clip = 40 + + self.grad_clip = 40.0 + # Note: Only when using _enable_learner_api=True can the clipping mode be + # configured by the user. 
On the old API stack, RLlib will always clip by + # global_norm, no matter the value of `grad_clip_by`. + self.grad_clip_by = "global_norm" + self.tau = 1.0 # __sphinx_doc_end__ # fmt: on diff --git a/rllib/algorithms/slateq/slateq_tf_policy.py b/rllib/algorithms/slateq/slateq_tf_policy.py index c6145bced515..a3079abe5746 100644 --- a/rllib/algorithms/slateq/slateq_tf_policy.py +++ b/rllib/algorithms/slateq/slateq_tf_policy.py @@ -207,15 +207,6 @@ def build_slateq_stats(policy: Policy, batch) -> Dict[str, TensorType]: "q_loss": policy._q_loss, "mean_actions": policy._mean_actions, } - # if hasattr(policy, "_mean_grads_0"): - # stats.update({"mean_grads_0": policy._mean_grads_0}) - # stats.update({"mean_grads_1": policy._mean_grads_1}) - # stats.update({"mean_grads_2": policy._mean_grads_2}) - # stats.update({"mean_grads_3": policy._mean_grads_3}) - # stats.update({"mean_grads_4": policy._mean_grads_4}) - # stats.update({"mean_grads_5": policy._mean_grads_5}) - # stats.update({"mean_grads_6": policy._mean_grads_6}) - # stats.update({"mean_grads_7": policy._mean_grads_7}) return stats diff --git a/rllib/algorithms/tests/test_algorithm_config.py b/rllib/algorithms/tests/test_algorithm_config.py index dbb873a4f641..7b6804ea92cc 100644 --- a/rllib/algorithms/tests/test_algorithm_config.py +++ b/rllib/algorithms/tests/test_algorithm_config.py @@ -4,7 +4,8 @@ import ray from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.algorithms.callbacks import make_multi_callbacks -from ray.rllib.algorithms.ppo import PPO, PPOConfig +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.algorithms.ppo import PPO from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.core.rl_module.marl_module import ( @@ -16,7 +17,7 @@ class TestAlgorithmConfig(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(num_cpus=6) + 
ray.init(num_cpus=6, local_mode=True) @classmethod def tearDownClass(cls): @@ -178,6 +179,7 @@ def test_rl_module_api(self): .framework("torch") .rollouts(enable_connectors=True) .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) config.validate() @@ -327,7 +329,11 @@ def get_default_rl_module_spec(self): ######################################## # This is the simplest case where we have to construct the marl module based on # the default specs only. - config = SingleAgentAlgoConfig().rl_module(_enable_rl_module_api=True) + config = ( + SingleAgentAlgoConfig() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec(config, DiscreteBCTorchModule) @@ -342,14 +348,18 @@ def get_default_rl_module_spec(self): ######################################## # This is the case where we pass in a multi-agent RLModuleSpec that asks the # algorithm to assign a specific type of RLModule class to certain module_ids. - config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - module_specs={ - "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - } - ), + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + module_specs={ + "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + }, + ), + ) + .training(_enable_learner_api=True) ) config.validate() @@ -359,9 +369,13 @@ def get_default_rl_module_spec(self): ######################################## # This is the case where we ask the algorithm to assign a specific type of # RLModule class to ALL module_ids. 
- config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=SingleAgentRLModuleSpec(module_class=CustomRLModule1), + ) + .training(_enable_learner_api=True) ) config.validate() @@ -376,11 +390,15 @@ def get_default_rl_module_spec(self): ######################################## # This is an alternative way to ask the algorithm to assign a specific type of # RLModule class to ALL module_ids. - config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) - ), + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + module_specs=SingleAgentRLModuleSpec(module_class=CustomRLModule1) + ), + ) + .training(_enable_learner_api=True) ) config.validate() @@ -397,15 +415,19 @@ def get_default_rl_module_spec(self): # This is not only assigning a specific type of RLModule class to EACH # module_id, but also defining a new custom MultiAgentRLModule class to be used # in the multi-agent scenario. 
- config = SingleAgentAlgoConfig().rl_module( - _enable_rl_module_api=True, - rl_module_spec=MultiAgentRLModuleSpec( - marl_module_class=CustomMARLModule1, - module_specs={ - "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), - }, - ), + config = ( + SingleAgentAlgoConfig() + .rl_module( + _enable_rl_module_api=True, + rl_module_spec=MultiAgentRLModuleSpec( + marl_module_class=CustomMARLModule1, + module_specs={ + "p1": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + "p2": SingleAgentRLModuleSpec(module_class=CustomRLModule1), + }, + ), + ) + .training(_enable_learner_api=True) ) config.validate() @@ -434,8 +456,10 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, but the MultiAgentRLModuleSpec has not defined its # SingleAgentRLmoduleSpecs. - config = MultiAgentAlgoConfigWithNoSingleAgentSpec().rl_module( - _enable_rl_module_api=True + config = ( + MultiAgentAlgoConfigWithNoSingleAgentSpec() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) ) self.assertRaisesRegex( @@ -448,7 +472,11 @@ def get_default_rl_module_spec(self): # This is the case where we ask the algorithm to use its default # MultiAgentRLModuleSpec, and the MultiAgentRLModuleSpec has defined its # SingleAgentRLmoduleSpecs. 
- config = MultiAgentAlgoConfig().rl_module(_enable_rl_module_api=True) + config = ( + MultiAgentAlgoConfig() + .rl_module(_enable_rl_module_api=True) + .training(_enable_learner_api=True) + ) config.validate() spec, expected = self._get_expected_marl_spec( diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py index e522b5b11439..bb6b2f9d4c26 100644 --- a/rllib/algorithms/tests/test_worker_failures.py +++ b/rllib/algorithms/tests/test_worker_failures.py @@ -5,7 +5,7 @@ import unittest import ray -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray.rllib.algorithms.algorithm_config import AlgorithmConfig from ray.rllib.algorithms.a3c import A3CConfig from ray.rllib.algorithms.apex_dqn import ApexDQNConfig diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 23ca2d2e3162..1e00fb6a6cff 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -47,6 +47,7 @@ OverrideToImplementCustomLogic, OverrideToImplementCustomLogic_CallToSuperRecommended, ) +from ray.rllib.utils.schedules.scheduler import Scheduler torch, _ = try_import_torch() tf1, tf, tfv = try_import_tf() @@ -66,9 +67,12 @@ VF_LOSS_KEY = "vf_loss" ENTROPY_KEY = "entropy" +# Additional update keys +LEARNER_RESULTS_CURR_LR_KEY = "curr_lr" + @dataclass -class FrameworkHPs: +class FrameworkHyperparameters: """The framework specific hyper-parameters. Args: @@ -83,18 +87,25 @@ class FrameworkHPs: @dataclass -class LearnerHPs: - """The hyper-parameters for Learner. +class LearnerHyperparameters: + """Hyperparameters for a Learner, derived from a subset of AlgorithmConfig values. - When creating a new Learner, the new hyper-parameters have to be defined by - subclassing this class and adding the new hyper-parameters as fields. 
+ Instances of this class should only be created via calling + `get_learner_hyperparameters()` on a frozen AlgorithmConfig object and should always + considered read-only. - # TODO (Kourosh, Avnish): The things that could be part of the base class: - - a function, `validate` that runs some validation on the hyper-parameters. + When creating a new Learner, you should also define a new sub-class of this class + and make sure the respective AlgorithmConfig sub-class has a proper implementation + of the `get_learner_hyperparameters` method. + Validation of the values of these hyperparameters should be done by the + respective AlgorithmConfig class. """ - pass + # TODO (Sven): Move lr from - currently - optimizer config to only exist here. + # lr: float = None + + lr_schedule: Optional[List[List[Union[int, float]]]] = None class Learner: @@ -111,7 +122,6 @@ class Learner: the TF or Torch specific sub-classes to implement their algorithm-specific update logic. - Args: module_spec: The module specification for the RLModule that is being trained. If the module is a single agent module, after building the module it will @@ -130,11 +140,12 @@ class Learner: Algorithm specific learner hyper-parameters will passed in via this argument. For example in PPO the `vf_loss_coeff` hyper-parameter will be passed in via this argument. Refer to - ray.rllib.core.learner.learner.LearnerHPs for more info. + ray.rllib.core.learner.learner.LearnerHyperparameters for more info. framework_hps: The framework specific hyper-parameters. This will be used to pass in any framework specific hyper-parameter that will impact the module creation. For example eager_tracing in TF or compile in Torch. - Refer to ray.rllib.core.learner.learner.FrameworkHPs for more info. + Refer to ray.rllib.core.learner.learner.FrameworkHyperparameters for + more info. 
Usage pattern: @@ -199,7 +210,7 @@ class MyLearner(TorchLearner): def compute_loss(self, fwd_out, batch): # compute the loss based on batch and output of the forward pass - # to access the learner hyper-parameters use `self.hps` + # to access the learner hyper-parameters use `self._hps` return {self.TOTAL_LOSS_KEY: loss} """ @@ -215,9 +226,9 @@ def __init__( ] = None, module: Optional[RLModule] = None, optimizer_config: Mapping[str, Any] = None, - learner_scaling_config: LearnerGroupScalingConfig = LearnerGroupScalingConfig(), - learner_hyperparameters: Optional[LearnerHPs] = LearnerHPs(), - framework_hyperparameters: Optional[FrameworkHPs] = FrameworkHPs(), + learner_group_scaling_config: Optional[LearnerGroupScalingConfig] = None, + learner_hyperparameters: Optional[LearnerHyperparameters] = None, + framework_hyperparameters: Optional[FrameworkHyperparameters] = None, ): # TODO (Kourosh): convert optimizer configs to dataclasses if module_spec is not None and module is not None: @@ -233,13 +244,21 @@ def __init__( self._module_spec = module_spec self._module_obj = module self._optimizer_config = optimizer_config - self._hps = learner_hyperparameters + self._hps = learner_hyperparameters or LearnerHyperparameters() + self._device = None # pick the configs that we need for the learner from scaling config - self._distributed = learner_scaling_config.num_workers > 1 - self._use_gpu = learner_scaling_config.num_gpus_per_worker > 0 + self._learner_group_scaling_config = ( + learner_group_scaling_config or LearnerGroupScalingConfig() + ) + self._distributed = self._learner_group_scaling_config.num_workers > 1 + self._use_gpu = self._learner_group_scaling_config.num_gpus_per_worker > 0 # if we are using gpu but we are not distributed, use this gpu for training - self._local_gpu_idx = learner_scaling_config.local_gpu_idx + self._local_gpu_idx = self._learner_group_scaling_config.local_gpu_idx + + self._framework_hyperparameters = ( + framework_hyperparameters or 
FrameworkHyperparameters() + ) # whether self.build has already been called self._is_built = False @@ -263,7 +282,7 @@ def module(self) -> MultiAgentRLModule: return self._module @property - def hps(self) -> LearnerHPs: + def hps(self) -> LearnerHyperparameters: """The hyper-parameters for the learner.""" return self._hps @@ -522,14 +541,14 @@ def compile_results( # We put the stats for all modules under the ALL_MODULES key. e.g. average of # the gradients across all modules will go here. - mean_grads = [ - np.mean(grad) + mean_abs_grads = [ + np.mean(np.abs(grad)) for grad in convert_to_numpy(postprocessed_gradients.values()) if grad is not None ] module_learner_stats[ALL_MODULES] = { - "mean_gradient": np.mean(mean_grads), + "mean_abs_postprocessed_gradients": np.mean(mean_abs_grads), self.TOTAL_LOSS_KEY: loss_numpy[self.TOTAL_LOSS_KEY], } @@ -603,6 +622,17 @@ def build(self) -> None: logger.debug("Learner already built. Skipping build.") return self._is_built = True + + # Build learning rate scheduling tools. + # TODO (sven): Move lr from optimizer config to Learner HPs? + # We might not need optimizer config. + self.lr_scheduler = Scheduler( + fixed_value=self._optimizer_config["lr"], + schedule=self.hps.lr_schedule, + framework=self.framework, + device=self._device, + ) + self._module = self._make_module() for param_seq, optimizer in self.configure_optimizers(): self._optimizer_parameters[optimizer] = [] @@ -735,10 +765,10 @@ def additional_update_per_module(self, module_id: ModuleID, tau: float): return results_all_modules - @OverrideToImplementCustomLogic + @OverrideToImplementCustomLogic_CallToSuperRecommended def additional_update_per_module( self, module_id: ModuleID, **kwargs - ) -> Mapping[str, Any]: + ) -> Dict[str, Any]: """Apply additional non-gradient based updates for a single module. See `additional_update` for more details. 
@@ -750,23 +780,25 @@ def additional_update_per_module( Returns: A dictionary of results from the update """ - raise NotImplementedError + return {} @OverrideToImplementCustomLogic def postprocess_gradients( - self, gradients_dict: Mapping[str, Any] + self, + gradients_dict: Mapping[str, Any], ) -> Mapping[str, Any]: - """Applies potential postprocessings to the gradients. + """Applies potential postprocessing operations on the gradients. - In some algorithms, we may want to perform some postprocessing on the - gradients before they are applied. This method is called after gradients - have been computed, and modifies them before they are applied. + This method is called after gradients have been computed, and modifies them + before they are applied to the respective module(s). + This includes grad clipping by value, norm, or global-norm, or other + algorithm specific gradient postprocessing steps. Args: gradients_dict: A dictionary of gradients. Returns: - A dictionary of updated gradients. + A dictionary with the updated gradients. """ return gradients_dict @@ -776,7 +808,9 @@ def update( *, minibatch_size: Optional[int] = None, num_iters: int = 1, - reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results, + reduce_fn: Callable[[List[Mapping[str, Any]]], ResultDict] = ( + _reduce_mean_results + ), ) -> Mapping[str, Any]: """Do `num_iters` minibatch updates given the original batch. @@ -845,14 +879,29 @@ def set_state(self, state: Mapping[str, Any]) -> None: Args: state: The state of the optimizer and module. Can be obtained - from `get_state`. + from `get_state`. State is a dictionary with two keys: + "module_state" and "optimizer_state". The value of each key + is a dictionary that can be passed to `set_weights` and + `set_optimizer_weights` respectively. """ # TODO (Kourosh): We have both get(set)_state and get(set)_weights. I think # having both can become confusing. Can we simplify this API requirement? 
self._check_is_built() # TODO: once we figure out the optimizer format, we can set/get the state - self._module.set_state(state.get("module_state", {})) + if "module_state" not in state: + raise ValueError( + "state must have a key 'module_state' for the module weights" + ) + if "optimizer_state" not in state: + raise ValueError( + "state must have a key 'optimizer_state' for the optimizer weights" + ) + + module_state = state.get("module_state") + optimizer_state = state.get("optimizer_state") + self.set_weights(module_state) + self.set_optimizer_weights(optimizer_state) def get_state(self) -> Mapping[str, Any]: """Get the state of the learner. @@ -863,7 +912,29 @@ def get_state(self) -> Mapping[str, Any]: """ self._check_is_built() # TODO: once we figure out the optimizer format, we can set/get the state - return {"module_state": self._module.get_state()} + return { + "module_state": self.get_weights(), + "optimizer_state": self.get_optimizer_weights(), + } + # return {"module_state": self.get_weights(), "optimizer_state": {}} + + def set_optimizer_weights(self, weights: Mapping[str, Any]) -> None: + """Set the weights of the optimizer. + + Args: + weights: The weights of the optimizer. + + """ + raise NotImplementedError + + def get_optimizer_weights(self) -> Mapping[str, Any]: + """Get the weights of the optimizer. + + Returns: + The weights of the optimizer. + + """ + raise NotImplementedError def _get_metadata(self) -> Dict[str, Any]: metadata = { @@ -902,6 +973,16 @@ def save_state(self, path: Union[str, pathlib.Path]) -> None: NOTE: if path doesn't exist, then a new directory will be created. otherwise, it will be appended to. + the state of the learner is saved in the following format: + + checkpoint_dir/ + learner_state.json + module_state/ + module_1/ + ... + optimizer_state/ + optimizers_module_1/ + ... Args: path: The path to the directory to save the state to. 
@@ -957,17 +1038,17 @@ def _make_module(self) -> MultiAgentRLModule: This method uses `self._module_specs` or `self._module_obj` to construct the module. If the module_class is a single agent RL module it will be wrapped to a - multi-agent RL module. Override this method if there are other things than - needs to happen for instantiation of the module. - + multi-agent RL module. Override this method if there are other things that + need to happen for instantiation of the module. Returns: - The constructed module. + A constructed MultiAgentRLModule. """ if self._module_obj is not None: module = self._module_obj else: module = self._module_spec.build() + # If not already, convert to MultiAgentRLModule. module = module.as_multi_agent() return module @@ -975,11 +1056,11 @@ def _check_result(self, result: Mapping[str, Any]) -> None: """Checks whether the result has the correct format. All the keys should be referencing the module ids that got updated. There is a - special key `__all__` that hold any extra information that is not specific to a - module. + special key `ALL_MODULES` that hold any extra information that is not specific + to a module. Args: - results: The result of the update. + result: The result of the update. Raises: ValueError: If the result are not in the correct format. @@ -1000,7 +1081,7 @@ def _check_result(self, result: Mapping[str, Any]) -> None: if key not in self.module.keys(): raise ValueError( f"The key {key} in the result of the update is not a valid " - f"module id. Valid module ids are: {self.module.keys()}" + f"module id. Valid module ids are: {list(self.module.keys())}." ) @OverrideToImplementCustomLogic_CallToSuperRecommended @@ -1010,7 +1091,7 @@ def _update( ) -> Mapping[str, Any]: """Performs a single update given a batch of data.""" # TODO (Kourosh): remove the MultiAgentBatch from the type, it should be - # NestedDict from the base class. + # NestedDict from the base class. 
tensorbatch = self._convert_batch_type(batch) fwd_out = self._module.forward_train(tensorbatch) loss = self.compute_loss(fwd_out=fwd_out, batch=tensorbatch) @@ -1018,9 +1099,9 @@ def _update( gradients = self.compute_gradients(loss) postprocessed_gradients = self.postprocess_gradients(gradients) self.apply_gradients(postprocessed_gradients) - result = self.compile_results(batch, fwd_out, loss, postprocessed_gradients) - self._check_result(result) - return convert_to_numpy(result) + results = self.compile_results(batch, fwd_out, loss, postprocessed_gradients) + self._check_result(results) + return convert_to_numpy(results) def _check_is_built(self): if self._module is None: @@ -1039,6 +1120,26 @@ def _reset(self): def apply(self, func, *_args, **_kwargs): return func(self, *_args, **_kwargs) + @abc.abstractmethod + def _get_tensor_variable( + self, + value: Any, + dtype: Any = None, + trainable: bool = False, + ) -> TensorType: + """Returns a framework-specific tensor variable with the initial given value. + + This is a framework specific method that should be implemented by the + framework specific sub-class. + + Args: + value: The initial value for the tensor variable variable. + + Returns: + The framework specific tensor variable of the given initial value, + dtype and trainable/requires_grad property. + """ + @dataclass class LearnerSpec: @@ -1052,27 +1153,31 @@ class LearnerSpec: backend_config: The backend config for properly distributing the RLModule. optimizer_config: The optimizer setting to apply during training. learner_hyperparameters: The extra config for the loss/additional update. This - should be a subclass of LearnerHPs. This is useful for passing in - algorithm configs that contains the hyper-parameters for loss computation, - change of training behaviors, etc. e.g lr, entropy_coeff. + should be a subclass of LearnerHyperparameters. 
This is useful for passing + in algorithm configs that contains the hyper-parameters for loss + computation, change of training behaviors, etc. e.g lr, entropy_coeff. """ learner_class: Type["Learner"] module_spec: Union["SingleAgentRLModuleSpec", "MultiAgentRLModuleSpec"] = None module: Optional["RLModule"] = None - learner_scaling_config: LearnerGroupScalingConfig = field( + learner_group_scaling_config: LearnerGroupScalingConfig = field( default_factory=LearnerGroupScalingConfig ) optimizer_config: Dict[str, Any] = field(default_factory=dict) - learner_hyperparameters: LearnerHPs = field(default_factory=LearnerHPs) - framework_hyperparameters: FrameworkHPs = field(default_factory=FrameworkHPs) + learner_hyperparameters: LearnerHyperparameters = field( + default_factory=LearnerHyperparameters + ) + framework_hyperparameters: FrameworkHyperparameters = field( + default_factory=FrameworkHyperparameters + ) def get_params_dict(self) -> Dict[str, Any]: """Returns the parameters than be passed to the Learner constructor.""" return { "module": self.module, "module_spec": self.module_spec, - "learner_scaling_config": self.learner_scaling_config, + "learner_group_scaling_config": self.learner_group_scaling_config, "optimizer_config": self.optimizer_config, "learner_hyperparameters": self.learner_hyperparameters, "framework_hyperparameters": self.framework_hyperparameters, diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index c53ee9b78dd7..ee2213351787 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -1,10 +1,19 @@ from collections import deque +from functools import partial import pathlib -import socket -from typing import Any, List, Mapping, Type, Optional, Callable, Set, TYPE_CHECKING +from typing import ( + Any, + Callable, + List, + Mapping, + Optional, + Set, + Type, + TYPE_CHECKING, + Union, +) import ray - from ray.rllib.core.learner.reduce_result_dict_fn import 
_reduce_mean_results from ray.rllib.core.rl_module.rl_module import ( ModuleID, @@ -76,22 +85,25 @@ def __init__( learner_spec: LearnerSpec, max_queue_len: int = 20, ): - scaling_config = learner_spec.learner_scaling_config + scaling_config = learner_spec.learner_group_scaling_config learner_class = learner_spec.learner_class # TODO (Kourosh): Go with a _remote flag instead of _is_local to be more - # explicit + # explicit. self._is_local = scaling_config.num_workers == 0 self._learner = None self._workers = None - # if a user calls self.shutdown() on their own then this flag is set to true. + # If a user calls self.shutdown() on their own then this flag is set to true. # When del is called the backend executor isn't shutdown twice if this flag is # true. the backend executor would otherwise log a warning to the console from - # ray train + # ray train. self._is_shut_down = False self._is_module_trainable = _is_module_trainable + # How many timesteps had to be dropped due to a full input queue? + self._in_queue_ts_dropped = 0 + if self._is_local: self._learner = learner_class(**learner_spec.get_params_dict()) self._learner.build() @@ -114,24 +126,25 @@ def __init__( self._workers = [w.actor for w in backend_executor.worker_group.workers] - # run the neural network building code on remote workers + # Run the neural network building code on remote workers. ray.get([w.build.remote() for w in self._workers]) - # use only 1 max in flight request per worker since training workers have to - # be synchronously executed. + self._worker_manager = FaultTolerantActorManager( self._workers, - max_remote_requests_in_flight_per_actor=1, + # TODO (sven): This probably works even without any restriction + # (allowing for any arbitrary number of requests in-flight). Test with + # 3 first, then with unlimited, and if both show the same behavior on + # an async algo, remove this restriction entirely. 
+ max_remote_requests_in_flight_per_actor=3, ) self._in_queue = deque(maxlen=max_queue_len) - @property - def in_queue_size(self) -> int: - """Returns the number of batches currently in the in queue to be processed. - - If the queue is reaching its max size, then this learner group likely needs - more workers to process incoming batches. - """ - return len(self._in_queue) + def get_in_queue_stats(self) -> Mapping[str, Any]: + """Returns the current stats for the input queue for this learner group.""" + return { + "learner_group_queue_size": len(self._in_queue), + "learner_group_queue_ts_dropped": self._in_queue_ts_dropped, + } @property def is_local(self) -> bool: @@ -143,23 +156,25 @@ def update( *, minibatch_size: Optional[int] = None, num_iters: int = 1, - reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results, + reduce_fn: Optional[Callable[[List[Mapping[str, Any]]], ResultDict]] = ( + _reduce_mean_results + ), block: bool = True, - ) -> List[Mapping[str, Any]]: - """Do one gradient based update to the Learner(s). + ) -> Union[Mapping[str, Any], List[Mapping[str, Any]]]: + """Do one or more gradient based updates to the Learner(s) based on given data. Args: - batch: The data to use for the update. + batch: The data batch to use for the update. minibatch_size: The minibatch size to use for the update. num_iters: The number of complete passes over all the sub-batches in the input multi-agent batch. - reduce_fn: A function to reduce the results from a list of Learner Actors - into a single result. This can be any arbitrary function that takes a - list of dictionaries and returns a single dictionary. For example you - can either take an average (default) or concatenate the results (for - example for metrics) or be more selective about you want to report back - to the algorithm's training_step. If None is passed, the results will - not get reduced. 
+ reduce_fn: An optional callable to reduce the results from a list of the + Learner actors into a single result. This can be any arbitrary function + that takes a list of dictionaries and returns a single dictionary. For + example you can either take an average (default) or concatenate the + results (for example for metrics) or be more selective about you want to + report back to the algorithm's training_step. If None is passed, the + results will not get reduced. block: Whether to block until the update is complete. Returns: @@ -196,9 +211,15 @@ def update( block=block, ) - # TODO (Kourosh): Maybe we should use LearnerInfoBuilder() here? - if reduce_fn is None or not results: + # No reduce function -> Return results as is: (possibly empty) list of mappings. + if reduce_fn is None: return results + # If results are empty, don't run them through reduce_fn, but return empty dict. + elif not results: + return {} + # Run results (list of result dicts from our n learner actors) through + # reduction function and return single mapping. + # TODO (Kourosh): Maybe we should use LearnerInfoBuilder() here? return reduce_fn(results) def _distributed_update( @@ -207,7 +228,9 @@ def _distributed_update( *, minibatch_size: Optional[int] = None, num_iters: int = 1, - reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results, + reduce_fn: Callable[[List[Mapping[str, Any]]], ResultDict] = ( + _reduce_mean_results + ), block: bool = True, ) -> List[Mapping[str, Any]]: """Do a gradient based update to the Learners using DDP training. @@ -221,43 +244,77 @@ def _distributed_update( See `.update()` docstring. Returns: - A list of dictionaries of results from the updates from the Learner(s) + A list of dictionaries of results from the updates from the individual + Learner(s) """ + # Make sure minibatch size is reduced to the correct number of shards as well + # (just like we split each batch into the number of learner workers). 
+ if minibatch_size is not None: + minibatch_size //= len(self._workers) + + def _learner_update(learner, minibatch): + return learner.update( + minibatch, + minibatch_size=minibatch_size, + num_iters=num_iters, + reduce_fn=reduce_fn, + ) if block: - results = self._worker_manager.foreach_actor( - [ - lambda w: w.update( - b, - minibatch_size=minibatch_size, - num_iters=num_iters, - reduce_fn=reduce_fn, - ) - for b in ShardBatchIterator(batch, len(self._workers)) - ] - ) - else: - if batch is not None: - self._in_queue.append(batch) - results = self._worker_manager.fetch_ready_async_reqs() - if self._worker_manager_ready() and self._in_queue: - batch = self._in_queue.popleft() - self._worker_manager.foreach_actor_async( + results = self._get_results( + self._worker_manager.foreach_actor( [ - lambda w: w.update( - b, - minibatch_size=minibatch_size, - num_iters=num_iters, - reduce_fn=reduce_fn, - ) - for b in ShardBatchIterator(batch, len(self._workers)) + partial(_learner_update, minibatch=minibatch) + for minibatch in ShardBatchIterator(batch, len(self._workers)) ] ) + ) + else: + # Queue the new batches. + # If queue is full, kick out the oldest item (and thus add its + # length to the "dropped ts" counter). + if len(self._in_queue) == self._in_queue.maxlen: + self._in_queue_ts_dropped += len(self._in_queue[0]) + + self._in_queue.append(batch) - return self._get_results(results) + # Retrieve all ready results (kicked off by prior calls to this method). + results = self._worker_manager.fetch_ready_async_reqs() + # Only if there are no more requests in-flight on any of the learners, + # we can send in one new batch for sharding and parallel learning. + if self._worker_manager_ready(): + count = 0 + # TODO (sven): This probably works even without any restriction + # (allowing for any arbitrary number of requests in-flight). Test with + # 3 first, then with unlimited, and if both show the same behavior on + # an async algo, remove this restriction entirely. 
+ while len(self._in_queue) > 0 and count < 3: + # Pull a single batch from the queue (from the left side, meaning: + # use the oldest one first). + batch = self._in_queue.popleft() + self._worker_manager.foreach_actor_async( + [ + partial(_learner_update, minibatch=minibatch) + for minibatch in ShardBatchIterator( + batch, len(self._workers) + ) + ] + ) + count += 1 + + results = self._get_results(results) + + return results def _worker_manager_ready(self): - return self._worker_manager.num_outstanding_async_reqs() == 0 + # TODO (sven): This probably works even without any restriction (allowing for + # any arbitrary number of requests in-flight). Test with 3 first, then with + # unlimited, and if both show the same behavior on an async algo, remove + # this method entirely. + return ( + self._worker_manager.num_outstanding_async_reqs() + <= self._worker_manager.num_actors() * 2 + ) def _get_results(self, results): processed_results = [] @@ -272,9 +329,9 @@ def _get_results(self, results): def additional_update( self, *, - reduce_fn: Optional[Callable[[ResultDict], ResultDict]] = _reduce_mean_results, + reduce_fn: Callable[[ResultDict], ResultDict] = _reduce_mean_results, **kwargs, - ) -> List[Mapping[str, Any]]: + ) -> Union[Mapping[str, Any], List[Mapping[str, Any]]]: """Apply additional non-gradient based updates to the Learners. 
For example, this could be used to do a polyak averaging update @@ -291,10 +348,10 @@ def additional_update( """ if self.is_local: - results = [self._learner.additional_update(**kwargs)] + return self._learner.additional_update(**kwargs) else: results = self._worker_manager.foreach_actor( - [lambda w: w.additional_update(**kwargs) for worker in self._workers] + [lambda w: w.additional_update(**kwargs) for _ in self._workers] ) results = self._get_results(results) if reduce_fn is None: @@ -475,20 +532,19 @@ def load_state(self, path: str) -> None: if not path.exists(): raise ValueError(f"Path {path} does not exist.") path = str(path.absolute()) - assert len(self._workers) == self._worker_manager.num_healthy_actors() if self.is_local: self._learner.load_state(path) else: - head_node_ip = socket.gethostbyname(socket.gethostname()) + assert len(self._workers) == self._worker_manager.num_healthy_actors() + head_node_ip = ray.util.get_node_ip_address() workers = self._worker_manager.healthy_actor_ids() def _load_state(w): # doing imports here since they might not be imported on the worker - import socket + import ray import tempfile - hostname = socket.gethostname() - worker_node_ip = socket.gethostbyname(hostname) + worker_node_ip = ray.util.get_node_ip_address() # if the worker is on the same node as the head, load the checkpoint # directly from the path otherwise sync the checkpoint from the head # to the worker and load it from there @@ -530,11 +586,9 @@ def _get_ip_address(_=None) -> str: The address of this process. 
""" - import socket - - hostname = socket.gethostname() + import ray - return socket.gethostbyname(hostname) + return ray.util.get_node_ip_address() def shutdown(self): """Shuts down the LearnerGroup.""" diff --git a/rllib/core/learner/learner_group_config.py b/rllib/core/learner/learner_group_config.py index bf6454886a3d..322c8d062568 100644 --- a/rllib/core/learner/learner_group_config.py +++ b/rllib/core/learner/learner_group_config.py @@ -6,8 +6,8 @@ from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig from ray.rllib.core.learner.learner import ( LearnerSpec, - LearnerHPs, - FrameworkHPs, + LearnerHyperparameters, + FrameworkHyperparameters, ) from ray.rllib.utils.from_config import NotProvided @@ -18,8 +18,15 @@ ModuleSpec = Union[SingleAgentRLModuleSpec, MultiAgentRLModuleSpec] -# TODO (Kourosh): We should make all configs come from a standard base class that -# defines the general interfaces for validation, from_dict, to_dict etc. +# TODO (Kourosh, Sven): We should make all configs come from a standard base class that +# defines the general interfaces for validation, from_dict, to_dict etc. +# Also, all these classes should abide by the following design patterns: +# - Define all default values for properties in the constructor. +# - No properties are magically set under the hood, w/o the user calling one of its +# setter methods (e.g. `.training()`). `validate()` is not one of these setter methods +# and thus should never set any properties, only validate and warn/error. +# - Any sub-configurations should be generated by calling a `.get_xyz_config()` method +# and thus be compiled on-the-fly to avoid duplicate information. 
class LearnerGroupConfig: """Configuration object for LearnerGroup.""" @@ -33,8 +40,9 @@ def __init__(self, cls: Type[LearnerGroup] = None) -> None: # `self.learner()` self.learner_class = None - self.optimizer_config = None - self.learner_hps = LearnerHPs() + # TODO (Kourosh): Change the optimizer config to a dataclass object. + self.optimizer_config = {"lr": 3e-4} + self.learner_hyperparameters = LearnerHyperparameters() # `self.resources()` self.num_gpus_per_learner_worker = 0 @@ -65,11 +73,6 @@ def validate(self) -> None: "the Learner class with .learner(learner_class=MyTrainerClass)." ) - if self.optimizer_config is None: - # get the default optimizer config if it's not provided - # TODO (Kourosh): Change the optimizer config to a dataclass object. - self.optimizer_config = {"lr": 1e-3} - def build(self) -> LearnerGroup: self.validate() @@ -80,14 +83,14 @@ def build(self) -> LearnerGroup: local_gpu_idx=self.local_gpu_idx, ) - framework_hps = FrameworkHPs(eager_tracing=self.eager_tracing) + framework_hps = FrameworkHyperparameters(eager_tracing=self.eager_tracing) learner_spec = LearnerSpec( learner_class=self.learner_class, module_spec=self.module_spec, optimizer_config=self.optimizer_config, - learner_scaling_config=scaling_config, - learner_hyperparameters=self.learner_hps, + learner_group_scaling_config=scaling_config, + learner_hyperparameters=self.learner_hyperparameters, framework_hyperparameters=framework_hps, ) @@ -113,8 +116,9 @@ def module( def resources( self, + *, num_learner_workers: Optional[int] = NotProvided, - num_gpus_per_learner_worker: Optional[Union[float, int]] = NotProvided, + num_gpus_per_learner_worker: Optional[int] = NotProvided, num_cpus_per_learner_worker: Optional[Union[float, int]] = NotProvided, local_gpu_idx: Optional[int] = NotProvided, ) -> "LearnerGroupConfig": @@ -135,14 +139,14 @@ def learner( *, learner_class: Optional[Type["Learner"]] = NotProvided, optimizer_config: Optional[Dict] = NotProvided, - learner_hps: 
Optional[LearnerHPs] = NotProvided, + learner_hyperparameters: Optional[LearnerHyperparameters] = NotProvided, ) -> "LearnerGroupConfig": if learner_class is not NotProvided: self.learner_class = learner_class if optimizer_config is not NotProvided: - self.optimizer_config = optimizer_config - if learner_hps is not NotProvided: - self.learner_hps = learner_hps + self.optimizer_config.update(optimizer_config) + if learner_hyperparameters is not NotProvided: + self.learner_hyperparameters = learner_hyperparameters return self diff --git a/rllib/core/learner/scaling_config.py b/rllib/core/learner/scaling_config.py index 8b02494a5efb..2fc16b0efa94 100644 --- a/rllib/core/learner/scaling_config.py +++ b/rllib/core/learner/scaling_config.py @@ -13,7 +13,9 @@ class LearnerGroupScalingConfig: training will run on a single CPU. num_gpus_per_worker: The number of GPUs to allocate per worker. If num_workers=0, any number greater than 0 will run the training on a single - GPU. A value of zero will run the training on a single CPU. + GPU. A value of zero will run the training on `num_cpus_per_worker` CPUs. + Fractional values (e.g. 0.5) are currently NOT supported as these might + cause CUDA async errors. local_gpu_idx: if num_gpus_per_worker > 0, and num_workers<2, then this gpu index will be used for training. This is an index into the available cuda devices. 
For example if os.environ["CUDA_VISIBLE_DEVICES"] = "1" then a diff --git a/rllib/core/learner/tests/test_learner.py b/rllib/core/learner/tests/test_learner.py index 4cc4d8f895f2..2c7520f6fac7 100644 --- a/rllib/core/learner/tests/test_learner.py +++ b/rllib/core/learner/tests/test_learner.py @@ -5,32 +5,34 @@ import unittest import ray - +from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.core.learner.learner import Learner, FrameworkHyperparameters +from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.learner.learner import Learner, FrameworkHPs from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule from ray.rllib.core.testing.tf.bc_learner import BCTfLearner from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID -from ray.rllib.utils.test_utils import check, get_cartpole_dataset_reader +from ray.rllib.utils.test_utils import ( + check, + framework_iterator, + get_cartpole_dataset_reader, +) from ray.rllib.utils.metrics import ALL_MODULES -from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig -def get_learner(learning_rate=1e-3) -> Learner: - env = gym.make("CartPole-v1") - +def get_learner(obs_space, action_space, learning_rate=1e-3) -> Learner: learner = BCTfLearner( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, + observation_space=obs_space, + action_space=action_space, model_config_dict={"fcnet_hiddens": [32]}, ), # made this a configurable hparam to avoid information leakage in tests where we # need to know what the learning rate is. 
optimizer_config={"lr": learning_rate}, - learner_scaling_config=LearnerGroupScalingConfig(), - framework_hyperparameters=FrameworkHPs(eager_tracing=True), + learner_group_scaling_config=LearnerGroupScalingConfig(), + framework_hyperparameters=FrameworkHyperparameters(eager_tracing=True), ) learner.build() @@ -39,6 +41,9 @@ def get_learner(learning_rate=1e-3) -> Learner: class TestLearner(unittest.TestCase): + + ENV = gym.make("CartPole-v1") + @classmethod def setUp(cls) -> None: ray.init() @@ -49,7 +54,7 @@ def tearDown(cls) -> None: def test_end_to_end_update(self): - learner = get_learner() + learner = get_learner(self.ENV.observation_space, self.ENV.action_space) reader = get_cartpole_dataset_reader(batch_size=512) min_loss = float("inf") @@ -72,11 +77,25 @@ def test_compute_gradients(self): Tests that if we sum all the trainable variables the gradient of output w.r.t. the weights is all ones. """ - learner = get_learner() + learner = BCTfLearner( + module_spec=SingleAgentRLModuleSpec( + module_class=DiscreteBCTFModule, + observation_space=self.ENV.observation_space, + action_space=self.ENV.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ), + # made this a configurable hparam to avoid information leakage in tests + # where we need to know what the learning rate is. 
+ optimizer_config={"lr": 1e-3}, + learner_group_scaling_config=LearnerGroupScalingConfig(), + framework_hyperparameters=FrameworkHyperparameters(eager_tracing=True), + ) + + learner.build() with tf.GradientTape() as tape: params = learner.module[DEFAULT_POLICY_ID].trainable_variables - loss = {"total_loss": sum([tf.reduce_sum(param) for param in params])} + loss = {"total_loss": sum(tf.reduce_sum(param) for param in params)} gradients = learner.compute_gradients(loss, tape) # type should be a mapping from ParamRefs to gradients @@ -85,6 +104,100 @@ def test_compute_gradients(self): for grad in gradients.values(): check(grad, np.ones(grad.shape)) + def test_postprocess_gradients(self): + """Tests the postprocess_gradients correctness.""" + config = ( + APPOConfig() + .environment("CartPole-v1") + .framework(eager_tracing=True) + .rollouts(rollout_fragment_length=50) + ) + + # TODO (sven): Enable torch once available for APPO. + for fw in framework_iterator(config, frameworks=("tf2")): + # Clip by value only. + config.training( + grad_clip=0.75, + grad_clip_by="value", + ) + # TODO (sven): remove this once validation does NOT cause HPs to be + # generated anymore. + config.validate() + config.freeze() + module_spec = config.get_default_rl_module_spec() + module_spec.model_config_dict = {"fcnet_hiddens": [10]} + module_spec.observation_space = self.ENV.observation_space + module_spec.action_space = self.ENV.action_space + learner_group = ( + config.get_learner_group_config(module_spec=module_spec) + .learner(learner_class=config.get_default_learner_class()) + .build() + ) + learner = learner_group._learner + # Pretend our computed gradients are our weights + 1.0. + grads = { + v.ref(): v + 1.0 + for v in learner.module[DEFAULT_POLICY_ID].trainable_variables + } + # Call the learner's postprocessing method. + processed_grads = list(learner.postprocess_gradients(grads).values()) + # Check clipped gradients. 
+ # No single gradient must be larger than 0.1 or smaller than -0.1: + self.assertTrue( + all( + np.max(grad) <= 0.75 and np.min(grad) >= -0.75 + for grad in processed_grads + ) + ) + + # Clip by norm. + config = config.copy(copy_frozen=False).training( + grad_clip=1.0, + grad_clip_by="norm", + ) + # TODO (sven): remove this once validation does NOT cause HPs to be + # generated anymore. + config.validate() + config.freeze() + learner_group = ( + config.get_learner_group_config(module_spec=module_spec) + .learner(learner_class=config.get_default_learner_class()) + .build() + ) + learner = learner_group._learner + # Call the learner's postprocessing method. + processed_grads = list(learner.postprocess_gradients(grads).values()) + # Check clipped gradients. + for proc_grad, grad in zip(processed_grads, grads.values()): + l2_norm = np.sqrt(np.sum(grad**2.0)) + if l2_norm > 1.0: + check(proc_grad, grad * (1.0 / l2_norm)) + + # Clip by global norm. + config = config.copy(copy_frozen=False).training( + grad_clip=5.0, + grad_clip_by="global_norm", + ) + # TODO: remove this once validation does NOT cause HPs to be generated + # anymore + config.validate() + config.freeze() + learner_group = ( + config.get_learner_group_config(module_spec=module_spec) + .learner(learner_class=config.get_default_learner_class()) + .build() + ) + learner = learner_group._learner + # Call the learner's postprocessing method. + processed_grads = list(learner.postprocess_gradients(grads).values()) + # Check clipped gradients. + global_norm = np.sqrt( + np.sum(np.sum(grad**2.0) for grad in grads.values()) + ) + if global_norm > 5.0: + for proc_grad, grad in zip(processed_grads, grads.values()): + check(proc_grad, grad * (5.0 / global_norm)) + def test_apply_gradients(self): """Tests the apply_gradients correctness. @@ -92,7 +205,7 @@ def test_apply_gradients(self): standard SGD/Adam update rule. 
""" - learner = get_learner() + learner = get_learner(self.ENV.observation_space, self.ENV.action_space) # calculated the expected new params based on gradients of all ones. params = learner.module[DEFAULT_POLICY_ID].trainable_variables @@ -114,16 +227,15 @@ def test_add_remove_module(self): from default), and remove the default module, with a loss that is the sum of all variables the updated parameters follow the SGD update rule. """ - env = gym.make("CartPole-v1") lr = 1e-3 - learner = get_learner(lr) + learner = get_learner(self.ENV.observation_space, self.ENV.action_space, lr) learner.add_module( module_id="test", module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, + observation_space=self.ENV.observation_space, + action_space=self.ENV.action_space, model_config_dict={"fcnet_hiddens": [16]}, ), ) @@ -139,25 +251,23 @@ def test_add_remove_module(self): expected = [param - n_steps * lr * np.ones(param.shape) for param in params] for _ in range(n_steps): with tf.GradientTape() as tape: - loss = {"total_loss": sum([tf.reduce_sum(param) for param in params])} + loss = {"total_loss": sum(tf.reduce_sum(param) for param in params)} gradients = learner.compute_gradients(loss, tape) learner.apply_gradients(gradients) check(params, expected) def test_save_load_state(self): - env = gym.make("CartPole-v1") - learner1 = BCTfLearner( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, + observation_space=self.ENV.observation_space, + action_space=self.ENV.action_space, model_config_dict={"fcnet_hiddens": [64]}, ), optimizer_config={"lr": 2e-3}, - learner_scaling_config=LearnerGroupScalingConfig(), - framework_hyperparameters=FrameworkHPs(eager_tracing=True), + learner_group_scaling_config=LearnerGroupScalingConfig(), + framework_hyperparameters=FrameworkHyperparameters(eager_tracing=True), ) 
learner1.build() @@ -167,13 +277,13 @@ def test_save_load_state(self): learner2 = BCTfLearner( module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, + observation_space=self.ENV.observation_space, + action_space=self.ENV.action_space, model_config_dict={"fcnet_hiddens": [32]}, ), optimizer_config={"lr": 1e-3}, - learner_scaling_config=LearnerGroupScalingConfig(), - framework_hyperparameters=FrameworkHPs(eager_tracing=True), + learner_group_scaling_config=LearnerGroupScalingConfig(), + framework_hyperparameters=FrameworkHyperparameters(eager_tracing=True), ) learner2.build() learner2.load_state(tmpdir) @@ -185,8 +295,8 @@ def test_save_load_state(self): module_id="test", module_spec=SingleAgentRLModuleSpec( module_class=DiscreteBCTFModule, - observation_space=env.observation_space, - action_space=env.action_space, + observation_space=self.ENV.observation_space, + action_space=self.ENV.action_space, model_config_dict={"fcnet_hiddens": [32]}, ), ) diff --git a/rllib/core/learner/tests/test_learner_group.py b/rllib/core/learner/tests/test_learner_group.py index 55800981a128..e01e81edfb8e 100644 --- a/rllib/core/learner/tests/test_learner_group.py +++ b/rllib/core/learner/tests/test_learner_group.py @@ -36,7 +36,7 @@ LOCAL_SCALING_CONFIGS = { "local-cpu": LearnerGroupScalingConfig(num_workers=0, num_gpus_per_worker=0), - "local-gpu": LearnerGroupScalingConfig(num_workers=0, num_gpus_per_worker=0.5), + "local-gpu": LearnerGroupScalingConfig(num_workers=0, num_gpus_per_worker=1), } @@ -45,6 +45,17 @@ @ray.remote(num_gpus=1) class RemoteTrainingHelper: def local_training_helper(self, fw, scaling_mode) -> None: + if fw == "torch": + import torch + + torch.manual_seed(0) + elif fw == "tf": + import tensorflow as tf + + # this is done by rllib already inside of the policy class, but we need to + # do it here for testing purposes + tf.compat.v1.enable_eager_execution() + 
tf.random.set_seed(0) env = gym.make("CartPole-v1") scaling_config = LOCAL_SCALING_CONFIGS[scaling_mode] lr = 1e-3 @@ -71,13 +82,25 @@ def local_training_helper(self, fw, scaling_mode) -> None: # make the state of the learner and the local learner_group identical local_learner.set_state(learner_group.get_state()) + # learner_group.set_state(learner_group.get_state()) + check(local_learner.get_state(), learner_group.get_state()) # do another update batch = reader.next() ma_batch = MultiAgentBatch( {new_module_id: batch, DEFAULT_POLICY_ID: batch}, env_steps=batch.count ) - check(local_learner.update(ma_batch), learner_group.update(ma_batch)) + # the optimizer state is not initialized fully until the first time that + # training is completed. A call to get state before that won't contain the + # optimizer state. So we do a dummy update here to initialize the optimizer + local_learner.update(ma_batch) + learner_group.update(ma_batch) + + check(local_learner.get_state(), learner_group.get_state()) + local_learner_results = local_learner.update(ma_batch) + learner_group_results = learner_group.update(ma_batch) + + check(local_learner_results, learner_group_results) check(local_learner.get_state(), learner_group.get_state()) @@ -138,8 +161,8 @@ def test_update_multigpu(self): self.assertLess(min_loss, 0.57) - # make sure the learner_group resources are freed up so that we don't - # autoscale + # Make sure the learner_group resources are freed up so that we don't + # autoscale. 
learner_group.shutdown() del learner_group diff --git a/rllib/core/learner/tf/tf_learner.py b/rllib/core/learner/tf/tf_learner.py index 1695940a4170..bfe03311cadc 100644 --- a/rllib/core/learner/tf/tf_learner.py +++ b/rllib/core/learner/tf/tf_learner.py @@ -5,17 +5,18 @@ import tree # pip install dm-tree from typing import ( Any, + Callable, + Hashable, Mapping, - Union, Optional, - Callable, Sequence, - Hashable, + Union, ) from ray.rllib.core.learner.learner import ( - FrameworkHPs, + FrameworkHyperparameters, Learner, + LEARNER_RESULTS_CURR_LR_KEY, ParamOptimizerPair, NamedParamOptimizerPairs, ParamType, @@ -28,8 +29,12 @@ ) from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.utils.annotations import override +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import clip_gradients from ray.rllib.utils.typing import TensorType, ResultDict from ray.rllib.utils.minibatch_utils import ( MiniBatchDummyIterator, @@ -51,7 +56,7 @@ class TfLearner(Learner): def __init__( self, *, - framework_hyperparameters: Optional[FrameworkHPs] = FrameworkHPs(), + framework_hyperparameters: Optional[FrameworkHyperparameters] = None, **kwargs, ): @@ -65,12 +70,17 @@ def __init__( # enable_v2_behavior after variables have already been created. 
pass - super().__init__(framework_hyperparameters=framework_hyperparameters, **kwargs) + super().__init__( + framework_hyperparameters=( + framework_hyperparameters or FrameworkHyperparameters() + ), + **kwargs, + ) - self._enable_tf_function = framework_hyperparameters.eager_tracing + self._enable_tf_function = self._framework_hyperparameters.eager_tracing - # this is a placeholder which will be filled by - # `_make_distributed_strategy_if_necessary` + # This is a placeholder which will be filled by + # `_make_distributed_strategy_if_necessary`. self._strategy: tf.distribute.Strategy = None @override(Learner) @@ -78,13 +88,13 @@ def configure_optimizer_per_module( self, module_id: ModuleID ) -> Union[ParamOptimizerPair, NamedParamOptimizerPairs]: module = self._module[module_id] - lr = self._optimizer_config["lr"] + lr = self.lr_scheduler.get_current_value(module_id) optim = tf.keras.optimizers.Adam(learning_rate=lr) pair: ParamOptimizerPair = ( self.get_parameters(module), optim, ) - # this isn't strictly necessary, but makes it so that if a checkpoint is + # This isn't strictly necessary, but makes it so that if a checkpoint is # computed before training actually starts, then it will be the same in # shape / size as a checkpoint after training starts. optim.build(module.trainable_variables) @@ -97,12 +107,28 @@ def compute_gradients( grads = tape.gradient(loss[self.TOTAL_LOSS_KEY], self._params) return grads + @override(Learner) + def postprocess_gradients( + self, + gradients_dict: Mapping[str, Any], + ) -> Mapping[str, Any]: + """Postprocesses gradients depending on the optimizer config.""" + + # Perform gradient clipping, if necessary. 
+ clip_gradients( + gradients_dict, + grad_clip=self._optimizer_config.get("grad_clip"), + grad_clip_by=self._optimizer_config.get("grad_clip_by"), + ) + + return gradients_dict + @override(Learner) def apply_gradients(self, gradients: ParamDictType) -> None: # TODO (Avnishn, kourosh): apply gradients doesn't work in cases where - # only some agents have a sample batch that is passed but not others. - # This is probably because of the way that we are iterating over the - # parameters in the optim_to_param_dictionary + # only some agents have a sample batch that is passed but not others. + # This is probably because of the way that we are iterating over the + # parameters in the optim_to_param_dictionary. for optim, param_ref_seq in self._optimizer_parameters.items(): variable_list = [ self._params[param_ref] @@ -116,20 +142,6 @@ def apply_gradients(self, gradients: ParamDictType) -> None: ] optim.apply_gradients(zip(gradient_list, variable_list)) - @override(Learner) - def postprocess_gradients( - self, gradients_dict: Mapping[str, Any] - ) -> Mapping[str, Any]: - grad_clip = self._optimizer_config.get("grad_clip", None) - assert isinstance( - grad_clip, (int, float, type(None)) - ), "grad_clip must be a number" - if grad_clip is not None: - gradients_dict = tf.nest.map_structure( - lambda v: tf.clip_by_value(v, -grad_clip, grad_clip), gradients_dict - ) - return gradients_dict - @override(Learner) def load_state( self, @@ -267,6 +279,25 @@ def _load_optimizers(self, path: Union[str, pathlib.Path]) -> None: def set_weights(self, weights: Mapping[str, Any]) -> None: self._module.set_state(weights) + @override(Learner) + def get_optimizer_weights(self) -> Mapping[str, Any]: + optim_weights = {} + with tf.init_scope(): + for name, optim in self._named_optimizers.items(): + optim_weights[name] = [var.numpy() for var in optim.variables()] + return optim_weights + + @override(Learner) + def set_optimizer_weights(self, weights: Mapping[str, Any]) -> None: + for name, 
weight_array in weights.items(): + if name not in self._named_optimizers: + raise ValueError( + f"Optimizer {name} in weights is not known." + f"Known optimizers are {self._named_optimizers.keys()}" + ) + optim = self._named_optimizers[name] + optim.set_weights(weight_array) + @override(Learner) def get_param_ref(self, param: ParamType) -> Hashable: return param.ref() @@ -413,7 +444,7 @@ def update( reduce_fn: Callable[[ResultDict], ResultDict] = ..., ) -> Mapping[str, Any]: # TODO (Kourosh): The update of learner is vastly differnet than the base - # class. So we need to unify them. + # class. So we need to unify them. missing_module_ids = set(batch.policy_batches.keys()) - set(self._module.keys()) if len(missing_module_ids) > 0: raise ValueError( @@ -430,7 +461,7 @@ def update( results = [] for minibatch in batch_iter(batch, minibatch_size, num_iters): # TODO (Avnish): converting to tf tensor and then from nested dict back to - # dict will most likely hit us in perf. But let's go with this for now. + # dict will most likely hit us in perf. But let's go with this for now. tensorbatch = self._convert_batch_type(minibatch) update_outs = self._update_fn(tensorbatch) loss = update_outs["loss"] @@ -448,12 +479,16 @@ def update( return results return reduce_fn(results) - def _do_update_fn(self, batch: MultiAgentBatch) -> Mapping[str, Any]: + def _do_update_fn( + self, + batch: MultiAgentBatch, + _ray_trace_ctx=None, + ) -> Mapping[str, Any]: # TODO (Avnish): Match this base class's implementation. def helper(_batch): # TODO (Kourosh): We need to go back to NestedDict because that's the - # constraint on forward_train and compute_loss APIs. This seems to be - # in-efficient. Make it efficient. + # constraint on forward_train and compute_loss APIs. This seems to be + # in-efficient. Make it efficient. 
_batch = NestedDict(_batch) with tf.GradientTape() as tape: fwd_out = self._module.forward_train(_batch) @@ -485,3 +520,43 @@ def filter_fwd_out(x): } return self._strategy.run(helper, args=(batch,)) + + @OverrideToImplementCustomLogic_CallToSuperRecommended + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, *, timestep: int, **kwargs + ) -> Mapping[str, Any]: + + results = super().additional_update_per_module(module_id, timestep=timestep) + + # Handle lr scheduling updates and apply new learning rates to the optimizers. + new_lr = self.lr_scheduler.update(module_id=module_id, timestep=timestep) + + # Not sure why we need to do this here besides setting the original + # tf Variable `self.curr_lr_per_module[module_id]`. But when tf creates the + # optimizer, it seems to detach its lr value from the given variable. + # Updating this variable is NOT sufficient to update the actual optimizer's + # learning rate, so we have to explicitly set it here. + if self.hps.lr_schedule is not None: + self._named_optimizers[module_id].lr = new_lr + + results.update({LEARNER_RESULTS_CURR_LR_KEY: new_lr}) + + return results + + @override(Learner) + def _get_tensor_variable(self, value, dtype=None, trainable=False) -> "tf.Tensor": + return tf.Variable( + value, + trainable=trainable, + dtype=( + dtype + or ( + tf.float32 + if isinstance(value, float) + else tf.int32 + if isinstance(value, int) + else None + ) + ), + ) diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py index 732e6cde5c45..29c4a2e4fc45 100644 --- a/rllib/core/learner/torch/torch_learner.py +++ b/rllib/core/learner/torch/torch_learner.py @@ -2,11 +2,11 @@ import pathlib from typing import ( Any, - Mapping, - Union, - Sequence, Hashable, + Mapping, Optional, + Sequence, + Union, ) from ray.rllib.core.rl_module.rl_module import ( @@ -17,8 +17,9 @@ from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule from 
ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.core.learner.learner import ( - FrameworkHPs, + FrameworkHyperparameters, Learner, + LEARNER_RESULTS_CURR_LR_KEY, ParamOptimizerPair, NamedParamOptimizerPairs, ParamType, @@ -26,10 +27,18 @@ ) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchDDPRLModule from ray.rllib.policy.sample_batch import MultiAgentBatch -from ray.rllib.utils.annotations import override -from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.annotations import ( + override, + OverrideToImplementCustomLogic, + OverrideToImplementCustomLogic_CallToSuperRecommended, +) from ray.rllib.utils.typing import TensorType from ray.rllib.utils.nested_dict import NestedDict +from ray.rllib.utils.torch_utils import ( + clip_gradients, + convert_to_torch_tensor, + copy_torch_tensors, +) from ray.rllib.utils.framework import try_import_torch torch, nn = try_import_torch() @@ -47,12 +56,17 @@ class TorchLearner(Learner): def __init__( self, *, - framework_hyperparameters: Optional[FrameworkHPs] = FrameworkHPs(), + framework_hyperparameters: Optional[FrameworkHyperparameters] = None, **kwargs, ): - super().__init__(**kwargs) + super().__init__( + framework_hyperparameters=( + framework_hyperparameters or FrameworkHyperparameters() + ), + **kwargs, + ) - # will be set during build + # Will be set during build. 
self._device = None @override(Learner) @@ -60,7 +74,7 @@ def configure_optimizer_per_module( self, module_id: ModuleID ) -> Union[ParamOptimizerPair, NamedParamOptimizerPairs]: module = self._module[module_id] - lr = self._optimizer_config["lr"] + lr = self.lr_scheduler.get_current_value(module_id) pair: ParamOptimizerPair = ( self.get_parameters(module), torch.optim.Adam(self.get_parameters(module), lr=lr), @@ -79,6 +93,35 @@ def compute_gradients( return grads + @OverrideToImplementCustomLogic_CallToSuperRecommended + @override(Learner) + def additional_update_per_module( + self, module_id: ModuleID, *, timestep: int, **kwargs + ) -> Mapping[str, Any]: + results = super().additional_update_per_module(module_id, timestep=timestep) + + # Handle lr scheduling updates and apply new learning rates to the optimizers. + new_lr = self.lr_scheduler.update(module_id=module_id, timestep=timestep) + results.update({LEARNER_RESULTS_CURR_LR_KEY: new_lr}) + + return results + + @override(Learner) + def postprocess_gradients( + self, + gradients_dict: Mapping[str, Any], + ) -> Mapping[str, Any]: + """Postprocesses gradients depending on the optimizer config.""" + + # Perform gradient clipping, if necessary. 
+ clip_gradients( + gradients_dict, + grad_clip=self._optimizer_config.get("grad_clip"), + grad_clip_by=self._optimizer_config.get("grad_clip_by"), + ) + + return gradients_dict + @override(Learner) def apply_gradients(self, gradients: ParamDictType) -> None: # make sure the parameters do not carry gradients on their own @@ -103,16 +146,42 @@ def set_weights(self, weights: Mapping[str, Any]) -> None: def _save_optimizers(self, path: Union[str, pathlib.Path]) -> None: path = pathlib.Path(path) path.mkdir(parents=True, exist_ok=True) - for name, optim in self._named_optimizers.items(): - torch.save(optim.state_dict(), path / f"{name}.pt") + optim_weights = self.get_optimizer_weights() + for name, weights in optim_weights.items(): + torch.save(weights, path / f"{name}.pt") @override(Learner) def _load_optimizers(self, path: Union[str, pathlib.Path]) -> None: path = pathlib.Path(path) if not path.exists(): raise ValueError(f"Directory {path} does not exist.") + weights = {} + for name in self._named_optimizers.keys(): + weights[name] = torch.load(path / f"{name}.pt") + self.set_optimizer_weights(weights) + + @override(Learner) + def get_optimizer_weights(self) -> Mapping[str, Any]: + optimizer_name_weights = {} for name, optim in self._named_optimizers.items(): - optim.load_state_dict(torch.load(path / f"{name}.pt")) + optim_state_dict = optim.state_dict() + optim_state_dict_cpu = copy_torch_tensors(optim_state_dict, device="cpu") + optimizer_name_weights[name] = optim_state_dict_cpu + return optimizer_name_weights + + @override(Learner) + def set_optimizer_weights(self, weights: Mapping[str, Any]) -> None: + for name, weight_dict in weights.items(): + if name not in self._named_optimizers: + raise ValueError( + f"Optimizer {name} in weights is not known." 
+ f"Known optimizers are {self._named_optimizers.keys()}" + ) + optim = self._named_optimizers[name] + weight_dict_correct_device = copy_torch_tensors( + weight_dict, device=self._device + ) + optim.load_state_dict(weight_dict_correct_device) @override(Learner) def get_param_ref(self, param: ParamType) -> Hashable: @@ -154,14 +223,14 @@ def build(self) -> None: """Builds the TorchLearner. This method is specific to TorchLearner. Before running super() it will - initialzed the device properly based on use_gpu and distributed flags, so that - _make_module() can place the created module on the correct device. After - running super() it will wrap the module in a TorchDDPRLModule if distributed is - set. + initialze the device properly based on the `_use_gpu` and `_distributed` + flags, so that `_make_module()` can place the created module on the correct + device. After running super() it will wrap the module in a TorchDDPRLModule + if `_distributed` is True. """ - # TODO (Kourosh): How do we handle model parallism? + # TODO (Kourosh): How do we handle model parallelism? # TODO (Kourosh): Instead of using _TorchAccelerator, we should use the public - # api in ray.train but allow for session to be None without any errors raised. + # API in ray.train but allow for session to be None without any errors raised. if self._use_gpu: # get_device() returns the 0th device if # it is called from outside of a Ray Train session. Its necessary to give @@ -183,20 +252,32 @@ def build(self) -> None: self._device = torch.device("cpu") super().build() - # if the module is a MultiAgentRLModule and nn.Module we can simply assume + + self._make_modules_ddp_if_necessary() + + @OverrideToImplementCustomLogic + def _make_modules_ddp_if_necessary(self) -> None: + """Default logic for (maybe) making all Modules within self._module DDP.""" + + # If the module is a MultiAgentRLModule and nn.Module we can simply assume # all the submodules are registered. 
Otherwise, we need to loop through # each submodule and move it to the correct device. # TODO (Kourosh): This can result in missing modules if the user does not - # register them in the MultiAgentRLModule. We should find a better way to - # handle this. + # register them in the MultiAgentRLModule. We should find a better way to + # handle this. if self._distributed: + # Single agent module: Convert to `TorchDDPRLModule`. if isinstance(self._module, TorchRLModule): self._module = TorchDDPRLModule(self._module) + # Multi agent module: Convert each submodule to `TorchDDPRLModule`. else: + assert isinstance(self._module, MultiAgentRLModule) for key in self._module.keys(): - if isinstance(self._module[key], TorchRLModule): + sub_module = self._module[key] + if isinstance(sub_module, TorchRLModule): + # Wrap and override the module ID key in self._module. self._module.add_module( - key, TorchDDPRLModule(self._module[key]), override=True + key, TorchDDPRLModule(sub_module), override=True ) def _is_module_compatible_with_learner(self, module: RLModule) -> bool: @@ -233,3 +314,23 @@ def _map_module_to_device(self, module: MultiAgentRLModule) -> None: for key in module.keys(): if isinstance(module[key], torch.nn.Module): module[key].to(self._device) + + @override(Learner) + def _get_tensor_variable( + self, value, dtype=None, trainable=False + ) -> "torch.Tensor": + return torch.tensor( + value, + requires_grad=trainable, + device=self._device, + dtype=( + dtype + or ( + torch.float32 + if isinstance(value, float) + else torch.int32 + if isinstance(value, int) + else None + ) + ), + ) diff --git a/rllib/core/models/base.py b/rllib/core/models/base.py index 0f00f87d66e5..dca2d030deb6 100644 --- a/rllib/core/models/base.py +++ b/rllib/core/models/base.py @@ -2,13 +2,10 @@ from dataclasses import dataclass from typing import List, Optional, Tuple, Union -from ray.rllib.core.models.specs.checker import convert_to_canonical_format from ray.rllib.core.models.specs.specs_base 
import Spec -from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import ExperimentalAPI from ray.rllib.utils.annotations import override -from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import TensorType # Top level keys that unify model i/o. @@ -20,13 +17,6 @@ CRITIC: str = "critic" -def _raise_not_decorated_exception(class_and_method, input_or_output): - raise ValueError( - f"`{class_and_method}()` not decorated with {input_or_output} specification. " - f"Decorate it with @check_{input_or_output}_specs() to define a specification." - ) - - @ExperimentalAPI @dataclass class ModelConfig(abc.ABC): @@ -45,10 +35,16 @@ class ModelConfig(abc.ABC): Attributes: input_dims: The input dimensions of the network output_dims: The output dimensions of the network. + always_check_shapes: Whether to always check the inputs and outputs of the + model for the specifications. Input specifications are checked on failed + forward passes of the model regardless of this flag. If this flag is set + to `True`, inputs and outputs are checked on every call. This leads to + a slow-down and should only be used for debugging. """ input_dims: Union[List[int], Tuple[int]] = None output_dims: Union[List[int], Tuple[int]] = None + always_check_shapes: bool = False @abc.abstractmethod def build(self, framework: str): @@ -60,6 +56,7 @@ def build(self, framework: str): raise NotImplementedError +@ExperimentalAPI class Model(abc.ABC): """Framework-agnostic base class for RLlib models. @@ -181,15 +178,15 @@ def output_specs(self, spec: Spec) -> None: "you want to override this behavior." ) - def get_initial_state(self) -> Union[NestedDict, List[TensorType]]: + def get_initial_state(self) -> Union[dict, List[TensorType]]: """Returns the initial state of the Model. It can be left empty if this Model is not stateful. 
""" - return NestedDict() + return dict() @abc.abstractmethod - def _forward(self, input_dict: NestedDict, **kwargs) -> NestedDict: + def _forward(self, input_dict: dict, **kwargs) -> dict: """Returns the output of this model for the given input. This method is called by the forwarding method of the respective framework @@ -200,7 +197,7 @@ def _forward(self, input_dict: NestedDict, **kwargs) -> NestedDict: **kwargs: Forward compatibility kwargs. Returns: - NestedDict: The output tensors. + dict: The output tensors. """ @abc.abstractmethod @@ -227,6 +224,7 @@ def _set_to_dummy_weights(self, value_sequence=(-0.02, -0.01, 0.01, 0.02)) -> No """ +@ExperimentalAPI class Encoder(Model, abc.ABC): """The framework-agnostic base class for all RLlib encoders. @@ -301,14 +299,14 @@ def build(self, framework: str): @override(Model) def get_input_specs(self) -> Optional[Spec]: - return convert_to_canonical_format([SampleBatch.OBS, STATE_IN]) + return [SampleBatch.OBS] @override(Model) def get_output_specs(self) -> Optional[Spec]: - return convert_to_canonical_format([ENCODER_OUT, STATE_OUT]) + return [] @abc.abstractmethod - def _forward(self, input_dict: NestedDict, **kwargs) -> NestedDict: + def _forward(self, input_dict: dict, **kwargs) -> dict: """Returns the latent of the encoder for the given inputs. This method is called by the forwarding method of the respective framework @@ -319,8 +317,8 @@ def _forward(self, input_dict: NestedDict, **kwargs) -> NestedDict: The output dict contains at minimum the latent and the state of the encoder (None for stateless encoders). To establish an agreement between the encoder and RLModules, these values - have the fixed keys `SampleBatch.OBS` and `STATE_IN` for the `input_dict`, - and `STATE_OUT` and `ENCODER_OUT` for the returned NestedDict. + have the fixed keys `SampleBatch.OBS` for the `input_dict`, + and `ACTOR` and `CRITIC` for the returned dict. Args: input_dict: The input tensors. 
Must contain at a minimum the keys @@ -329,24 +327,25 @@ def _forward(self, input_dict: NestedDict, **kwargs) -> NestedDict: **kwargs: Forward compatibility kwargs. Returns: - NestedDict: The output tensors. Must contain at a minimum the keys - ENCODER_OUT and STATE_OUT (which might be None for stateless encoders). + The output tensors. Must contain at a minimum the key ENCODER_OUT. """ - raise NotImplementedError +@ExperimentalAPI class ActorCriticEncoder(Encoder): - """An encoder that potentially holds two encoders. + """An encoder that potentially holds two stateless encoders. - This is a special case of encoder that can either enclose a single, + This is a special case of Encoder that can either enclose a single, shared encoder or two separate encoders: One for the actor and one for the - critic. The two encoders are of the same type and we can therefore make the + critic. The two encoders are of the same type, and we can therefore make the assumption that they have the same input and output specs. """ framework = None def __init__(self, config: ModelConfig) -> None: + super().__init__(config) + if config.shared: self.encoder = config.base_encoder_config.build(framework=self.framework) else: @@ -357,47 +356,75 @@ def __init__(self, config: ModelConfig) -> None: framework=self.framework ) - # We need to call Encoder.__init__() after initializing the encoder(s) in - # order to build on their specs. 
- super().__init__(config) - @override(Model) def get_input_specs(self) -> Optional[Spec]: - # if self.config.shared: - # state_in_spec = self.encoder.input_specs[STATE_IN] - # else: - # state_in_spec = { - # ACTOR: self.actor_encoder.input_specs[STATE_IN], - # CRITIC: self.critic_encoder.input_specs[STATE_IN], - # } - - return SpecDict( - { - SampleBatch.OBS: None, - # STATE_IN: state_in_spec, - # SampleBatch.SEQ_LENS: None, - } - ) + return [SampleBatch.OBS] @override(Model) def get_output_specs(self) -> Optional[Spec]: + return [(ENCODER_OUT, ACTOR), (ENCODER_OUT, CRITIC)] + + @override(Model) + def _forward(self, inputs: dict, **kwargs) -> dict: if self.config.shared: - state_out_spec = self.encoder.output_specs[STATE_OUT] - else: - state_out_spec = { - ACTOR: self.actor_encoder.output_specs[STATE_OUT], - CRITIC: self.critic_encoder.output_specs[STATE_OUT], + encoder_outs = self.encoder(inputs, **kwargs) + return { + ENCODER_OUT: { + ACTOR: encoder_outs[ENCODER_OUT], + CRITIC: encoder_outs[ENCODER_OUT], + } } + else: + # Encoders should not modify inputs, so we can pass the same inputs + actor_out = self.actor_encoder(inputs, **kwargs) + critic_out = self.critic_encoder(inputs, **kwargs) - return SpecDict( - { + return { ENCODER_OUT: { - ACTOR: None, - CRITIC: None, - }, - STATE_OUT: state_out_spec, + ACTOR: actor_out[ENCODER_OUT], + CRITIC: critic_out[ENCODER_OUT], + } } - ) + + +@ExperimentalAPI +class StatefulActorCriticEncoder(Encoder): + """An encoder that potentially holds two potentially stateful encoders. + + This is a special case of Encoder that can either enclose a single, + shared encoder or two separate encoders: One for the actor and one for the + critic. The two encoders are of the same type, and we can therefore make the + assumption that they have the same input and output specs. + + If this encoder wraps a single encoder, state in input- and output dicts + is simply stored under the key `STATE_IN` and `STATE_OUT`, respectively. 
+ If this encoder wraps two encoders, state in input- and output dicts is + stored under the keys `(STATE_IN, ACTOR)` and `(STATE_IN, CRITIC)` and + `(STATE_OUT, ACTOR)` and `(STATE_OUT, CRITIC)`, respectively. + """ + + framework = None + + def __init__(self, config: ModelConfig) -> None: + super().__init__(config) + + if config.shared: + self.encoder = config.base_encoder_config.build(framework=self.framework) + else: + self.actor_encoder = config.base_encoder_config.build( + framework=self.framework + ) + self.critic_encoder = config.base_encoder_config.build( + framework=self.framework + ) + + @override(Model) + def get_input_specs(self) -> Optional[Spec]: + return [SampleBatch.OBS, STATE_IN] + + @override(Model) + def get_output_specs(self) -> Optional[Spec]: + return [(ENCODER_OUT, ACTOR), (ENCODER_OUT, CRITIC), (STATE_OUT,)] @override(Model) def get_initial_state(self): @@ -410,32 +437,33 @@ def get_initial_state(self): } @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: + def _forward(self, inputs: dict, **kwargs) -> dict: + outputs = {} + if self.config.shared: outs = self.encoder(inputs, **kwargs) - return NestedDict( - { - ENCODER_OUT: {ACTOR: outs[ENCODER_OUT], CRITIC: outs[ENCODER_OUT]}, - STATE_OUT: outs[STATE_OUT], - } - ) + encoder_out = outs.pop(ENCODER_OUT) + outputs[ENCODER_OUT] = {ACTOR: encoder_out, CRITIC: encoder_out} + outputs[STATE_OUT] = outs[STATE_OUT] else: - actor_inputs = NestedDict({**inputs}) - # , **{STATE_IN: inputs[STATE_IN][ACTOR]}}) - critic_inputs = NestedDict( - {**inputs} # , **{STATE_IN: inputs[STATE_IN][CRITIC]}} - ) + # Shallow copy inputs so that we can add states without modifying + # original dict. 
+ actor_inputs = inputs.copy() + critic_inputs = inputs.copy() + actor_inputs[STATE_IN] = inputs[STATE_IN][ACTOR] + critic_inputs[STATE_IN] = inputs[STATE_IN][CRITIC] + actor_out = self.actor_encoder(actor_inputs, **kwargs) critic_out = self.critic_encoder(critic_inputs, **kwargs) - return NestedDict( - { - ENCODER_OUT: { - ACTOR: actor_out[ENCODER_OUT], - CRITIC: critic_out[ENCODER_OUT], - }, - STATE_OUT: { - ACTOR: actor_out[STATE_OUT], - CRITIC: critic_out[STATE_OUT], - }, - } - ) + + outputs[ENCODER_OUT] = { + ACTOR: actor_out[ENCODER_OUT], + CRITIC: critic_out[ENCODER_OUT], + } + + outputs[STATE_OUT] = { + ACTOR: actor_out[STATE_OUT], + CRITIC: critic_out[STATE_OUT], + } + + return outputs diff --git a/rllib/core/models/catalog.py b/rllib/core/models/catalog.py index b8dde4673ad0..53f9e5776758 100644 --- a/rllib/core/models/catalog.py +++ b/rllib/core/models/catalog.py @@ -314,6 +314,9 @@ def get_encoder_config( activation = model_config_dict["fcnet_activation"] output_activation = model_config_dict["fcnet_activation"] fcnet_hiddens = model_config_dict["fcnet_hiddens"] + # TODO (sven): Move to a new ModelConfig object (dataclass) asap, instead of + # "linking" into the old ModelConfig (dict)! This just causes confusion as to + # which old keys now mean what for the new RLModules-based default models. encoder_latent_dim = ( model_config_dict["encoder_latent_dim"] or fcnet_hiddens[-1] ) @@ -364,12 +367,18 @@ def get_encoder_config( encoder_config = CNNEncoderConfig( input_dims=observation_space.shape, cnn_filter_specifiers=model_config_dict["conv_filters"], - cnn_activation=activation, + cnn_activation=model_config_dict["conv_activation"], cnn_use_layernorm=model_config_dict.get( "conv_use_layernorm", False ), output_dims=[encoder_latent_dim], - output_activation=output_activation, + # TODO (sven): Setting this to None here helps with the existing + # APPO Pong benchmark (actually, leaving this at default=tanh does + # NOT learn at all!). 
+ # We need to remove the last Dense layer from CNNEncoder in general + # AND establish proper ModelConfig objects (instead of hacking + # everything with the old default model config dict). + output_activation=None, ) # input_space is a 2D Box elif ( diff --git a/rllib/core/models/specs/checker.py b/rllib/core/models/specs/checker.py index 67a43e6f9e5f..da30e6ce5a52 100644 --- a/rllib/core/models/specs/checker.py +++ b/rllib/core/models/specs/checker.py @@ -1,13 +1,23 @@ -from collections import abc import functools +import logging +from collections import abc from typing import Union, Mapping, Any, Callable -from ray.util.annotations import DeveloperAPI - -from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.core.models.specs.specs_base import Spec, TypeSpec from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.models.specs.typing import SpecType +from ray.rllib.utils.nested_dict import NestedDict +from ray.util.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class SpecCheckingError(Exception): + """Raised when there is an error in the spec checking. + + This Error is raised when inputs or outputs do match the defined specs. 
+ """ @DeveloperAPI @@ -50,10 +60,11 @@ def convert_to_canonical_format(spec: SpecType) -> Union[Spec, SpecDict]: # {"foo": TypeSpec(int), "bar": SpecDict({"baz": TypeSpec(str)})} # ) - spec = {"foo": int, "bar": {"baz": TorchTensorSpec("b,h")}} + spec = {"foo": int, "bar": {"baz": TensorSpec("b,h", framework="torch")}} output = convert_to_canonical_format(spec) # output = SpecDict( - # {"foo": TypeSpec(int), "bar": SpecDict({"baz": TorchTensorSpec("b,h")})} + # {"foo": TypeSpec(int), "bar": SpecDict({"baz": TensorSpec("b,h", + framework="torch")})} # ) @@ -68,9 +79,9 @@ def convert_to_canonical_format(spec: SpecType) -> Union[Spec, SpecDict]: output = convert_to_canonical_format(spec) # output = None - spec = TorchTensorSpec("b,h") + spec = TensorSpec("b,h", framework="torch") output = convert_to_canonical_format(spec) - # output = TorchTensorSpec("b,h") + # output = TensorSpec("b,h", framework="torch") Args: spec: The spec to convert to canonical format. @@ -160,7 +171,7 @@ def _validate( try: spec.validate(data) except ValueError as e: - raise ValueError( + raise SpecCheckingError( f"{tag} spec validation failed on " f"{cls_instance.__class__.__name__}.{method.__name__}, {e}." ) @@ -172,6 +183,7 @@ def _validate( def check_input_specs( input_specs: str, *, + only_check_on_retry: bool = True, filter: bool = False, cache: bool = False, ): @@ -209,6 +221,9 @@ def check_input_specs( string in input_specs and returns the `SpecDict`, `Spec`, or simply the `Type` that the `input_data` should comply with. It can also be None or empty list / dict to enforce no input spec. + only_check_on_retry: If True, the spec will not be checked. Only if the + decorated method raises an Exception, we check the spec to provide a more + informative error message. 
filter: If True, and `input_data` is a nested dict the `input_data` will be filtered by its corresponding spec tree structure and then passed into the implemented function to make sure user is not confounded with unnecessary @@ -230,12 +245,36 @@ def decorator(func): def wrapper(self, input_data, **kwargs): if cache and not hasattr(self, "__checked_input_specs_cache__"): self.__checked_input_specs_cache__ = {} + if cache and func.__name__ not in self.__checked_input_specs_cache__: + self.__checked_input_specs_cache__[func.__name__] = True + initial_exception = None + if only_check_on_retry: + # Attempt to run the function without spec checking + try: + return func(self, input_data, **kwargs) + except SpecCheckingError as e: + raise e + except Exception as e: + # We store the initial exception to raise it later if the spec + # check fails. + initial_exception = e + logger.error( + f"Exception {e} raised on function call without checkin " + f"input specs. RLlib will now attempt to check the spec " + f"before calling the function again." + ) + + # If the function was not executed successfully yet, we check specs checked_data = input_data if input_specs: - spec = getattr(self, input_specs, "___NOT_FOUND___") - if spec == "___NOT_FOUND___": - raise ValueError(f"object {self} has no attribute {input_specs}.") + if hasattr(self, input_specs): + spec = getattr(self, input_specs) + else: + raise SpecCheckingError( + f"object {self} has no attribute {input_specs}." 
+ ) + if spec is not None: spec = convert_to_canonical_format(spec) checked_data = _validate( @@ -251,12 +290,12 @@ def wrapper(self, input_data, **kwargs): # filtering should happen regardless of cache checked_data = checked_data.filter(spec) - output_data = func(self, checked_data, **kwargs) - - if cache and func.__name__ not in self.__checked_input_specs_cache__: - self.__checked_input_specs_cache__[func.__name__] = True + # If we have encountered an exception from calling `func` already, + # we raise it again here and don't need to call func again. + if initial_exception: + raise initial_exception - return output_data + return func(self, checked_data, **kwargs) wrapper.__checked_input_specs__ = True return wrapper @@ -320,9 +359,11 @@ def wrapper(self, input_data, **kwargs): output_data = func(self, input_data, **kwargs) if output_specs: - spec = getattr(self, output_specs, "___NOT_FOUND___") - if spec == "___NOT_FOUND___": + if hasattr(self, output_specs): + spec = getattr(self, output_specs) + else: raise ValueError(f"object {self} has no attribute {output_specs}.") + if spec is not None: spec = convert_to_canonical_format(spec) _validate( diff --git a/rllib/core/models/specs/specs_base.py b/rllib/core/models/specs/specs_base.py index 8e22ee7d0cfb..b1f693891e92 100644 --- a/rllib/core/models/specs/specs_base.py +++ b/rllib/core/models/specs/specs_base.py @@ -1,10 +1,17 @@ import abc from copy import deepcopy +import numpy as np from typing import Any, Optional, Dict, List, Tuple, Union, Type +from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch +from ray.rllib.utils.annotations import OverrideToImplementCustomLogic from ray.rllib.utils.annotations import DeveloperAPI, override from ray.rllib.utils.typing import TensorType +torch, _ = try_import_torch() +_, tf, _ = try_import_tf() +jax, _ = try_import_jax() + _INVALID_INPUT_DUP_DIM = "Duplicate dimension names in shape ({})" _INVALID_INPUT_UNKNOWN_DIM = "Unknown dimension name {} in 
shape ({})" _INVALID_INPUT_POSITIVE = "Dimension {} in ({}) must be positive, got {}" @@ -71,6 +78,8 @@ class TensorSpec(Spec): dtype: The dtype of the tensor. If None, the dtype is not checked during validation. Also during Sampling the dtype is set the default dtype of the backend. + framework: The framework of the tensor. If None, the framework is not + checked during validation. shape_vals: An optional dictionary mapping some dimension names to their values. For example, if shape is "B, C" and shape_vals is {"C": 3}, then the shape of the tensor is (B, 3). B is to be determined during @@ -87,14 +96,8 @@ class TensorSpec(Spec): validate: Checks if the shape and dtype of the tensor matches the specification. fill: creates a tensor with the specified value that is an - example of a tensor that matches the specification. - - Abstract Methods: - get_type: Returns the type of the tensor, e.g. tf.Tensor or torch.Tensor. - get_shape: Returns the shape of the tensor depending on the backend. - get_dtype: Returns the dtype of the tensor depending on the backend. - _full: Creates a tensor with the specified value that - has values of fill_value, shape of shape, and dtype of self.dtype. + example of a tensor that matches the specification. This can only be + called if `framework` is specified. 
""" def __init__( @@ -102,17 +105,77 @@ def __init__( shape: str, *, dtype: Optional[Any] = None, + framework: Optional[str] = None, **shape_vals: int, ) -> None: self._expected_shape = self._parse_expected_shape(shape, shape_vals) self._full_shape = self._get_full_shape() self._dtype = dtype + self._framework = framework + + if framework not in ("tf2", "torch", "np", "jax", None): + raise ValueError(f"Unknown framework {self._framework}") + + self._type = self._get_expected_type() + + @OverrideToImplementCustomLogic + def _get_expected_type(self) -> Type: + """Returns the expected type of the checked tensor.""" + if self._framework == "torch": + return torch.Tensor + elif self._framework == "tf2": + return tf.Tensor + elif self._framework == "np": + return np.ndarray + elif self._framework == "jax": + jax, _ = try_import_jax() + return jax.numpy.ndarray + elif self._framework is None: + # Don't restrict the type of the tensor if no framework is specified. + return object + + @OverrideToImplementCustomLogic + def get_shape(self, tensor: TensorType) -> Tuple[int]: + """Returns the shape of a tensor. + + Args: + tensor: The tensor whose shape is to be returned. + Returns: + A `tuple` specifying the shape of the tensor. + """ + if self._framework == "tf2": + # tf2 returns `Dimension` objects instead of `int` objects. + return tuple( + int(i) if i is not None else None for i in tensor.shape.as_list() + ) + return tuple(tensor.shape) + + @OverrideToImplementCustomLogic + def get_dtype(self, tensor: TensorType) -> Any: + """Returns the expected data type of the checked tensor. + + Args: + tensor: The tensor whose data type is to be returned. + Returns: + The data type of the tensor. 
+ """ + return tensor.dtype + + @property + def dtype(self) -> Any: + """Returns the expected data type of the checked tensor.""" + return self._dtype @property def shape(self) -> Tuple[Union[int, str]]: """Returns a `tuple` specifying the abstract tensor shape (int and str).""" return self._expected_shape + @property + def type(self) -> Type: + """Returns the expected type of the checked tensor.""" + return self._type + @property def full_shape(self) -> Tuple[int]: """Returns a `tuple` specifying the concrete tensor shape (only ints).""" @@ -154,11 +217,6 @@ def append(self, spec: "TensorSpec") -> "TensorSpec": copy_._full_shape = self._get_full_shape() return copy_ - @property - def dtype(self) -> Any: - """Returns a dtype specifying the tensor dtype.""" - return self._dtype - @override(Spec) def validate(self, tensor: TensorType) -> None: """Checks if the shape and dtype of the tensor matches the specification. @@ -170,9 +228,8 @@ def validate(self, tensor: TensorType) -> None: ValueError: If the shape or dtype of the tensor does not match the """ - expected_type = self.get_type() - if not isinstance(tensor, expected_type): - raise ValueError(_INVALID_TYPE.format(expected_type, type(tensor).__name__)) + if not isinstance(tensor, self.type): + raise ValueError(_INVALID_TYPE.format(self.type, type(tensor).__name__)) shape = self.get_shape(tensor) if len(shape) != len(self._expected_shape): @@ -182,42 +239,10 @@ def validate(self, tensor: TensorType) -> None: if isinstance(expected_d, int) and expected_d != actual_d: raise ValueError(_INVALID_SHAPE.format(self._expected_shape, shape)) - dtype = self.get_dtype(tensor) + dtype = tensor.dtype if self.dtype and dtype != self.dtype: raise ValueError(_INVALID_TYPE.format(self.dtype, tensor.dtype)) - @classmethod - @abc.abstractmethod - def get_type(cls) -> Union[Type, Tuple[Type]]: - """Returns the type of a tensor e.g. torch.Tensor or tf.Tensor. - - Returns: - The type of a tensor. 
If the backend supports multiple tensor types, then a - tuple of types is returned. - """ - - @abc.abstractmethod - def get_shape(self, tensor: TensorType) -> Tuple[int]: - """Returns the shape of a tensor. - - Args: - tensor: The tensor whose shape is to be returned. - - Returns: - A `tuple` specifying the shape of the tensor. - """ - - @abc.abstractmethod - def get_dtype(self, tensor: TensorType) -> Any: - """Returns the data type of a tensor. - - Args: - tensor: The tensor whose data type is to be returned. - - Returns: - The data type of the tensor. - """ - @DeveloperAPI def fill(self, fill_value: Union[float, int] = 0) -> TensorType: """Creates a tensor filled with `fill_value` that matches the specs. @@ -227,23 +252,30 @@ def fill(self, fill_value: Union[float, int] = 0) -> TensorType: Returns: A tensor with the specified value that matches the specs. + + Raises: + ValueError: If `framework` is not specified. """ - return self._full(self.full_shape, fill_value) - @abc.abstractmethod - def _full(self, shape: Tuple[int], fill_value: Union[float, int] = 0) -> TensorType: - """Creates a tensor with the given shape filled with `fill_value`. + if self._framework == "torch": + return torch.full(self.full_shape, fill_value, dtype=self.dtype) - The tensor dtype is inferred from `fill_value`. This is equivalent to - np.full(shape, val). + elif self._framework == "tf2": + if self.dtype: + return tf.ones(self.full_shape, dtype=self.dtype) * fill_value + return tf.fill(self.full_shape, fill_value) - Args: - shape: The shape of the tensor to be sampled. - fill_value: The value to fill the tensor with. + elif self._framework == "np": + return np.full(self.full_shape, fill_value, dtype=self.dtype) - Returns: - A tensor with the specified value that matches the specs. 
- """ + elif self._framework == "jax": + return jax.numpy.full(self.full_shape, fill_value, dtype=self.dtype) + + elif self._framework is None: + raise ValueError( + "Cannot fill tensor without providing `framework` to TensorSpec. " + "This TensorSpec was instantiated without `framework`." + ) def _get_full_shape(self) -> Tuple[int]: """Converts the expected shape to a shape by replacing the unknown dimension diff --git a/rllib/core/models/specs/specs_jax.py b/rllib/core/models/specs/specs_jax.py deleted file mode 100644 index 50fb24d1c275..000000000000 --- a/rllib/core/models/specs/specs_jax.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import Tuple, Any, Union, Type - -from ray.rllib.utils.annotations import DeveloperAPI, override -from ray.rllib.utils.framework import try_import_jax -from ray.rllib.core.models.specs.specs_base import TensorSpec - -jax, _ = try_import_jax() -jnp = None -if jax is not None: - jnp = jax.numpy - - -@DeveloperAPI -class JAXTensorSpec(TensorSpec): - @override(TensorSpec) - def get_type(cls) -> Type: - return jnp.ndarray - - @override(TensorSpec) - def get_shape(self, tensor: jnp.ndarray) -> Tuple[int]: - return tuple(tensor.shape) - - @override(TensorSpec) - def get_dtype(self, tensor: jnp.ndarray) -> Any: - return tensor.dtype - - @override(TensorSpec) - def _full( - self, shape: Tuple[int], fill_value: Union[float, int] = 0 - ) -> jnp.ndarray: - return jnp.full(shape, fill_value, dtype=self.dtype) diff --git a/rllib/core/models/specs/specs_np.py b/rllib/core/models/specs/specs_np.py deleted file mode 100644 index 7f4c6fb901b2..000000000000 --- a/rllib/core/models/specs/specs_np.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Tuple, Any, Union, Type -import numpy as np - -from ray.rllib.utils.annotations import DeveloperAPI, override -from ray.rllib.core.models.specs.specs_base import TensorSpec - - -@DeveloperAPI -class NPTensorSpec(TensorSpec): - @override(TensorSpec) - def get_type(cls) -> Type: - return np.ndarray - - 
@override(TensorSpec) - def get_shape(self, tensor: np.ndarray) -> Tuple[int]: - return tuple(tensor.shape) - - @override(TensorSpec) - def get_dtype(self, tensor: np.ndarray) -> Any: - return tensor.dtype - - @override(TensorSpec) - def _full(self, shape: Tuple[int], fill_value: Union[float, int] = 0) -> np.ndarray: - return np.full(shape, fill_value, dtype=self.dtype) diff --git a/rllib/core/models/specs/specs_tf.py b/rllib/core/models/specs/specs_tf.py deleted file mode 100644 index 17efbf7fbc44..000000000000 --- a/rllib/core/models/specs/specs_tf.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Tuple, Any, Union, Type - -from ray.rllib.utils.annotations import DeveloperAPI, override -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.core.models.specs.specs_base import TensorSpec - -_, tf, tfv = try_import_tf() - - -@DeveloperAPI -class TfTensorSpec(TensorSpec): - @override(TensorSpec) - def get_type(cls) -> Type: - return tf.Tensor - - @override(TensorSpec) - def get_shape(self, tensor: "tf.Tensor") -> Tuple[int]: - return tuple(tensor.shape) - - @override(TensorSpec) - def get_dtype(self, tensor: "tf.Tensor") -> Any: - return tensor.dtype - - @override(TensorSpec) - def _full( - self, shape: Tuple[int], fill_value: Union[float, int] = 0 - ) -> "tf.Tensor": - if self.dtype: - return tf.ones(shape, dtype=self.dtype) * fill_value - return tf.fill(shape, fill_value) diff --git a/rllib/core/models/specs/specs_torch.py b/rllib/core/models/specs/specs_torch.py deleted file mode 100644 index 85ea1ae3750c..000000000000 --- a/rllib/core/models/specs/specs_torch.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Tuple, Any, Union, Type - -from ray.rllib.utils.annotations import DeveloperAPI, override -from ray.rllib.utils.framework import try_import_torch -from ray.rllib.core.models.specs.specs_base import TensorSpec - - -torch, _ = try_import_torch() - - -@DeveloperAPI -class TorchTensorSpec(TensorSpec): - @override(TensorSpec) - def 
get_type(cls) -> Type: - return torch.Tensor - - @override(TensorSpec) - def get_shape(self, tensor: torch.Tensor) -> Tuple[int]: - return tuple(tensor.shape) - - @override(TensorSpec) - def get_dtype(self, tensor: torch.Tensor) -> Any: - return tensor.dtype - - @override(TensorSpec) - def _full( - self, shape: Tuple[int], fill_value: Union[float, int] = 0 - ) -> torch.Tensor: - return torch.full(shape, fill_value, dtype=self.dtype) diff --git a/rllib/core/models/specs/tests/test_check_specs.py b/rllib/core/models/specs/tests/test_check_specs.py index ef05d2a5f9dd..fecf5b39b3d2 100644 --- a/rllib/core/models/specs/tests/test_check_specs.py +++ b/rllib/core/models/specs/tests/test_check_specs.py @@ -1,20 +1,21 @@ import abc -import numpy as np import time -import torch -from typing import Dict, Any, Type import unittest +from typing import Dict, Any, Type -from ray.rllib.core.models.specs.specs_base import TensorSpec, TypeSpec -from ray.rllib.core.models.specs.specs_dict import SpecDict -from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec -from ray.rllib.utils.annotations import override -from ray.rllib.utils.nested_dict import NestedDict +import numpy as np +import torch + +from ray.rllib.core.models.specs.checker import SpecCheckingError from ray.rllib.core.models.specs.checker import ( convert_to_canonical_format, check_input_specs, check_output_specs, ) +from ray.rllib.core.models.specs.specs_base import TensorSpec, TypeSpec +from ray.rllib.core.models.specs.specs_dict import SpecDict +from ray.rllib.utils.annotations import override +from ray.rllib.utils.nested_dict import NestedDict ONLY_ONE_KEY_ALLOWED = "Only one key is allowed in the data dict." 
@@ -33,7 +34,9 @@ def input_specs(self) -> SpecDict: def output_specs(self) -> SpecDict: pass - @check_input_specs("input_specs", filter=True, cache=False) + @check_input_specs( + "input_specs", filter=True, cache=False, only_check_on_retry=False + ) @check_output_specs("output_specs", cache=False) def check_input_and_output(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: return self._check_input_and_output(input_dict) @@ -42,7 +45,9 @@ def check_input_and_output(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: def _check_input_and_output(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: pass - @check_input_specs("input_specs", filter=True, cache=False) + @check_input_specs( + "input_specs", filter=True, cache=False, only_check_on_retry=False + ) def check_only_input(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: """should not override this method""" return self._check_only_input(input_dict) @@ -60,7 +65,9 @@ def check_only_output(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: def _check_only_output(self, input_dict: Dict[str, Any]) -> Dict[str, Any]: pass - @check_input_specs("input_specs", filter=True, cache=True) + @check_input_specs( + "input_specs", filter=True, cache=True, only_check_on_retry=False + ) @check_output_specs("output_specs", cache=True) def check_input_and_output_with_cache( self, input_dict: Dict[str, Any] @@ -68,7 +75,9 @@ def check_input_and_output_with_cache( """should not override this method""" return self._check_input_and_output(input_dict) - @check_input_specs("input_specs", filter=False, cache=False) + @check_input_specs( + "input_specs", filter=False, cache=False, only_check_on_retry=False + ) @check_output_specs("output_specs", cache=False) def check_input_and_output_wo_filter(self, input_dict) -> Dict[str, Any]: """should not override this method""" @@ -139,7 +148,8 @@ def test_check_input_and_output(self): # This should raise an error saying that the `input` key is missing. 
self.assertRaises( - ValueError, lambda: correct_module.check_input_and_output({"not_input": 2}) + SpecCheckingError, + lambda: correct_module.check_input_and_output({"not_input": 2}), ) def test_check_only_input(self): @@ -165,7 +175,8 @@ def test_incorrect_implementation(self): # this should raise an error saying that the output does not match the # `output_specs`. self.assertRaises( - ValueError, lambda: incorrect_module.check_input_and_output({"input": 2}) + SpecCheckingError, + lambda: incorrect_module.check_input_and_output({"input": 2}), ) # this should not raise an error because output is not forced to be checked @@ -173,7 +184,8 @@ def test_incorrect_implementation(self): # This should raise an error because output does not match the `output_specs`. self.assertRaises( - ValueError, lambda: incorrect_module.check_only_output({"not_input": 2}) + SpecCheckingError, + lambda: incorrect_module.check_only_output({"not_input": 2}), ) def test_filter(self): @@ -239,15 +251,15 @@ def test_tensor_specs(self): class ClassWithTensorSpec: @property def input_spec1(self) -> TensorSpec: - return TorchTensorSpec("b, h", h=4) + return TensorSpec("b, h", h=4, framework="torch") - @check_input_specs("input_spec1", cache=False) + @check_input_specs("input_spec1", cache=False, only_check_on_retry=False) def forward(self, input_data) -> Any: return input_data module = ClassWithTensorSpec() module.forward(torch.rand(2, 4)) - self.assertRaises(ValueError, lambda: module.forward(torch.rand(2, 3))) + self.assertRaises(SpecCheckingError, lambda: module.forward(torch.rand(2, 3))) def test_type_specs(self): class SpecialOutputType: @@ -272,7 +284,9 @@ def forward_fail(self, input_data) -> Any: module = ClassWithTypeSpec() output = module.forward_pass(torch.rand(2, 4)) self.assertIsInstance(output, SpecialOutputType) - self.assertRaises(ValueError, lambda: module.forward_fail(torch.rand(2, 3))) + self.assertRaises( + SpecCheckingError, lambda: module.forward_fail(torch.rand(2, 3)) + ) 
def test_convert_to_canonical_format(self): @@ -290,14 +304,17 @@ def test_convert_to_canonical_format(self): # Case: input is a Nested Mapping returned = convert_to_canonical_format( - {"foo": {"bar": TorchTensorSpec("b")}, "jar": {"tar": int, "car": None}} + { + "foo": {"bar": TensorSpec("b", framework="torch")}, + "jar": {"tar": int, "car": None}, + } ) self.assertIsInstance(returned, SpecDict) self.assertDictEqual( returned.asdict(), SpecDict( { - "foo": {"bar": TorchTensorSpec("b")}, + "foo": {"bar": TensorSpec("b", framework="torch")}, "jar": {"tar": TypeSpec(int), "car": None}, } ).asdict(), @@ -305,13 +322,21 @@ def test_convert_to_canonical_format(self): # Case: input is a SpecDict already returned = convert_to_canonical_format( - SpecDict({"foo": {"bar": TorchTensorSpec("b")}, "jar": {"tar": int}}) + SpecDict( + { + "foo": {"bar": TensorSpec("b", framework="torch")}, + "jar": {"tar": int}, + } + ) ) self.assertIsInstance(returned, SpecDict) self.assertDictEqual( returned.asdict(), SpecDict( - {"foo": {"bar": TorchTensorSpec("b")}, "jar": {"tar": TypeSpec(int)}} + { + "foo": {"bar": TensorSpec("b", framework="torch")}, + "jar": {"tar": TypeSpec(int)}, + } ).asdict(), ) diff --git a/rllib/core/models/specs/tests/test_spec_dict.py b/rllib/core/models/specs/tests/test_spec_dict.py index 941bab465e60..52f269a69357 100644 --- a/rllib/core/models/specs/tests/test_spec_dict.py +++ b/rllib/core/models/specs/tests/test_spec_dict.py @@ -1,12 +1,13 @@ import unittest import numpy as np -from ray.rllib.core.models.specs.specs_np import NPTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec from ray.rllib.core.models.specs.specs_dict import SpecDict from ray.rllib.core.models.specs.checker import ( check_input_specs, convert_to_canonical_format, ) +from ray.rllib.core.models.specs.checker import SpecCheckingError class TypeClass1: @@ -24,8 +25,8 @@ def test_basic_validation(self): h1, h2 = 3, 4 spec_1 = SpecDict( { - "out_tensor_1": 
NPTensorSpec("b, h", h=h1), - "out_tensor_2": NPTensorSpec("b, h", h=h2), + "out_tensor_1": TensorSpec("b, h", h=h1, framework="np"), + "out_tensor_2": TensorSpec("b, h", h=h2, framework="np"), "out_class_1": TypeClass1, } ) @@ -76,12 +77,12 @@ def test_basic_validation(self): spec_2 = SpecDict( { "encoder": { - "input": NPTensorSpec("b, h", h=h1), - "output": NPTensorSpec("b, h", h=h2), + "input": TensorSpec("b, h", h=h1, framework="np"), + "output": TensorSpec("b, h", h=h2, framework="np"), }, "decoder": { - "input": NPTensorSpec("b, h", h=h2), - "output": NPTensorSpec("b, h", h=h1), + "input": TensorSpec("b, h", h=h2, framework="np"), + "output": TensorSpec("b, h", h=h1, framework="np"), }, } ) @@ -160,17 +161,21 @@ def dict_key_spec_with_none_leaves(self): @property def spec_with_type_and_tensor_leaves(self): - return {"a": TypeClass1, "b": NPTensorSpec("b, h", h=3)} + return {"a": TypeClass1, "b": TensorSpec("b, h", h=3, framework="np")} - @check_input_specs("nested_key_spec") + @check_input_specs("nested_key_spec", only_check_on_retry=False) def forward_nested_key(self, input_dict): return input_dict - @check_input_specs("dict_key_spec_with_none_leaves") + @check_input_specs( + "dict_key_spec_with_none_leaves", only_check_on_retry=False + ) def forward_dict_key_with_none_leaves(self, input_dict): return input_dict - @check_input_specs("spec_with_type_and_tensor_leaves") + @check_input_specs( + "spec_with_type_and_tensor_leaves", only_check_on_retry=False + ) def forward_spec_with_type_and_tensor_leaves(self, input_dict): return input_dict @@ -207,10 +212,13 @@ def forward_spec_with_type_and_tensor_leaves(self, input_dict): }, } - self.assertRaises(ValueError, lambda: model.forward_nested_key(input_dict_2)) + self.assertRaises( + SpecCheckingError, lambda: model.forward_nested_key(input_dict_2) + ) self.assertRaises( - ValueError, lambda: model.forward_dict_key_with_none_leaves(input_dict_2) + SpecCheckingError, + lambda: 
model.forward_dict_key_with_none_leaves(input_dict_2), ) input_dict_3 = { @@ -220,7 +228,7 @@ def forward_spec_with_type_and_tensor_leaves(self, input_dict): # should raise shape mismatch self.assertRaises( - ValueError, + SpecCheckingError, lambda: model.forward_spec_with_type_and_tensor_leaves(input_dict_3), ) diff --git a/rllib/core/models/specs/tests/test_tensor_spec.py b/rllib/core/models/specs/tests/test_tensor_spec.py index 7e961d5e2045..11d005489852 100644 --- a/rllib/core/models/specs/tests/test_tensor_spec.py +++ b/rllib/core/models/specs/tests/test_tensor_spec.py @@ -1,26 +1,31 @@ import itertools import unittest -import torch import numpy as np -import tensorflow as tf +from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch from ray.rllib.utils.test_utils import check -from ray.rllib.core.models.specs.specs_np import NPTensorSpec -from ray.rllib.core.models.specs.specs_tf import TfTensorSpec -from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec -# TODO: add jax tests +_, tf, _ = try_import_tf() +torch, _ = try_import_torch() +jax, _ = try_import_jax() +jnp = jax.numpy -SPEC_CLASSES = {"torch": TorchTensorSpec, "np": NPTensorSpec, "tf": TfTensorSpec} +# This makes it so that jax does not convert 64-bit floats to 32-bit +jax.config.update("jax_enable_x64", True) + +FRAMEWORKS_TO_TEST = {"torch", "np", "tf2", "jax"} DOUBLE_TYPE = { "torch": torch.float64, "np": np.float64, - "tf": tf.float64, + "tf2": tf.float64, + "jax": jnp.float64, } FLOAT_TYPE = { "torch": torch.float32, "np": np.float32, - "tf": tf.float32, + "tf2": tf.float32, + "jax": jnp.float32, } @@ -31,82 +36,95 @@ def setUpClass(cls) -> None: def test_fill(self): - for fw in SPEC_CLASSES.keys(): - spec_class = SPEC_CLASSES[fw] + for fw in FRAMEWORKS_TO_TEST: double_type = DOUBLE_TYPE[fw] # if un-specified dims should be 1, dtype is not important - x = spec_class("b,h").fill(float(2.0)) + x = 
TensorSpec("b,h", framework=fw).fill(float(2.0)) # check the shape self.assertEqual(x.shape, (1, 1)) # check the value check(x, np.array([[2.0]])) - x = spec_class("b,h", b=2, h=3).fill(2.0) + x = TensorSpec("b,h", b=2, h=3, framework=fw).fill(2.0) self.assertEqual(x.shape, (2, 3)) - x = spec_class("b,h1,h2,h3", h1=2, h2=3, h3=3, dtype=double_type).fill(2) + x = TensorSpec( + "b,h1,h2,h3", h1=2, h2=3, h3=3, framework=fw, dtype=double_type + ).fill(2) self.assertEqual(x.shape, (1, 2, 3, 3)) self.assertEqual(x.dtype, double_type) - # def test_validation(self): - - # b, h = 2, 3 - - # for fw in SPEC_CLASSES.keys(): - # spec_class = SPEC_CLASSES[fw] - # double_type = DOUBLE_TYPE[fw] - # float_type = FLOAT_TYPE[fw] - - # tensor_2d = spec_class("b,h", b=b, h=h, dtype=double_type).fill() - - # matching_specs = [ - # spec_class("b,h"), - # spec_class("b,h", h=h), - # spec_class("b,h", h=h, b=b), - # spec_class("b,h", b=b, dtype=double_type), - # ] - - # # check if get_shape returns a tuple of ints - # shape = matching_specs[0].get_shape(tensor_2d) - # self.assertIsInstance(shape, tuple) - # self.assertTrue(all(isinstance(x, int) for x in shape)) - - # # check matching - # for spec in matching_specs: - # spec.validate(tensor_2d) - - # non_matching_specs = [ - # spec_class("b"), - # spec_class("b,h1,h2"), - # spec_class("b,h", h=h + 1), - # ] - # if fw != "jax": - # non_matching_specs.append(spec_class("b,h", dtype=float_type)) - - # for spec in non_matching_specs: - # self.assertRaises(ValueError, lambda: spec.validate(tensor_2d)) - - # # non unique dimensions - # self.assertRaises(ValueError, lambda: spec_class("b,b")) - # # unknown dimensions - # self.assertRaises(ValueError, lambda: spec_class("b,h", b=1, h=2, c=3)) - # self.assertRaises(ValueError, lambda: spec_class("b1", b2=1)) - # # zero dimensions - # self.assertRaises(ValueError, lambda: spec_class("b,h", b=1, h=0)) - # # non-integer dimension - # self.assertRaises(ValueError, lambda: spec_class("b,h", b=1, h="h")) 
+ def test_validation(self): + + b, h = 2, 3 + + for fw in FRAMEWORKS_TO_TEST: + double_type = DOUBLE_TYPE[fw] + float_type = FLOAT_TYPE[fw] + + tensor_2d = TensorSpec( + "b,h", b=b, h=h, framework=fw, dtype=double_type + ).fill() + + matching_specs = [ + TensorSpec("b,h", framework=fw), + TensorSpec("b,h", h=h, framework=fw), + TensorSpec("b,h", h=h, b=b, framework=fw), + TensorSpec("b,h", b=b, framework=fw, dtype=double_type), + ] + + # check if get_shape returns a tuple of ints + shape = matching_specs[0].get_shape(tensor_2d) + self.assertIsInstance(shape, tuple) + print(fw) + print(shape) + self.assertTrue(all(isinstance(x, int) for x in shape)) + + # check matching + for spec in matching_specs: + spec.validate(tensor_2d) + + non_matching_specs = [ + TensorSpec("b", framework=fw), + TensorSpec("b,h1,h2", framework=fw), + TensorSpec("b,h", h=h + 1, framework=fw), + ] + if fw != "jax": + non_matching_specs.append( + TensorSpec("b,h", framework=fw, dtype=float_type) + ) + + for spec in non_matching_specs: + self.assertRaises(ValueError, lambda: spec.validate(tensor_2d)) + + # non unique dimensions + self.assertRaises(ValueError, lambda: TensorSpec("b,b", framework=fw)) + # unknown dimensions + self.assertRaises( + ValueError, lambda: TensorSpec("b,h", b=1, h=2, c=3, framework=fw) + ) + self.assertRaises(ValueError, lambda: TensorSpec("b1", b2=1, framework=fw)) + # zero dimensions + self.assertRaises( + ValueError, lambda: TensorSpec("b,h", b=1, h=0, framework=fw) + ) + # non-integer dimension + self.assertRaises( + ValueError, lambda: TensorSpec("b,h", b=1, h="h", framework=fw) + ) def test_equal(self): - for fw in SPEC_CLASSES.keys(): - spec_class = SPEC_CLASSES[fw] - spec_eq_1 = spec_class("b,h", b=2, h=3) - spec_eq_2 = spec_class("b, h", b=2, h=3) - spec_eq_3 = spec_class(" b, h", b=2, h=3) - spec_neq_1 = spec_class("b, h", h=3, b=3) - spec_neq_2 = spec_class("b, h", h=3, b=3, dtype=DOUBLE_TYPE[fw]) + for fw in FRAMEWORKS_TO_TEST: + spec_eq_1 = 
TensorSpec("b,h", b=2, h=3, framework=fw) + spec_eq_2 = TensorSpec("b, h", b=2, h=3, framework=fw) + spec_eq_3 = TensorSpec(" b, h", b=2, h=3, framework=fw) + spec_neq_1 = TensorSpec("b, h", h=3, b=3, framework=fw) + spec_neq_2 = TensorSpec( + "b, h", h=3, b=3, framework=fw, dtype=DOUBLE_TYPE[fw] + ) self.assertTrue(spec_eq_1 == spec_eq_2) self.assertTrue(spec_eq_2 == spec_eq_3) @@ -114,13 +132,13 @@ def test_equal(self): self.assertTrue(spec_eq_1 != spec_neq_2) def test_type_validation(self): - - fw_keys = SPEC_CLASSES.keys() # check all combinations of spec fws with tensor fws - for spec_fw, tensor_fw in itertools.product(fw_keys, fw_keys): + for spec_fw, tensor_fw in itertools.product( + FRAMEWORKS_TO_TEST, FRAMEWORKS_TO_TEST + ): - spec = SPEC_CLASSES[spec_fw]("b, h", b=2, h=3) - tensor = SPEC_CLASSES[tensor_fw]("b, h", b=2, h=3).fill(0) + spec = TensorSpec("b, h", b=2, h=3, framework=spec_fw) + tensor = TensorSpec("b, h", b=2, h=3, framework=tensor_fw).fill(0) print("spec:", type(spec), ", tensor: ", type(tensor)) @@ -129,6 +147,71 @@ def test_type_validation(self): else: self.assertRaises(ValueError, lambda: spec.validate(tensor)) + def test_no_framework_arg(self): + """ + Test that a TensorSpec without a framework can be created and used except + for filling. + """ + spec = TensorSpec("b, h", b=2, h=3) + self.assertRaises(ValueError, lambda: spec.fill(0)) + + for fw in FRAMEWORKS_TO_TEST: + tensor = TensorSpec("b, h", b=2, h=3, framework=fw).fill(0) + spec.validate(tensor) + + def test_validate_framework(self): + """ + Test that a TensorSpec with a framework raises an error + when being used with a tensor from a different framework. 
+ """ + for spec_fw, tensor_fw in itertools.product( + FRAMEWORKS_TO_TEST, FRAMEWORKS_TO_TEST + ): + spec = TensorSpec("b, h", b=2, h=3, framework=spec_fw) + tensor = TensorSpec("b, h", b=2, h=3, framework=tensor_fw).fill(0) + if spec_fw == tensor_fw: + spec.validate(tensor) + else: + self.assertRaises(ValueError, lambda: spec.validate(tensor)) + + def test_validate_dtype(self): + """ + Test that a TensorSpec with a dtype raises an error + when being used with a tensor from a different dtype but works otherwise. + """ + + all_types = [DOUBLE_TYPE, FLOAT_TYPE] + + for spec_types, tensor_types in itertools.product(all_types, all_types): + for spec_fw, tensor_fw in itertools.product( + FRAMEWORKS_TO_TEST, FRAMEWORKS_TO_TEST + ): + + # Pick the correct types for the frameworks + spec_type = spec_types[spec_fw] + tensor_type = tensor_types[tensor_fw] + + print( + "\nTesting.." "\nspec_fw: ", + spec_fw, + "\ntensor_fw: ", + tensor_fw, + "\nspec_type: ", + spec_type, + "\ntensor_type: ", + tensor_type, + ) + + spec = TensorSpec("b, h", b=2, h=3, dtype=spec_type) + tensor = TensorSpec( + "b, h", b=2, h=3, framework=tensor_fw, dtype=tensor_type + ).fill(0) + + if spec_type != tensor_type: + self.assertRaises(ValueError, lambda: spec.validate(tensor)) + else: + spec.validate(tensor) + if __name__ == "__main__": import pytest diff --git a/rllib/core/models/tests/test_base_models.py b/rllib/core/models/tests/test_base_models.py new file mode 100644 index 000000000000..8a5ea238072a --- /dev/null +++ b/rllib/core/models/tests/test_base_models.py @@ -0,0 +1,274 @@ +import unittest +from dataclasses import dataclass + +from ray.rllib.core.models.base import ModelConfig +from ray.rllib.core.models.specs.checker import SpecCheckingError +from ray.rllib.core.models.specs.specs_base import TensorSpec +from ray.rllib.core.models.specs.specs_dict import SpecDict +from ray.rllib.core.models.tf.base import TfModel +from ray.rllib.core.models.torch.base import TorchModel +from 
ray.rllib.utils.framework import try_import_tf, try_import_torch + +_, tf, _ = try_import_tf() +torch, nn = try_import_torch() + + +def _dynamo_is_available(): + # This only works if torch._dynamo is available + try: + # TODO(Artur): Remove this once torch._dynamo is available on CI + import torch._dynamo as dynamo # noqa: F401 + + return True + except ImportError: + return False + + +class TestModelBase(unittest.TestCase): + def test_model_input_spec_checking(self): + """Tests if model input spec checking works correctly. + + This test is centered around the `always_check_shapes` flag of the + ModelConfig class. If this flag is set to True, the model will always + check if the inputs conform to the specs. If this flag is set to False, + the model will only check the input if we encounter an error inside + the forward call. + """ + + for fw in ["torch", "tf2"]: + + class CatModel: + """Simple model that concatenates parts of its input.""" + + def __init__(self, config): + super().__init__(config) + + def get_output_specs(self): + return SpecDict( + { + "out_1": TensorSpec("b, h", h=1, framework=fw), + # out_2 is simply 2x stacked in_1 + "out_2": TensorSpec("b, h", h=4, framework=fw), + } + ) + + def get_input_specs(self): + return SpecDict( + { + "in_1": TensorSpec("b, h", h=1, framework=fw), + "in_2": TensorSpec("b, h", h=2, framework=fw), + } + ) + + if fw == "tf2": + + class TestModel(CatModel, TfModel): + def _forward(self, input_dict): + out_2 = tf.concat( + [input_dict["in_2"], input_dict["in_2"]], axis=1 + ) + return {"out_1": input_dict["in_1"], "out_2": out_2} + + else: + + class TestModel(CatModel, TorchModel): + def _forward(self, input_dict): + out_2 = torch.cat( + [input_dict["in_2"], input_dict["in_2"]], dim=1 + ) + return {"out_1": input_dict["in_1"], "out_2": out_2} + + @dataclass + class CatModelConfig(ModelConfig): + def build(self, framework: str): + # Since we define the correct model above anyway, we don't need + # to distinguish between 
frameworks here. + return TestModel(self) + + # 1) Check if model behaves correctly with always_check_shapes=True first + # We expect model to raise an error if the input shapes are not correct. + # This is the behaviour we use for debugging with model specs. + + config = CatModelConfig(always_check_shapes=True) + + model = config.build(framework="spam") + + # We want to raise an input spec validation error here since the input + # consists of lists and not torch Tensors + with self.assertRaisesRegex( + SpecCheckingError, "input spec validation failed" + ): + model({"in_1": [1], "in_2": [1, 2]}) + + # We don't want to raise an input spec validation error here since the + # input consists of valid tensors + if fw == "torch": + model({"in_1": torch.Tensor([[1]]), "in_2": torch.Tensor([[1, 2]])}) + else: + model({"in_1": tf.constant([[1]]), "in_2": tf.constant([[1, 2]])}) + + # 2) Check if model behaves correctly with always_check_shapes=False. + # We don't expect model to raise an error if the input shapes are not + # correct. + # This is the more performant default behaviour + + config = CatModelConfig(always_check_shapes=False) + + model = config.build(framework="spam") + + # This should not raise an error since the specs are correct and the + # model does not raise an error either. + if fw == "torch": + model({"in_1": torch.Tensor([[1]]), "in_2": torch.Tensor([[1, 2]])}) + else: + model({"in_1": tf.constant([[1]]), "in_2": tf.constant([[1, 2]])}) + + # This should not raise an error since specs would be violated, but they + # are not checked and the model does not raise an error. + if fw == "torch": + model( + {"in_1": torch.Tensor([[1]]), "in_2": torch.Tensor([[1, 2, 3, 4]])} + ) + else: + model({"in_1": tf.constant([[1]]), "in_2": tf.constant([[1, 2, 3, 4]])}) + + # We want to raise an input spec validation error here since the model + # raises an exception that stems from inputs that could have been caught + # with input spec checking. 
+ with self.assertRaisesRegex( + SpecCheckingError, "input spec validation failed" + ): + model({"in_1": [1], "in_2": [1, 2]}) + + def test_model_output_spec_checking(self): + """Tests if model output spec checking works correctly. + + This test is centered around the `always_check_shapes` flag of the + ModelConfig class. If this flag is set to True, the model will always + check if the outputs conform to the specs. If this flag is set to False, + the model will never check the outputs. + """ + + for fw in ["torch", "tf2"]: + + class BadModel: + """Simple model that produces bad outputs.""" + + def get_output_specs(self): + return SpecDict( + { + "out": TensorSpec("b, h", h=1), + } + ) + + def get_input_specs(self): + return SpecDict( + { + "in": TensorSpec("b, h", h=1), + } + ) + + if fw == "tf2": + + class TestModel(BadModel, TfModel): + def _forward(self, input_dict): + return {"out": torch.Tensor([[1, 2]])} + + else: + + class TestModel(BadModel, TfModel): + def _forward(self, input_dict): + return {"out": tf.constant([[1, 2]])} + + @dataclass + class CatModelConfig(ModelConfig): + def build(self, framework: str): + # Since we define the correct model above anyway, we don't need + # to distinguish between frameworks here. + return TestModel(self) + + # 1) Check if model behaves correctly with always_check_shapes=True first. + # We expect model to raise an error if the output shapes are not correct. + # This is the behaviour we use for debugging with model specs. + + config = CatModelConfig(always_check_shapes=True) + + model = config.build(framework="spam") + + # We want to raise an output spec validation error here since the output + # has the wrong shape + with self.assertRaisesRegex( + SpecCheckingError, "output spec validation failed" + ): + model({"in": torch.Tensor([[1]])}) + + # 2) Check if model behaves correctly with always_check_shapes=False. + # We don't expect model to raise an error. 
+ # This is the more performant default behaviour + + config = CatModelConfig(always_check_shapes=False) + + model = config.build(framework="spam") + + model({"in_1": [[1]]}) + + @unittest.skipIf(not _dynamo_is_available(), "torch._dynamo not available") + def test_torch_compile_with_model(self): + """Tests if torch.compile() does not encounter any breaks. + + torch.compile() should not encounter any breaks when model is on its + code path by default. This test checks if this is the case. + """ + + class SomeTorchModel(TorchModel): + """Simple model that produces bad outputs.""" + + def __init__(self, config): + super().__init__(config) + self._model = torch.nn.Linear(1, 1) + + def get_output_specs(self): + return SpecDict( + { + "out": TensorSpec("b, h", h=1, framework="torch"), + } + ) + + def get_input_specs(self): + return SpecDict( + { + "in": TensorSpec("b, h", h=1, framework="torch"), + } + ) + + def _forward(self, input_dict): + return {"out": self._model(input_dict["in"])} + + @dataclass + class SomeTorchModelConfig(ModelConfig): + def build(self, framework: str): + return SomeTorchModel(self) + + config = SomeTorchModelConfig() + + model = config.build(framework="spam") + + # This could be the forward method of an RL Module that we torch compile + def compile_me(input_dict): + return model(input_dict) + + import torch._dynamo as dynamo + + dynamo_explanation = dynamo.explain(compile_me, {"in": torch.Tensor([[1]])}) + + # There should be only one break reason - `return_value` - since inputs and + # outputs are not checked + break_reasons_list = dynamo_explanation[4] + self.assertEquals(len(break_reasons_list), 1) + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index c2454320d791..9d17766ee8c4 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -83,12 +83,13 
@@ def _check_model_outputs(self, model, framework, model_config_dict, input_space) outputs = model(inputs) self.assertEqual(outputs[ENCODER_OUT].shape, (32, latent_dim)) - tree.map_structure_with_path( - lambda p, v: ( - self.assertEqual(v.shape, states[p].shape) if v is not None else True - ), - outputs[STATE_OUT], - ) + if STATE_OUT in outputs: + tree.map_structure_with_path( + lambda p, v: ( + True if v is None else self.assertEqual(v.shape, states[p].shape) + ), + outputs[STATE_OUT], + ) def test_get_encoder_config(self): """Tests if we can create a bunch of encoders from the base catalog class.""" @@ -384,6 +385,7 @@ def build_vf_head(self, framework): _enable_rl_module_api=True, rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyCatalog), ) + .training(_enable_learner_api=True) .framework("torch") ) @@ -424,11 +426,11 @@ def test_post_init_overwrite(self): class MyCostumTorchEncoderConfig(ModelConfig): def build(self, framework): - return MyCostumTorchEncoder() + return MyCostumTorchEncoder(self) class MyCostumTorchEncoder(TorchModel, Encoder): - def __init__(self): - super().__init__({}) + def __init__(self, config): + super().__init__(config) self.net = torch.nn.Linear(env.observation_space.shape[0], 10) def _forward(self, input_dict, **kwargs): diff --git a/rllib/core/models/tests/test_cnn_encoders.py b/rllib/core/models/tests/test_cnn_encoders.py index fabff92ecfd2..3276de3ec51d 100644 --- a/rllib/core/models/tests/test_cnn_encoders.py +++ b/rllib/core/models/tests/test_cnn_encoders.py @@ -1,7 +1,7 @@ import itertools import unittest -from ray.rllib.core.models.base import ENCODER_OUT, STATE_OUT +from ray.rllib.core.models.base import ENCODER_OUT from ray.rllib.core.models.configs import CNNEncoderConfig from ray.rllib.models.utils import get_filter_config from ray.rllib.utils.framework import try_import_tf, try_import_torch @@ -15,7 +15,7 @@ class TestCNNEncoders(unittest.TestCase): def test_cnn_encoders(self): """Tests building CNN encoders 
properly and checks for correct architecture.""" - # Loop through different combinations of hyperparameters. + # Loop through permutations of hyperparameters. inputs_dimss = [ [480, 640, 3], [480, 640, 1], @@ -85,7 +85,6 @@ def test_cnn_encoders(self): # Add this framework version of the model to our checker. outputs = model_checker.add(framework=fw) self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims[0])) - self.assertEqual(outputs[STATE_OUT], None) # Check all added models against each other. model_checker.check() diff --git a/rllib/core/models/tests/test_mlp_encoders.py b/rllib/core/models/tests/test_mlp_encoders.py index 26fd1430a56d..41a3207aa0bb 100644 --- a/rllib/core/models/tests/test_mlp_encoders.py +++ b/rllib/core/models/tests/test_mlp_encoders.py @@ -2,7 +2,7 @@ import unittest from ray.rllib.core.models.configs import MLPEncoderConfig -from ray.rllib.core.models.base import STATE_OUT, ENCODER_OUT +from ray.rllib.core.models.base import ENCODER_OUT from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.test_utils import framework_iterator, ModelChecker @@ -71,7 +71,6 @@ def test_mlp_encoders(self): # Add this framework version of the model to our checker. outputs = model_checker.add(framework=fw) self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims[0])) - self.assertEqual(outputs[STATE_OUT], None) # Check all added models against each other. 
model_checker.check() diff --git a/rllib/core/models/tf/base.py b/rllib/core/models/tf/base.py index e99630ab4fad..dc88a005ce1f 100644 --- a/rllib/core/models/tf/base.py +++ b/rllib/core/models/tf/base.py @@ -1,4 +1,5 @@ import abc +import logging from typing import Tuple import numpy as np @@ -6,18 +7,18 @@ from ray.rllib.core.models.base import ( Model, ModelConfig, - _raise_not_decorated_exception, ) from ray.rllib.core.models.specs.checker import ( check_input_specs, - check_output_specs, is_input_decorated, is_output_decorated, + check_output_specs, ) from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.nested_dict import NestedDict +from ray.util import log_once +logger = logging.getLogger(__name__) _, tf, _ = try_import_tf() @@ -33,15 +34,26 @@ def __init__(self, config: ModelConfig): tf.keras.Model.__init__(self) Model.__init__(self, config) - # Raise errors if forward method is not decorated to check specs. + # Raise errors if forward method is not decorated to check input specs. if not is_input_decorated(self.call): - _raise_not_decorated_exception(type(self).__name__ + ".call()", "input") - if not is_output_decorated(self.call): - _raise_not_decorated_exception(type(self).__name__ + ".call()", "output") + raise ValueError( + f"`{type(self).__name__}.call()` not decorated with input " + f"specification. Decorate it with @check_input_specs() to define a " + f"specification and resolve this Error. If you don't want to check " + f"anything, you can use an empty spec." + ) + + if is_output_decorated(self.call): + if log_once("tf_model_forward_output_decorated"): + logger.warning( + f"`{type(self).__name__}.call()` decorated with output " + f"specification. This is not recommended because it can lead to " + f"slower execution. Remove @check_output_specs() from the " + f"forward method to resolve this." 
+ ) @check_input_specs("input_specs") - @check_output_specs("output_specs") - def call(self, input_dict: NestedDict, **kwargs) -> NestedDict: + def call(self, input_dict: dict, **kwargs) -> dict: """Returns the output of this model for the given input. This method only makes sure that we have a spec-checked _forward() method. @@ -51,8 +63,21 @@ def call(self, input_dict: NestedDict, **kwargs) -> NestedDict: **kwargs: Forward compatibility kwargs. Returns: - NestedDict: The output tensors. + dict: The output tensors. """ + + # When `always_check_shapes` is set, we always check input and output specs. + # Note that we check the input specs twice because we need the following + # check to always check the input specs. + if self.config.always_check_shapes: + + @check_input_specs("input_specs", only_check_on_retry=False) + @check_output_specs("output_specs") + def checked_forward(self, input_data, **kwargs): + return self._forward(input_data, **kwargs) + + return checked_forward(self, input_dict, **kwargs) + return self._forward(input_dict, **kwargs) @override(Model) diff --git a/rllib/core/models/tf/encoder.py b/rllib/core/models/tf/encoder.py index 8f739934dcb0..232b59c2acb0 100644 --- a/rllib/core/models/tf/encoder.py +++ b/rllib/core/models/tf/encoder.py @@ -20,7 +20,7 @@ from ray.rllib.core.models.tf.primitives import TfMLP, TfCNN from ray.rllib.core.models.specs.specs_base import Spec from ray.rllib.core.models.specs.specs_dict import SpecDict -from ray.rllib.core.models.specs.specs_tf import TfTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec from ray.rllib.models.utils import get_activation_fn from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override @@ -47,7 +47,10 @@ def __init__(self, config: CNNEncoderConfig) -> None: TfModel.__init__(self, config) Encoder.__init__(self, config) - layers = [] + # Add an input layer for the Sequential, created below. 
This is really + # important to be able to derive the model's trainable_variables early on + # (inside our Learners). + layers = [tf.keras.layers.Input(shape=config.input_dims)] # The bare-bones CNN (no flatten, no succeeding dense). cnn = TfCNN( input_dims=config.input_dims, @@ -75,14 +78,13 @@ def __init__(self, config: CNNEncoderConfig) -> None: def get_input_specs(self) -> Optional[Spec]: return SpecDict( { - SampleBatch.OBS: TfTensorSpec( + SampleBatch.OBS: TensorSpec( "b, w, h, c", w=self.config.input_dims[0], h=self.config.input_dims[1], c=self.config.input_dims[2], + framework="tf2", ), - STATE_IN: None, - SampleBatch.SEQ_LENS: None, } ) @@ -90,19 +92,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TfTensorSpec("b, d", d=self.config.output_dims[0]), - STATE_OUT: None, + ENCODER_OUT: TensorSpec( + "b, d", d=self.config.output_dims[0], framework="tf2" + ), } ) @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - return NestedDict( - { - ENCODER_OUT: self.net(inputs[SampleBatch.OBS]), - STATE_OUT: inputs[STATE_IN], - } - ) + def _forward(self, inputs: dict, **kwargs) -> dict: + return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} class TfMLPEncoder(Encoder, TfModel): @@ -125,9 +123,9 @@ def __init__(self, config: MLPEncoderConfig) -> None: def get_input_specs(self) -> Optional[Spec]: return SpecDict( { - SampleBatch.OBS: TfTensorSpec("b, d", d=self.config.input_dims[0]), - # STATE_IN: None, - # SampleBatch.SEQ_LENS: None, + SampleBatch.OBS: TensorSpec( + "b, d", d=self.config.input_dims[0], framework="tf2" + ), } ) @@ -135,19 +133,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TfTensorSpec("b, d", d=self.config.output_dims[0]), - STATE_OUT: None, + ENCODER_OUT: TensorSpec( + "b, d", d=self.config.output_dims[0], framework="tf2" + ), } ) @override(Model) def 
_forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - return NestedDict( - { - ENCODER_OUT: self.net(inputs[SampleBatch.OBS]), - STATE_OUT: None, # inputs[STATE_IN], - } - ) + return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} class TfGRUEncoder(TfModel, Encoder): @@ -180,10 +174,15 @@ def get_input_specs(self) -> Optional[Spec]: return SpecDict( { # b, t for batch major; t, b for time major. - SampleBatch.OBS: TfTensorSpec("b, t, d", d=self.config.input_dims[0]), + SampleBatch.OBS: TensorSpec( + "b, t, d", d=self.config.input_dims[0], framework="tf2" + ), STATE_IN: { - "h": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), }, } @@ -193,10 +192,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TfTensorSpec("b, t, d", d=self.config.output_dims[0]), + ENCODER_OUT: TensorSpec( + "b, t, d", d=self.config.output_dims[0], framework="tf2" + ), STATE_OUT: { - "h": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), }, } @@ -210,6 +214,9 @@ def get_initial_state(self): @override(Model) def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: + outputs = {} + + # Calculate the output and state of the GRU. out = tf.cast(inputs[SampleBatch.OBS], tf.float32) # States are batch-first when coming in. Make them layers-first. @@ -225,11 +232,10 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: out = self.linear(out) - return { - ENCODER_OUT: out, - # Make state_out batch-first. - STATE_OUT: {"h": tf.stack(states_out, 1)}, - } + # Insert them into the output dict. 
+ outputs[ENCODER_OUT] = out + outputs[STATE_OUT] = {"h": tf.stack(states_out, 1)} + return outputs class TfLSTMEncoder(TfModel, Encoder): @@ -262,13 +268,21 @@ def get_input_specs(self) -> Optional[Spec]: return SpecDict( { # b, t for batch major; t, b for time major. - SampleBatch.OBS: TfTensorSpec("b, t, d", d=self.config.input_dims[0]), + SampleBatch.OBS: TensorSpec( + "b, t, d", d=self.config.input_dims[0], framework="tf2" + ), STATE_IN: { - "h": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), - "c": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "c": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), }, } @@ -278,13 +292,21 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TfTensorSpec("b, t, d", d=self.config.output_dims[0]), + ENCODER_OUT: TensorSpec( + "b, t, d", d=self.config.output_dims[0], framework="tf2" + ), STATE_OUT: { - "h": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), - "c": TfTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "c": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="tf2", ), }, } @@ -299,6 +321,9 @@ def get_initial_state(self): @override(Model) def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: + outputs = {} + + # Calculate the output and state of the LSTM. out = tf.cast(inputs[SampleBatch.OBS], tf.float32) # States are batch-first when coming in. Make them layers-first. 
@@ -316,8 +341,10 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: out = self.linear(out) - return { - ENCODER_OUT: out, - # Make state_out batch-first. - STATE_OUT: {"h": tf.stack(states_out_h, 1), "c": tf.stack(states_out_c, 1)}, + # Insert them into the output dict. + outputs[ENCODER_OUT] = out + outputs[STATE_OUT] = { + "h": tf.stack(states_out_h, 1), + "c": tf.stack(states_out_c, 1), } + return outputs diff --git a/rllib/core/models/tf/heads.py b/rllib/core/models/tf/heads.py index e4fd7fe289a6..2b9c78824b10 100644 --- a/rllib/core/models/tf/heads.py +++ b/rllib/core/models/tf/heads.py @@ -9,7 +9,7 @@ MLPHeadConfig, ) from ray.rllib.core.models.specs.specs_base import Spec -from ray.rllib.core.models.specs.specs_tf import TfTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec from ray.rllib.core.models.tf.base import TfModel from ray.rllib.core.models.tf.primitives import TfCNNTranspose, TfMLP from ray.rllib.utils import try_import_tf @@ -34,11 +34,11 @@ def __init__(self, config: MLPHeadConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - return TfTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="tf2") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TfTensorSpec("b, d", d=self.config.output_dims[0]) + return TensorSpec("b, d", d=self.config.output_dims[0], framework="tf2") @override(Model) def _forward(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: @@ -73,11 +73,11 @@ def __init__(self, config: FreeLogStdMLPHeadConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - return TfTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="tf2") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TfTensorSpec("b, d", d=self.config.output_dims[0]) + return TensorSpec("b, d", d=self.config.output_dims[0], 
framework="tf2") @override(Model) def _forward(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: @@ -112,15 +112,16 @@ def __init__(self, config: CNNTransposeHeadConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - return TfTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="tf2") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TfTensorSpec( + return TensorSpec( "b, w, h, c", w=self.config.output_dims[0], h=self.config.output_dims[1], c=self.config.output_dims[2], + framework="tf2", ) @override(Model) diff --git a/rllib/core/models/tf/primitives.py b/rllib/core/models/tf/primitives.py index a36ae7a80463..816f837adaf6 100644 --- a/rllib/core/models/tf/primitives.py +++ b/rllib/core/models/tf/primitives.py @@ -24,9 +24,9 @@ def __init__( input_dim: int, hidden_layer_dims: List[int], hidden_layer_use_layernorm: bool = False, - hidden_layer_activation: Union[str, Callable] = "relu", + hidden_layer_activation: Optional[Union[str, Callable]] = "relu", output_dim: Optional[int] = None, - output_activation: Union[str, Callable] = "linear", + output_activation: Optional[Union[str, Callable]] = "linear", use_bias: bool = True, ): """Initialize a TfMLP object. @@ -112,7 +112,7 @@ def __init__( input_dims: Union[List[int], Tuple[int]], cnn_filter_specifiers: List[List[Union[int, List]]], cnn_use_layernorm: bool = False, - cnn_activation: str = "relu", + cnn_activation: Optional[str] = "relu", use_bias: bool = True, ): """Initializes a TfCNN instance. 
@@ -188,7 +188,7 @@ def __init__( *, input_dims: Union[List[int], Tuple[int]], cnn_transpose_filter_specifiers: List[List[Union[int, List]]], - cnn_transpose_activation: str = "relu", + cnn_transpose_activation: Optional[str] = "relu", cnn_transpose_use_layernorm: bool = False, use_bias: bool = True, ): diff --git a/rllib/core/models/torch/base.py b/rllib/core/models/torch/base.py index eefcf8ad2b42..775bc6116288 100644 --- a/rllib/core/models/torch/base.py +++ b/rllib/core/models/torch/base.py @@ -1,4 +1,5 @@ import abc +import logging from typing import Tuple, Union import numpy as np @@ -6,7 +7,6 @@ from ray.rllib.core.models.base import ( Model, ModelConfig, - _raise_not_decorated_exception, ) from ray.rllib.core.models.specs.checker import ( is_input_decorated, @@ -16,11 +16,13 @@ ) from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import TensorType +from ray.util import log_once torch, nn = try_import_torch() +logger = logging.getLogger(__name__) + class TorchModel(nn.Module, Model, abc.ABC): """Base class for RLlib's PyTorch models. @@ -69,17 +71,30 @@ def __init__(self, config: ModelConfig): nn.Module.__init__(self) Model.__init__(self, config) - # Raise errors if forward method is not decorated to check specs. + # Raise errors if forward method is not decorated to check input specs. if not is_input_decorated(self.forward): - _raise_not_decorated_exception(type(self).__name__ + ".forward()", "input") - if not is_output_decorated(self.forward): - _raise_not_decorated_exception(type(self).__name__ + ".forward()", "output") + raise ValueError( + f"`{type(self).__name__}.forward()` not decorated with input " + f"specification. Decorate it with @check_input_specs() to define a " + f"specification and resolve this Error. If you don't want to check " + f"anything, you can use an empty spec." 
+ ) + + if is_output_decorated(self.forward): + if log_once("torch_model_forward_output_decorated"): + logger.warning( + f"`{type(self).__name__}.forward()` decorated with output " + f"specification. This is not recommended for torch models " + f"that are used with torch.compile() because it breaks " + f"torch dynamo's graph. This can lead lead to slower execution." + f"Remove @check_output_specs() from the forward() method to " + f"resolve this." + ) @check_input_specs("input_specs") - @check_output_specs("output_specs") def forward( - self, inputs: Union[NestedDict, TensorType], **kwargs - ) -> Union[NestedDict, TensorType]: + self, inputs: Union[dict, TensorType], **kwargs + ) -> Union[dict, TensorType]: """Returns the output of this model for the given input. This method only makes sure that we have a spec-checked _forward() method. @@ -89,8 +104,21 @@ def forward( **kwargs: Forward compatibility kwargs. Returns: - NestedDict: The output tensors. + dict: The output tensors. """ + + # When `always_check_shapes` is set, we always check input and output specs. + # Note that we check the input specs twice because we need the following + # check to always check the input specs. 
+ if self.config.always_check_shapes: + + @check_input_specs("input_specs", only_check_on_retry=False) + @check_output_specs("output_specs") + def checked_forward(self, input_data, **kwargs): + return self._forward(input_data, **kwargs) + + return checked_forward(self, inputs, **kwargs) + return self._forward(inputs, **kwargs) @override(Model) diff --git a/rllib/core/models/torch/encoder.py b/rllib/core/models/torch/encoder.py index 8fbd1c70c03b..61aa6fb3d51e 100644 --- a/rllib/core/models/torch/encoder.py +++ b/rllib/core/models/torch/encoder.py @@ -20,12 +20,11 @@ from ray.rllib.core.models.torch.primitives import TorchMLP, TorchCNN from ray.rllib.core.models.specs.specs_base import Spec from ray.rllib.core.models.specs.specs_dict import SpecDict -from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec from ray.rllib.models.utils import get_activation_fn from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.nested_dict import NestedDict torch, nn = try_import_torch() @@ -60,9 +59,9 @@ def __init__(self, config: MLPEncoderConfig) -> None: def get_input_specs(self) -> Optional[Spec]: return SpecDict( { - SampleBatch.OBS: TorchTensorSpec("b, d", d=self.config.input_dims[0]), - STATE_IN: None, - SampleBatch.SEQ_LENS: None, + SampleBatch.OBS: TensorSpec( + "b, d", d=self.config.input_dims[0], framework="torch" + ), } ) @@ -70,19 +69,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, d", d=self.config.output_dims[0]), - STATE_OUT: None, + ENCODER_OUT: TensorSpec( + "b, d", d=self.config.output_dims[0], framework="torch" + ), } ) @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - return NestedDict( - { - ENCODER_OUT: 
self.net(inputs[SampleBatch.OBS]), - STATE_OUT: inputs[STATE_IN], - } - ) + def _forward(self, inputs: dict, **kwargs) -> dict: + return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} class TorchCNNEncoder(TorchModel, Encoder): @@ -125,14 +120,13 @@ def __init__(self, config: CNNEncoderConfig) -> None: def get_input_specs(self) -> Optional[Spec]: return SpecDict( { - SampleBatch.OBS: TorchTensorSpec( + SampleBatch.OBS: TensorSpec( "b, w, h, c", w=self.config.input_dims[0], h=self.config.input_dims[1], c=self.config.input_dims[2], + framework="torch", ), - STATE_IN: None, - SampleBatch.SEQ_LENS: None, } ) @@ -140,19 +134,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, d", d=self.config.output_dims[0]), - STATE_OUT: None, + ENCODER_OUT: TensorSpec( + "b, d", d=self.config.output_dims[0], framework="torch" + ), } ) @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - return NestedDict( - { - ENCODER_OUT: self.net(inputs[SampleBatch.OBS]), - STATE_OUT: inputs[STATE_IN], - } - ) + def _forward(self, inputs: dict, **kwargs) -> dict: + return {ENCODER_OUT: self.net(inputs[SampleBatch.OBS])} class TorchGRUEncoder(TorchModel, Encoder): @@ -181,12 +171,17 @@ def get_input_specs(self) -> Optional[Spec]: return SpecDict( { # b, t for batch major; t, b for time major. 
- SampleBatch.OBS: TorchTensorSpec( - "b, t, d", d=self.config.input_dims[0] + SampleBatch.OBS: TensorSpec( + "b, t, d", + d=self.config.input_dims[0], + framework="torch", ), STATE_IN: { - "h": TorchTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="torch", ), }, } @@ -196,10 +191,15 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, t, d", d=self.config.output_dims[0]), + ENCODER_OUT: TensorSpec( + "b, t, d", d=self.config.output_dims[0], framework="torch" + ), STATE_OUT: { - "h": TorchTensorSpec( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers + "h": TensorSpec( + "b, l, h", + h=self.config.hidden_dim, + l=self.config.num_layers, + framework="torch", ), }, } @@ -212,7 +212,10 @@ def get_initial_state(self): } @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: + def _forward(self, inputs: dict, **kwargs) -> dict: + outputs = {} + + # Calculate the output and state of the GRU. out = inputs[SampleBatch.OBS].float() # States are batch-first when coming in. Make them layers-first. @@ -223,11 +226,10 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: out = self.linear(out) - return { - ENCODER_OUT: out, - # Make states layer-first again. - STATE_OUT: tree.map_structure(lambda s: s.transpose(0, 1), states_out), - } + # Insert them into the output dict. + outputs[ENCODER_OUT] = out + outputs[STATE_OUT] = tree.map_structure(lambda s: s.transpose(0, 1), states_out) + return outputs class TorchLSTMEncoder(TorchModel, Encoder): @@ -257,19 +259,21 @@ def get_input_specs(self) -> Optional[Spec]: return SpecDict( { # b, t for batch major; t, b for time major. 
- SampleBatch.OBS: TorchTensorSpec( - "b, t, d", d=self.config.input_dims[0] + SampleBatch.OBS: TensorSpec( + "b, t, d", d=self.config.input_dims[0], framework="torch" ), STATE_IN: { - "h": TorchTensorSpec( + "h": TensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers, + framework="torch", ), - "c": TorchTensorSpec( + "c": TensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers, + framework="torch", ), }, } @@ -279,17 +283,21 @@ def get_input_specs(self) -> Optional[Spec]: def get_output_specs(self) -> Optional[Spec]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, t, d", d=self.config.output_dims[0]), + ENCODER_OUT: TensorSpec( + "b, t, d", d=self.config.output_dims[0], framework="torch" + ), STATE_OUT: { - "h": TorchTensorSpec( + "h": TensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers, + framework="torch", ), - "c": TorchTensorSpec( + "c": TensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers, + framework="torch", ), }, } @@ -303,7 +311,10 @@ def get_initial_state(self): } @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: + def _forward(self, inputs: dict, **kwargs) -> dict: + outputs = {} + + # Calculate the output and state of the LSTM cell. out = inputs[SampleBatch.OBS].float() # States are batch-first when coming in. Make them layers-first. @@ -314,8 +325,7 @@ def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: out = self.linear(out) - return { - ENCODER_OUT: out, - # Make states layer-first again. - STATE_OUT: tree.map_structure(lambda s: s.transpose(0, 1), states_out), - } + # Insert them into the output dict. 
+ outputs[ENCODER_OUT] = out + outputs[STATE_OUT] = tree.map_structure(lambda s: s.transpose(0, 1), states_out) + return outputs diff --git a/rllib/core/models/torch/heads.py b/rllib/core/models/torch/heads.py index f9de4040f8c0..81fbad04dedc 100644 --- a/rllib/core/models/torch/heads.py +++ b/rllib/core/models/torch/heads.py @@ -9,7 +9,7 @@ MLPHeadConfig, ) from ray.rllib.core.models.specs.specs_base import Spec -from ray.rllib.core.models.specs.specs_torch import TorchTensorSpec +from ray.rllib.core.models.specs.specs_base import TensorSpec from ray.rllib.core.models.torch.base import TorchModel from ray.rllib.core.models.torch.primitives import TorchCNNTranspose, TorchMLP from ray.rllib.utils.annotations import override @@ -34,11 +34,11 @@ def __init__(self, config: MLPHeadConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - return TorchTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="torch") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TorchTensorSpec("b, d", d=self.config.output_dims[0]) + return TensorSpec("b, d", d=self.config.output_dims[0], framework="torch") @override(Model) def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: @@ -70,11 +70,11 @@ def __init__(self, config: FreeLogStdMLPHeadConfig) -> None: @override(Model) def get_input_specs(self) -> Optional[Spec]: - return TorchTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="torch") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TorchTensorSpec("b, d", d=self.config.output_dims[0]) + return TensorSpec("b, d", d=self.config.output_dims[0], framework="torch") @override(Model) def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: @@ -110,15 +110,16 @@ def __init__(self, config: CNNTransposeHeadConfig) -> None: @override(Model) def get_input_specs(self) -> 
Optional[Spec]: - return TorchTensorSpec("b, d", d=self.config.input_dims[0]) + return TensorSpec("b, d", d=self.config.input_dims[0], framework="torch") @override(Model) def get_output_specs(self) -> Optional[Spec]: - return TorchTensorSpec( + return TensorSpec( "b, w, h, c", w=self.config.output_dims[0], h=self.config.output_dims[1], c=self.config.output_dims[2], + framework="torch", ) @override(Model) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 905865f52d57..855a0151c8fd 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -186,7 +186,7 @@ def to_dict(self): """ catalog_class_path = ( - serialize_type(type(self.catalog_class)) if self.catalog_class else "" + serialize_type(self.catalog_class) if self.catalog_class else "" ) return { "observation_space": gym_space_to_dict(self.observation_space), @@ -329,6 +329,48 @@ def setup(self): abstraction can be used to create any component that your RLModule needs. """ + def get_train_action_dist_cls(self) -> Type[Distribution]: + """Returns the action distribution class for this RLModule used for training. + + This class is used to create action distributions from outputs of the + forward_train method. If the case that no action distribution class is needed, + this method can return None. + + Note that RLlib's distribution classes all implement the `Distribution` + interface. This requires two special methods: `Distribution.from_logits()` and + `Distribution.to_deterministic()`. See the documentation for `Distribution` + for more detail. + """ + raise NotImplementedError + + def get_exploration_action_dist_cls(self) -> Type[Distribution]: + """Returns the action distribution class for this RLModule used for exploration. + + This class is used to create action distributions from outputs of the + forward_exploration method. If the case that no action distribution class is + needed, this method can return None. 
+ + Note that RLlib's distribution classes all implement the `Distribution` + interface. This requires two special methods: `Distribution.from_logits()` and + `Distribution.to_deterministic()`. See the documentation for `Distribution` + for more detail. + """ + raise NotImplementedError + + def get_inference_action_dist_cls(self) -> Type[Distribution]: + """Returns the action distribution class for this RLModule used for inference. + + This class is used to create action distributions from outputs of the forward + inference method. If the case that no action distribution class is needed, + this method can return None. + + Note that RLlib's distribution classes all implement the `Distribution` + interface. This requires two special methods: `Distribution.from_logits()` and + `Distribution.to_deterministic()`. See the documentation for `Distribution` + for more detail. + """ + raise NotImplementedError + def get_initial_state(self) -> NestedDict: """Returns the initial state of the module. @@ -381,8 +423,10 @@ def _default_input_specs(self) -> SpecType: @check_input_specs("_input_specs_inference") @check_output_specs("_output_specs_inference") def forward_inference(self, batch: SampleBatchType, **kwargs) -> Mapping[str, Any]: - """Forward-pass during evaluation, called from the sampler. This method should - not be overriden. Instead, override the _forward_inference method. + """Forward-pass during evaluation, called from the sampler. + + This method should not be overriden to implement a custom forward inference + method. Instead, override the _forward_inference method. Args: batch: The input batch. This input batch should comply with @@ -404,8 +448,10 @@ def _forward_inference(self, batch: NestedDict, **kwargs) -> Mapping[str, Any]: def forward_exploration( self, batch: SampleBatchType, **kwargs ) -> Mapping[str, Any]: - """Forward-pass during exploration, called from the sampler. This method should - not be overriden. 
Instead, override the _forward_exploration method. + """Forward-pass during exploration, called from the sampler. + + This method should not be overriden to implement a custom forward exploration + method. Instead, override the _forward_exploration method. Args: batch: The input batch. This input batch should comply with diff --git a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py index 98fe5e71765f..c95d4cff96eb 100644 --- a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py +++ b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py @@ -1,13 +1,14 @@ -import gymnasium as gym -import tensorflow as tf -import tensorflow_probability as tfp import tempfile -from typing import Mapping import unittest +from typing import Mapping + +import gymnasium as gym +import tensorflow as tf from ray.rllib.core.rl_module.rl_module import RLModuleConfig from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.test_utils import check @@ -46,11 +47,13 @@ def test_forward_train(self): ) with tf.GradientTape() as tape: output = module.forward_train({"obs": obs}) - loss = -tf.math.reduce_mean(output["action_dist"].log_prob(actions)) + action_dist_class = module.get_train_action_dist_cls() + action_dist = action_dist_class.from_logits( + output[SampleBatch.ACTION_DIST_INPUTS] + ) + loss = -tf.math.reduce_mean(action_dist.logp(actions)) self.assertIsInstance(output, Mapping) - self.assertIn("action_dist", output) - self.assertIsInstance(output["action_dist"], tfp.distributions.Categorical) grads = tape.gradient(loss, module.trainable_variables) diff --git a/rllib/core/rl_module/tf/tf_rl_module.py b/rllib/core/rl_module/tf/tf_rl_module.py index 15861488dfcd..227cdd5a7990 100644 --- a/rllib/core/rl_module/tf/tf_rl_module.py +++ b/rllib/core/rl_module/tf/tf_rl_module.py @@ -5,7 +5,6 @@ from 
ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf - _, tf, _ = try_import_tf() diff --git a/rllib/core/rl_module/torch/tests/test_torch_rl_module.py b/rllib/core/rl_module/torch/tests/test_torch_rl_module.py index 8393add6debc..e359e5e09dd6 100644 --- a/rllib/core/rl_module/torch/tests/test_torch_rl_module.py +++ b/rllib/core/rl_module/torch/tests/test_torch_rl_module.py @@ -1,12 +1,14 @@ -import gymnasium as gym import tempfile -import torch -from typing import Mapping import unittest +from typing import Mapping + +import gymnasium as gym +import torch from ray.rllib.core.rl_module.rl_module import RLModuleConfig from ray.rllib.core.rl_module.torch import TorchRLModule from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.test_utils import check @@ -44,10 +46,13 @@ def test_forward_train(self): output = module.forward_train({"obs": obs}) self.assertIsInstance(output, Mapping) - self.assertIn("action_dist", output) - self.assertIsInstance(output["action_dist"], torch.distributions.Categorical) + self.assertIn(SampleBatch.ACTION_DIST_INPUTS, output) + + action_dist_inputs = output[SampleBatch.ACTION_DIST_INPUTS] + action_dist_class = module.get_train_action_dist_cls() + action_dist = action_dist_class.from_logits(action_dist_inputs) - loss = -output["action_dist"].log_prob(actions.view(-1)).mean() + loss = -action_dist.logp(actions.view(-1)).mean() loss.backward() # check that all neural net parameters have gradients diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index ce332ce6cf03..152a26534f1f 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -1,9 +1,14 @@ import pathlib -from typing import Any, Mapping, Union +from typing import Any, List, Mapping, Tuple, Union, Type +from 
ray.rllib.core.rl_module.rl_module_with_target_networks_interface import ( + RLModuleWithTargetNetworksInterface, +) from ray.rllib.core.rl_module import RLModule +from ray.rllib.models.distributions import Distribution from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.typing import NetworkType torch, nn = try_import_torch() @@ -50,6 +55,15 @@ def __init__(self, *args, **kwargs) -> None: # the interface of that base-class not the actual implementation. self.config = self.unwrapped().config + def get_train_action_dist_cls(self, *args, **kwargs) -> Type[Distribution]: + return self.unwrapped().get_train_action_dist_cls(*args, **kwargs) + + def get_exploration_action_dist_cls(self, *args, **kwargs) -> Type[Distribution]: + return self.unwrapped().get_exploration_action_dist_cls(*args, **kwargs) + + def get_inference_action_dist_cls(self, *args, **kwargs) -> Type[Distribution]: + return self.unwrapped().get_inference_action_dist_cls(*args, **kwargs) + @override(RLModule) def _forward_train(self, *args, **kwargs): return self(*args, **kwargs) @@ -93,3 +107,12 @@ def _module_metadata(self, *args, **kwargs): @override(RLModule) def unwrapped(self) -> "RLModule": return self.module + + +class TorchDDPRLModuleWithTargetNetworksInterface( + TorchDDPRLModule, + RLModuleWithTargetNetworksInterface, +): + @override(RLModuleWithTargetNetworksInterface) + def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]: + return self.module.get_target_network_pairs() diff --git a/rllib/core/testing/tf/bc_learner.py b/rllib/core/testing/tf/bc_learner.py index 0bc0d782d094..b3863a6f07e3 100644 --- a/rllib/core/testing/tf/bc_learner.py +++ b/rllib/core/testing/tf/bc_learner.py @@ -13,7 +13,9 @@ def compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> Mapping[str, Any]: - action_dist = fwd_out["action_dist"] - loss = 
-tf.math.reduce_mean(action_dist.log_prob(batch[SampleBatch.ACTIONS])) + action_dist_inputs = fwd_out[SampleBatch.ACTION_DIST_INPUTS] + action_dist_class = self._module[module_id].get_train_action_dist_cls() + action_dist = action_dist_class.from_logits(action_dist_inputs) + loss = -tf.math.reduce_mean(action_dist.logp(batch[SampleBatch.ACTIONS])) return {self.TOTAL_LOSS_KEY: loss} diff --git a/rllib/core/testing/tf/bc_module.py b/rllib/core/testing/tf/bc_module.py index cca3a42f4eeb..0998468a4202 100644 --- a/rllib/core/testing/tf/bc_module.py +++ b/rllib/core/testing/tf/bc_module.py @@ -1,8 +1,8 @@ import tensorflow as tf -import tensorflow_probability as tfp from typing import Any, Mapping from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig +from ray.rllib.models.tf.tf_distributions import TfCategorical from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModule, MultiAgentRLModuleConfig, @@ -33,36 +33,44 @@ def setup(self): self.policy = tf.keras.Sequential(layers) self._input_dim = input_dim + def get_train_action_dist_cls(self): + return TfCategorical + + def get_exploration_action_dist_cls(self): + return TfCategorical + + def get_inference_action_dist_cls(self): + return TfCategorical + @override(RLModule) def output_specs_exploration(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] @override(RLModule) def output_specs_inference(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] @override(RLModule) def output_specs_train(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] + + def _forward_shared(self, batch: NestedDict) -> Mapping[str, Any]: + # We can use a shared forward method because BC does not need to distinguish + # between train, inference, and exploration. 
+ action_logits = self.policy(batch["obs"]) + return {SampleBatch.ACTION_DIST_INPUTS: action_logits} @override(RLModule) def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]: - obs = batch[SampleBatch.OBS] - action_logits = self.policy(obs) - action_logits_inference = tf.argmax(action_logits, axis=-1) - action_dist = tfp.distributions.Deterministic(action_logits_inference) - return {"action_dist": action_dist} + return self._forward_shared(batch) @override(RLModule) def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: - return self._forward_inference(batch) + return self._forward_shared(batch) @override(RLModule) def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]: - obs = batch[SampleBatch.OBS] - action_logits = self.policy(obs) - action_dist = tfp.distributions.Categorical(logits=action_logits) - return {"action_dist": action_dist} + return self._forward_shared(batch) @override(RLModule) def get_state(self) -> Mapping[str, Any]: @@ -112,7 +120,7 @@ def _common_forward(self, batch): policy_in = tf.concat([global_enc, obs["local"]], axis=-1) action_logits = self.policy_head(policy_in) - return {"action_dist": tf.distributions.Categorical(logits=action_logits)} + return {SampleBatch.ACTION_DIST_INPUTS: action_logits} class BCTfMultiAgentModuleWithSharedEncoder(MultiAgentRLModule): diff --git a/rllib/core/testing/torch/bc_learner.py b/rllib/core/testing/torch/bc_learner.py index 123d8566c1ef..8db3a2213abd 100644 --- a/rllib/core/testing/torch/bc_learner.py +++ b/rllib/core/testing/torch/bc_learner.py @@ -12,6 +12,8 @@ def compute_loss_per_module( self, module_id: str, batch: SampleBatch, fwd_out: Mapping[str, TensorType] ) -> Mapping[str, Any]: - action_dist = fwd_out["action_dist"] - loss = -torch.mean(action_dist.log_prob(batch[SampleBatch.ACTIONS])) + action_dist_inputs = fwd_out[SampleBatch.ACTION_DIST_INPUTS] + action_dist_class = self._module[module_id].get_train_action_dist_cls() + action_dist = 
action_dist_class.from_logits(action_dist_inputs) + loss = -torch.mean(action_dist.logp(batch[SampleBatch.ACTIONS])) return {self.TOTAL_LOSS_KEY: loss} diff --git a/rllib/core/testing/torch/bc_module.py b/rllib/core/testing/torch/bc_module.py index 2ee380a2530c..06b015a47205 100644 --- a/rllib/core/testing/torch/bc_module.py +++ b/rllib/core/testing/torch/bc_module.py @@ -1,6 +1,8 @@ from typing import Any, Mapping +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig +from ray.rllib.models.torch.torch_distributions import TorchCategorical from ray.rllib.core.rl_module.marl_module import ( MultiAgentRLModuleConfig, MultiAgentRLModule, @@ -31,17 +33,26 @@ def setup(self): self.input_dim = input_dim + def get_train_action_dist_cls(self): + return TorchCategorical + + def get_exploration_action_dist_cls(self): + return TorchCategorical + + def get_inference_action_dist_cls(self): + return TorchCategorical + @override(RLModule) def output_specs_exploration(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] @override(RLModule) def output_specs_inference(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] @override(RLModule) def output_specs_train(self) -> SpecType: - return ["action_dist"] + return [SampleBatch.ACTION_DIST_INPUTS] @override(RLModule) def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]: @@ -56,7 +67,7 @@ def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]: @override(RLModule) def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]: action_logits = self.policy(batch["obs"]) - return {"action_dist": torch.distributions.Categorical(logits=action_logits)} + return {SampleBatch.ACTION_DIST_INPUTS: action_logits} class BCTorchRLModuleWithSharedGlobalEncoder(TorchRLModule): @@ -84,6 +95,15 @@ def __init__( nn.Linear(hidden_dim, action_dim), ) + def 
get_train_action_dist_cls(self): + return TorchCategorical + + def get_exploration_action_dist_cls(self): + return TorchCategorical + + def get_inference_action_dist_cls(self): + return TorchCategorical + @override(RLModule) def _default_input_specs(self): return [("obs", "global"), ("obs", "local")] @@ -108,7 +128,7 @@ def _common_forward(self, batch): policy_in = torch.cat([global_enc, obs["local"]], dim=-1) action_logits = self.policy_head(policy_in) - return {"action_dist": torch.distributions.Categorical(logits=action_logits)} + return {SampleBatch.ACTION_DIST_INPUTS: action_logits} class BCTorchMultiAgentModuleWithSharedEncoder(MultiAgentRLModule): diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index 889eb3f33141..a377c6b429a3 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.core.learner.learner_group import LearnerGroup -from ray.rllib.core.learner.learner import LearnerSpec, FrameworkHPs +from ray.rllib.core.learner.learner import LearnerSpec, FrameworkHyperparameters from ray.rllib.core.learner.scaling_config import LearnerGroupScalingConfig from ray.rllib.core.rl_module.marl_module import ( @@ -144,17 +144,17 @@ def get_learner_group( """ if framework == "tf": - learner_hps = FrameworkHPs(eager_tracing=eager_tracing) + framework_hps = FrameworkHyperparameters(eager_tracing=eager_tracing) else: - learner_hps = None + framework_hps = None learner_spec = LearnerSpec( learner_class=get_learner_class(framework), module_spec=get_module_spec( framework=framework, env=env, is_multi_agent=is_multi_agent ), optimizer_config={"lr": learning_rate}, - learner_scaling_config=scaling_config, - learner_hyperparameters=learner_hps, + learner_group_scaling_config=scaling_config, + framework_hyperparameters=framework_hps, ) lg = LearnerGroup(learner_spec) diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index 
a93dbd9191c6..28a0e6ebcdee 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -64,6 +64,9 @@ def __init__( Note: The game itself may contain its own episode length limits, which are always obeyed (on top of this value here). """ + # Skip env checking as the nature of the agent IDs depends on the game + # running in the connected Unity editor. + self._skip_env_checking = True super().__init__() diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 0b54c85bcb15..cf5653585d6b 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -2,12 +2,15 @@ import scipy.signal from typing import Dict, Optional +from ray.rllib.core.models.base import STATE_IN from ray.rllib.evaluation.episode import Episode from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import DeveloperAPI -from ray.rllib.utils.typing import AgentID +from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.typing import AgentID @DeveloperAPI @@ -197,15 +200,18 @@ def compute_gae_for_sample_batch( # correct? Does this mean that I need to preserve the trajectory # information during training and compute the advantages inside the loss # function? - # TODO (Kourosh) - # Another thing I need to figure out is which end point to call here? - # forward_exploration? what if this method is getting called inside the - # learner loop? or via another abstraction like - # RLSampler.postprocess_trajectory() which is non-batched cpu/gpu task - # running across different processes for different trajectories? - # This implementation right now will compute even the action_dist which - # will not be needed but takes time to compute. 
- input_dict = policy._lazy_tensor_dict(input_dict) + # TODO (Kourosh): Another thing we need to figure out is which end point + # to call here (why forward_exploration)? What if this method is getting + # called inside the learner loop or via another abstraction like + # RLSampler.postprocess_trajectory() which is non-batched cpu/gpu task + # running across different processes for different trajectories? + # This implementation right now will compute even the action_dist which + # will not be needed but takes time to compute. + if policy.framework == "torch": + input_dict = convert_to_torch_tensor(input_dict) + # TODO (sven): Fix this once we support RNNs on the new stack. + input_dict[STATE_IN] = input_dict[SampleBatch.SEQ_LENS] = None + input_dict = NestedDict(input_dict) fwd_out = policy.model.forward_exploration(input_dict) last_r = fwd_out[SampleBatch.VF_PREDS][-1] else: diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 003083066c17..6f68dc01ee0b 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -134,7 +134,7 @@ def _update_env_seed_if_necessary( NOTE: this may not work with remote environments (issue #18154). """ - if not seed: + if seed is None: return # A single RL job is unlikely to have more than 10K @@ -265,7 +265,7 @@ def __init__( log_dir: Optional[str] = None, spaces: Optional[Dict[PolicyID, Tuple[Space, Space]]] = None, default_policy_class: Optional[Type[Policy]] = None, - dataset_shards: Optional[List[ray.data.Datastream]] = None, + dataset_shards: Optional[List[ray.data.Dataset]] = None, # Deprecated: This is all specified in `config` anyways. 
policy_config=DEPRECATED_VALUE, input_creator=DEPRECATED_VALUE, diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 0b81dc45bd3e..6bcf6225444a 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -7,9 +7,6 @@ from ray.rllib.algorithms.callbacks import DefaultCallbacks import ray.rllib.algorithms.dqn as dqn import ray.rllib.algorithms.ppo as ppo -from ray.rllib.algorithms.ppo.torch.ppo_torch_policy_rlm import ( - PPOTorchPolicyWithRLModule, -) from ray.rllib.examples.env.debug_counter_env import MultiAgentDebugCounterEnv from ray.rllib.examples.env.multi_agent import MultiAgentPendulum from ray.rllib.evaluation.rollout_worker import RolloutWorker @@ -236,12 +233,9 @@ def test_traj_view_next_action(self): .rollouts(rollout_fragment_length=200, num_rollout_workers=0) ) config.validate() - enable_rl_module_api = config._enable_rl_module_api rollout_worker_w_api = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v1"), - default_policy_class=PPOTorchPolicyWithRLModule - if enable_rl_module_api - else ppo.PPOTorchPolicy, + default_policy_class=ppo.PPOTorchPolicy, config=config, ) # Add the next action (a') and 2nd next action (a'') to the view diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 1f5cb0128fd4..ce01295380f9 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -105,7 +105,8 @@ # >> # >> while True: # >> a, state_out, _ = algo.compute_single_action( - # .. obs, state, prev_a, prev_r) + # .. obs, state, prev_action=prev_a, prev_reward=prev_r + # .. 
) # >> obs, reward, done, truncated, _ = env.step(a) # >> if done: # >> obs, info = env.reset() diff --git a/rllib/examples/export/onnx_torch.py b/rllib/examples/export/onnx_torch.py index b8196f36e8d6..3438d51840b7 100644 --- a/rllib/examples/export/onnx_torch.py +++ b/rllib/examples/export/onnx_torch.py @@ -1,8 +1,4 @@ -try: - from packaging.version import Version -except ImportError: - from distutils.version import LooseVersion as Version - +from packaging.version import Version import numpy as np import ray import ray.rllib.algorithms.ppo as ppo diff --git a/rllib/examples/self_play_with_open_spiel.py b/rllib/examples/self_play_with_open_spiel.py index 3c5360de15de..f611cac7d155 100644 --- a/rllib/examples/self_play_with_open_spiel.py +++ b/rllib/examples/self_play_with_open_spiel.py @@ -274,6 +274,7 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): # Train the "main" policy to play really well using self-play. results = None if not args.from_checkpoint: + create_checkpoints = not bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)) results = tune.Tuner( "PPO", param_space=config, @@ -294,8 +295,8 @@ def policy_mapping_fn(agent_id, episode, worker, **kwargs): sort_by_metric=True, ), checkpoint_config=air.CheckpointConfig( - checkpoint_at_end=True, - checkpoint_frequency=10, + checkpoint_at_end=create_checkpoints, + checkpoint_frequency=10 if create_checkpoints else 0, ), ), ).fit() diff --git a/rllib/examples/serving/cartpole_server.py b/rllib/examples/serving/cartpole_server.py index d157ef8030e6..d800ab837f2e 100755 --- a/rllib/examples/serving/cartpole_server.py +++ b/rllib/examples/serving/cartpole_server.py @@ -180,6 +180,10 @@ def _input(ioctx): # Set to INFO so we'll see the server's actual address:port. 
.debugging(log_level="INFO") ) + # Disable RLModules because they need connectors + # TODO(Artur): Deprecate ExternalEnv and reenable connectors and RL Modules here + config.rl_module(_enable_rl_module_api=False) + config.training(_enable_learner_api=False) # DQN. if args.run == "DQN" or args.run == "APEX" or args.run == "R2D2": diff --git a/rllib/examples/serving/unity3d_server.py b/rllib/examples/serving/unity3d_server.py index 700ca6759390..b04a2397f524 100755 --- a/rllib/examples/serving/unity3d_server.py +++ b/rllib/examples/serving/unity3d_server.py @@ -151,6 +151,11 @@ def _input(ioctx): .evaluation(off_policy_estimation_methods={}) ) + # Disable RLModules because they need connectors + # TODO(Artur): Deprecate ExternalEnv and reenable connectors and RL Modules here + config.rl_module(_enable_rl_module_api=False) + config._enable_learner_api = False + # Create the Trainer used for Policy serving. algo = config.build() diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 133d9ef9fb72..83c942d1a277 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -181,6 +181,13 @@ # backward compatibility to old configs. This yields different models than past # versions of RLlib. "encoder_latent_dim": None, + # Whether to always check the inputs and outputs of RLlib's default models for + # their specifications. Input specifications are checked on failed forward passes + # of the models regardless of this flag. If this flag is set to `True`, inputs and + # outputs are checked on every call. This leads to a slow-down and should only be + # used for debugging. Note that this flag is only relevant for instances of + # RLlib's Model class. These are commonly generated from ModelConfigs in RLModules. + "always_check_shapes": False, # Deprecated keys: # Use `lstm_use_prev_action` or `lstm_use_prev_reward` instead. 
diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index d26ba8b028ba..409326063458 100644 --- a/rllib/models/tests/test_preprocessors.py +++ b/rllib/models/tests/test_preprocessors.py @@ -39,6 +39,7 @@ def tearDownClass(cls) -> None: def test_rlms_and_preprocessing(self): config = ( ppo.PPOConfig() + .framework("tf2") .environment( env="ray.rllib.examples.env.random_env.RandomEnv", env_config={ @@ -48,18 +49,18 @@ def test_rlms_and_preprocessing(self): }, ) # Run this very quickly locally. - .rollouts(rollout_fragment_length=10) - .rollouts(num_rollout_workers=0) - .training(train_batch_size=10, sgd_minibatch_size=1, num_sgd_iter=1) + .rollouts(num_rollout_workers=0, rollout_fragment_length=10) + .training( + train_batch_size=10, + sgd_minibatch_size=1, + num_sgd_iter=1, + _enable_learner_api=True, + ) + .rl_module(_enable_rl_module_api=True) # Set this to True to enforce no preprocessors being used. .experimental(_disable_preprocessor_api=True) - .framework("tf2") ) - # TODO (Artur): No need to manually enable RLModules here since we have not - # fully migrated. Clear this up after migration. - config.rl_module(_enable_rl_module_api=True) - for _ in framework_iterator(config, frameworks=("torch", "tf2")): algo = config.build() results = algo.train() diff --git a/rllib/models/tf/tf_distributions.py b/rllib/models/tf/tf_distributions.py index 8652cdfc3ec5..373ba3440fc8 100644 --- a/rllib/models/tf/tf_distributions.py +++ b/rllib/models/tf/tf_distributions.py @@ -105,8 +105,8 @@ def __init__( if logits is not None: assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" 
- _logits = logits / temperature - probs = tf.nn.softmax(_logits, axis=-1) + logits /= temperature + probs = tf.nn.softmax(logits, axis=-1) self.probs = probs self.logits = logits @@ -118,8 +118,10 @@ def __init__( def logp(self, value: TensorType, **kwargs) -> TensorType: # This prevents an error in which float values at the boundaries of the range # of the distribution are passed to this function. - value = tf.cast(value, tf.int32) - return self._dist.log_prob(value, **kwargs) + return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=self.logits if self.logits is not None else self.probs, + labels=tf.cast(value, tf.int32), + ) @override(TfDistribution) def _get_tf_distribution( diff --git a/rllib/offline/dataset_reader.py b/rllib/offline/dataset_reader.py index c6a11622aa1f..d105cd59e2c4 100644 --- a/rllib/offline/dataset_reader.py +++ b/rllib/offline/dataset_reader.py @@ -69,7 +69,7 @@ def _unzip_if_needed(paths: List[str], format: str): @PublicAPI def get_dataset_and_shards( config: "AlgorithmConfig", num_workers: int = 0 -) -> Tuple[ray.data.Datastream, List[ray.data.Datastream]]: +) -> Tuple[ray.data.Dataset, List[ray.data.Dataset]]: """Returns a dataset and a list of shards. This function uses algorithm configs to create a dataset and a list of shards. @@ -77,12 +77,12 @@ def get_dataset_and_shards( input: The input type should be "dataset". input_config: A dict containing the following key and values: `format`: str, speciifies the format of the input data. This will be the - format that ray dataset supports. See ray.data.Datastream for + format that ray dataset supports. See ray.data.Dataset for supported formats. Only "parquet" or "json" are supported for now. `paths`: str, a single string or a list of strings. Each string is a path to a file or a directory holding the dataset. It can be either a local path or a remote path (e.g. to an s3 bucket). 
- `loader_fn`: Callable[None, ray.data.Datastream], Instead of + `loader_fn`: Callable[None, ray.data.Dataset], Instead of specifying paths and format, you can specify a function to load the dataset. `parallelism`: int, The number of tasks to use for loading the dataset. If not specified, it will be set to the number of workers. @@ -185,7 +185,7 @@ class DatasetReader(InputReader): "input_config": { "format": "json", # A single data file, a directory, or anything - # that ray.data.datastream recognizes. + # that ray.data.dataset recognizes. "paths": "/tmp/sample_batches/", # By default, parallelism=num_workers. "parallelism": 3, @@ -246,7 +246,7 @@ def next(self) -> SampleBatchType: ret = [] count = 0 while count < self.batch_size: - d = next(self._iter).as_pydict() + d = next(self._iter) # Columns like obs are compressed when written by DatasetWriter. d = from_json_data(d, self._ioctx.worker) count += d.count diff --git a/rllib/offline/estimators/direct_method.py b/rllib/offline/estimators/direct_method.py index ecf902c4c9b5..c735b93a5e1b 100644 --- a/rllib/offline/estimators/direct_method.py +++ b/rllib/offline/estimators/direct_method.py @@ -157,6 +157,7 @@ def estimate_on_dataset( updated_ds = dataset.map_batches( compute_q_and_v_values, batch_size=batch_size, + batch_format="pandas", fn_kwargs={ "model_class": self.model.__class__, "model_state": self.model.get_state(), diff --git a/rllib/offline/estimators/doubly_robust.py b/rllib/offline/estimators/doubly_robust.py index 53f6387df5d1..d98028023660 100644 --- a/rllib/offline/estimators/doubly_robust.py +++ b/rllib/offline/estimators/doubly_robust.py @@ -199,6 +199,7 @@ def estimate_on_dataset( updated_ds = dataset.map_batches( compute_is_weights, batch_size=batch_size, + batch_format="pandas", fn_kwargs={ "policy_state": self.policy.get_state(), "estimator_class": self.__class__, @@ -210,6 +211,7 @@ def estimate_on_dataset( updated_ds = updated_ds.map_batches( compute_q_and_v_values, batch_size=batch_size, + 
batch_format="pandas", fn_kwargs={ "model_class": self.model.__class__, "model_state": self.model.get_state(), @@ -229,6 +231,7 @@ def compute_v_target(batch: pd.DataFrame, normalizer: float = 1.0): updated_ds = updated_ds.map_batches( compute_v_target, batch_size=batch_size, + batch_format="pandas", fn_kwargs={"normalizer": normalizer}, ) diff --git a/rllib/offline/estimators/importance_sampling.py b/rllib/offline/estimators/importance_sampling.py index 500cf9e147e4..ee3b5909349c 100644 --- a/rllib/offline/estimators/importance_sampling.py +++ b/rllib/offline/estimators/importance_sampling.py @@ -99,6 +99,7 @@ def estimate_on_dataset( updated_ds = dataset.map_batches( compute_is_weights, batch_size=batch_size, + batch_format="pandas", fn_kwargs={ "policy_state": self.policy.get_state(), "estimator_class": self.__class__, diff --git a/rllib/offline/estimators/weighted_importance_sampling.py b/rllib/offline/estimators/weighted_importance_sampling.py index 2bd5d566e525..5571b085c2a7 100644 --- a/rllib/offline/estimators/weighted_importance_sampling.py +++ b/rllib/offline/estimators/weighted_importance_sampling.py @@ -155,6 +155,7 @@ def estimate_on_dataset( updated_ds = dataset.map_batches( compute_is_weights, batch_size=batch_size, + batch_format="pandas", fn_kwargs={ "policy_state": self.policy.get_state(), "estimator_class": self.__class__, diff --git a/rllib/offline/feature_importance.py b/rllib/offline/feature_importance.py index 5ffac614eb71..2efe17790a79 100644 --- a/rllib/offline/feature_importance.py +++ b/rllib/offline/feature_importance.py @@ -92,11 +92,15 @@ def get_feature_importance_on_index( difference between the expected output and the output due to the perturbation. 
""" perturbed_ds = dataset.map_batches( - perturb_fn, batch_size=batch_size, fn_kwargs={"index": index} + perturb_fn, + batch_size=batch_size, + batch_format="pandas", + fn_kwargs={"index": index}, ) perturbed_actions = perturbed_ds.map_batches( _compute_actions, batch_size=batch_size, + batch_format="pandas", fn_kwargs={ "output_key": "perturbed_actions", "input_key": "perturbed_obs", @@ -110,7 +114,9 @@ def delta_fn(batch): batch["delta"] = np.abs(batch["ref_actions"] - batch["perturbed_actions"]) return batch - delta = perturbed_actions.map_batches(delta_fn, batch_size=batch_size) + delta = perturbed_actions.map_batches( + delta_fn, batch_size=batch_size, batch_format="pandas" + ) return delta diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index e736f31dd1e1..9efe552d5aa6 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -175,22 +175,39 @@ def compute_actions_from_input_dict( ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]: """Traced version of Policy.compute_actions_from_input_dict.""" - # NOTE: In the new RLModule stack the sampling side is not traced with this - # justification that in order to speed up sampling we need to use more - # actors. # Create a traced version of `self._compute_actions_helper`. 
- if ( - not self.config.get("_enable_rl_module_api", False) - and self._traced_compute_actions_helper is False - and not self._no_tracing - ): - self._compute_actions_helper = _convert_eager_inputs( - tf.function( - super(TracedEagerPolicy, self)._compute_actions_helper, - autograph=False, - reduce_retracing=True, + if self._traced_compute_actions_helper is False and not self._no_tracing: + if self.config.get("_enable_rl_module_api"): + self._compute_actions_helper_rl_module_explore = ( + _convert_eager_inputs( + tf.function( + super( + TracedEagerPolicy, self + )._compute_actions_helper_rl_module_explore, + autograph=True, + reduce_retracing=True, + ) + ) + ) + self._compute_actions_helper_rl_module_inference = ( + _convert_eager_inputs( + tf.function( + super( + TracedEagerPolicy, self + )._compute_actions_helper_rl_module_inference, + autograph=True, + reduce_retracing=True, + ) + ) + ) + else: + self._compute_actions_helper = _convert_eager_inputs( + tf.function( + super(TracedEagerPolicy, self)._compute_actions_helper, + autograph=False, + reduce_retracing=True, + ) ) - ) self._traced_compute_actions_helper = True # Now that the helper method is traced, call super's diff --git a/rllib/policy/eager_tf_policy_v2.py b/rllib/policy/eager_tf_policy_v2.py index f38637e003a7..e9158176e2a0 100644 --- a/rllib/policy/eager_tf_policy_v2.py +++ b/rllib/policy/eager_tf_policy_v2.py @@ -3,13 +3,15 @@ It supports both traced and non-traced eager execution modes. 
""" -import gymnasium as gym import logging import os import threading -import tree # pip install dm_tree from typing import Dict, List, Optional, Tuple, Type, Union +import gymnasium as gym +import tree # pip install dm_tree + +from ray.rllib.core.models.base import STATE_IN from ray.rllib.evaluation.episode import Episode from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.modelv2 import ModelV2 @@ -39,6 +41,7 @@ NUM_GRAD_UPDATES_LIFETIME, ) from ray.rllib.utils.metrics.learner_info import LEARNER_STATS_KEY +from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.spaces.space_utils import normalize_action from ray.rllib.utils.tf_utils import get_gpu_devices @@ -98,7 +101,7 @@ def __init__( self._loss_initialized = False # Backward compatibility workaround so Policy will call self.loss() directly. # TODO(jungong): clean up after all policies are migrated to new sub-class - # implementation. + # implementation. self._loss = None self.batch_divisibility_req = self.get_batch_divisibility_req() @@ -430,16 +433,17 @@ def _init_view_requirements(self): self.view_requirements[SampleBatch.INFOS].used_for_training = False def maybe_initialize_optimizer_and_loss(self): - optimizers = force_list(self.optimizer()) - if self.exploration: - # Policies with RLModules don't have an exploration object. - optimizers = self.exploration.get_exploration_optimizer(optimizers) + if not self.config.get("_enable_learner_api", False): + optimizers = force_list(self.optimizer()) + if self.exploration: + # Policies with RLModules don't have an exploration object. + optimizers = self.exploration.get_exploration_optimizer(optimizers) - # The list of local (tf) optimizers (one per loss term). - self._optimizers: List[LocalOptimizer] = optimizers - # Backward compatibility: A user's policy may only support a single - # loss term and optimizer (no lists). 
- self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None + # The list of local (tf) optimizers (one per loss term). + self._optimizers: List[LocalOptimizer] = optimizers + # Backward compatibility: A user's policy may only support a single + # loss term and optimizer (no lists). + self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, @@ -480,14 +484,20 @@ def compute_actions_from_input_dict( timestep=timestep, explore=explore, tf_sess=self.get_session() ) - ret = self._compute_actions_helper( - input_dict, - state_batches, - # TODO: Passing episodes into a traced method does not work. - None if self.config["eager_tracing"] else episodes, - explore, - timestep, - ) + if self.config.get("_enable_rl_module_api"): + if explore: + ret = self._compute_actions_helper_rl_module_explore(input_dict) + else: + ret = self._compute_actions_helper_rl_module_inference(input_dict) + else: + ret = self._compute_actions_helper( + input_dict, + state_batches, + # TODO: Passing episodes into a traced method does not work. + None if self.config["eager_tracing"] else episodes, + explore, + timestep, + ) # Update our global timestep by the batch size. self.global_timestep.assign_add(tree.flatten(ret[0])[0].shape.as_list()[0]) return convert_to_numpy(ret) @@ -810,9 +820,83 @@ def loss_initialized(self): return self._loss_initialized # TODO: Figure out, why _ray_trace_ctx=None helps to prevent a crash in - # AlphaStar w/ framework=tf2; eager_tracing=True on the policy learner actors. + # eager_tracing=True. + # It seems there may be a clash between the traced-by-tf function and the + # traced-by-ray functions (for making the policy class a ray actor). + @with_lock + def _compute_actions_helper_rl_module_explore( + self, input_dict, _ray_trace_ctx=None + ): + # Increase the tracing counter to make sure we don't re-trace too + # often. 
If eager_tracing=True, this counter should only get + # incremented during the @tf.function trace operations, never when + # calling the already traced function after that. + self._re_trace_counter += 1 + + # Add models `forward_explore` extra fetches. + extra_fetches = {} + + input_dict = NestedDict(input_dict) + # TODO (sven): Support RNNs when using RLModules. + input_dict[STATE_IN] = None + input_dict[SampleBatch.SEQ_LENS] = None + + action_dist_class = self.model.get_exploration_action_dist_cls() + fwd_out = self.model.forward_exploration(input_dict) + action_dist = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + actions = action_dist.sample() + + # Anything but action_dist and state_out is an extra fetch + for k, v in fwd_out.items(): + if k not in [SampleBatch.ACTIONS, "state_out"]: + extra_fetches[k] = v + + # Action-logp and action-prob. + logp = action_dist.logp(actions) + extra_fetches[SampleBatch.ACTION_LOGP] = logp + extra_fetches[SampleBatch.ACTION_PROB] = tf.exp(logp) + + return actions, {}, extra_fetches + + # TODO: Figure out, why _ray_trace_ctx=None helps to prevent a crash in + # eager_tracing=True. # It seems there may be a clash between the traced-by-tf function and the # traced-by-ray functions (for making the policy class a ray actor). + @with_lock + def _compute_actions_helper_rl_module_inference( + self, input_dict, _ray_trace_ctx=None + ): + # Increase the tracing counter to make sure we don't re-trace too + # often. If eager_tracing=True, this counter should only get + # incremented during the @tf.function trace operations, never when + # calling the already traced function after that. + self._re_trace_counter += 1 + + # Add models `forward_explore` extra fetches. + extra_fetches = {} + + input_dict = NestedDict(input_dict) + # TODO (sven): Support RNNs when using RLModules. 
+ input_dict[STATE_IN] = None + input_dict[SampleBatch.SEQ_LENS] = None + + action_dist_class = self.model.get_inference_action_dist_cls() + fwd_out = self.model.forward_inference(input_dict) + action_dist = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + action_dist = action_dist.to_deterministic() + actions = action_dist.sample() + + # Anything but action_dist and state_out is an extra fetch + for k, v in fwd_out.items(): + if k not in [SampleBatch.ACTIONS, "state_out"]: + extra_fetches[k] = v + + return actions, {}, extra_fetches + @with_lock def _compute_actions_helper( self, @@ -827,10 +911,7 @@ def _compute_actions_helper( # often. If eager_tracing=True, this counter should only get # incremented during the @tf.function trace operations, never when # calling the already traced function after that. - # NOTE: On the new RLModule API, we won't trace the sampling side, so we should - # not increment this counter to trigger excess re-tracing error. - if not self.config.get("_enable_rl_module_api", False): - self._re_trace_counter += 1 + self._re_trace_counter += 1 # Calculate RNN sequence lengths. batch_size = tree.flatten(input_dict[SampleBatch.OBS])[0].shape[0] @@ -839,33 +920,9 @@ def _compute_actions_helper( # Add default and custom fetches. extra_fetches = {} - # Use Exploration object. 
with tf.variable_creator_scope(_disallow_var_creation): - if self.config.get("_enable_rl_module_api", False): - - if explore: - fwd_out = self.model.forward_exploration(input_dict) - else: - fwd_out = self.model.forward_inference(input_dict) - action_dist = fwd_out[SampleBatch.ACTION_DIST] - if explore: - actions = action_dist.sample() - logp = action_dist.logp(actions) - else: - actions = action_dist.sample() - logp = None - state_out = fwd_out.get("state_out", {}) - - # anything but action_dist and state_out is an extra fetch - for k, v in fwd_out.items(): - if k not in [SampleBatch.ACTION_DIST, "state_out"]: - extra_fetches[k] = v - dist_inputs = None - - elif is_overridden(self.action_sampler_fn): - dist_inputs = None - state_out = [] + if is_overridden(self.action_sampler_fn): actions, logp, dist_inputs, state_out = self.action_sampler_fn( self.model, input_dict[SampleBatch.OBS], @@ -875,7 +932,6 @@ def _compute_actions_helper( ) else: if is_overridden(self.action_distribution_fn): - # Try new action_distribution_fn signature, supporting # state_batches and seq_lens. ( diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index bb534497e3d2..c1ca60b83904 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -1470,35 +1470,39 @@ def _initialize_loss_from_dummy_batch( seq_len = sample_batch_size // B seq_lens = np.array([seq_len for _ in range(B)], dtype=np.int32) postprocessed_batch[SampleBatch.SEQ_LENS] = seq_lens - # Switch on lazy to-tensor conversion on `postprocessed_batch`. - train_batch = self._lazy_tensor_dict(postprocessed_batch) - # Calling loss, so set `is_training` to True. - train_batch.set_training(True) - if seq_lens is not None: - train_batch[SampleBatch.SEQ_LENS] = seq_lens - train_batch.count = self._dummy_batch.count - # Call the loss function, if it exists. - # TODO(jungong) : clean up after all agents get migrated. - # We should simply do self.loss(...) here. 
- if self._loss is not None: - self._loss(self, self.model, self.dist_class, train_batch) - elif ( - is_overridden(self.loss) or self.config.get("_enable_rl_module_api", False) - ) and not self.config["in_evaluation"]: - self.loss(self.model, self.dist_class, train_batch) - # Call the stats fn, if given. - # TODO(jungong) : clean up after all agents get migrated. - # We should simply do self.stats_fn(train_batch) here. - if stats_fn is not None: - stats_fn(self, train_batch) - if hasattr(self, "stats_fn") and not self.config["in_evaluation"]: - self.stats_fn(train_batch) + + if not self.config.get("_enable_learner_api"): + # Switch on lazy to-tensor conversion on `postprocessed_batch`. + train_batch = self._lazy_tensor_dict(postprocessed_batch) + # Calling loss, so set `is_training` to True. + train_batch.set_training(True) + if seq_lens is not None: + train_batch[SampleBatch.SEQ_LENS] = seq_lens + train_batch.count = self._dummy_batch.count + + # Call the loss function, if it exists. + # TODO(jungong) : clean up after all agents get migrated. + # We should simply do self.loss(...) here. + if self._loss is not None: + self._loss(self, self.model, self.dist_class, train_batch) + elif is_overridden(self.loss) and not self.config["in_evaluation"]: + self.loss(self.model, self.dist_class, train_batch) + # Call the stats fn, if given. + # TODO(jungong) : clean up after all agents get migrated. + # We should simply do self.stats_fn(train_batch) here. + if stats_fn is not None: + stats_fn(self, train_batch) + if hasattr(self, "stats_fn") and not self.config["in_evaluation"]: + self.stats_fn(train_batch) # Re-enable tracing. self._no_tracing = False # Add new columns automatically to view-reqs. - if auto_remove_unneeded_view_reqs: + if ( + not self.config.get("_enable_learner_api") + and auto_remove_unneeded_view_reqs + ): # Add those needed for postprocessing and training. 
all_accessed_keys = ( train_batch.accessed_keys diff --git a/rllib/policy/tests/test_policy.py b/rllib/policy/tests/test_policy.py index 66db2f8b7a10..77e16e13bc4f 100644 --- a/rllib/policy/tests/test_policy.py +++ b/rllib/policy/tests/test_policy.py @@ -32,7 +32,10 @@ def test_policy_get_and_set_state(self): policy.set_state(state1) state3 = policy.get_state() # Make sure everything is the same. - check(state1["_exploration_state"], state3["_exploration_state"]) + # This is only supported without RLModule API. See AlgorithmConfig for + # more info. + if not config._enable_rl_module_api: + check(state1["_exploration_state"], state3["_exploration_state"]) check(state1["global_timestep"], state3["global_timestep"]) check(state1["weights"], state3["weights"]) @@ -42,7 +45,10 @@ def test_policy_get_and_set_state(self): if isinstance(policy, (EagerTFPolicyV2, DynamicTFPolicyV2, TorchPolicyV2)): policy_restored_from_scratch = Policy.from_state(state3) state4 = policy_restored_from_scratch.get_state() - check(state3["_exploration_state"], state4["_exploration_state"]) + # This is only supported without RLModule API. See AlgorithmConfig for + # more info. + if not config._enable_rl_module_api: + check(state3["_exploration_state"], state4["_exploration_state"]) check(state3["global_timestep"], state4["global_timestep"]) # For tf static graph, the new model has different layer names # (as it gets written into the same graph as the old one). diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index 8ce18df5e979..fe5e23a330e8 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -33,7 +33,9 @@ class LearningRateSchedule: @DeveloperAPI def __init__(self, lr, lr_schedule): self._lr_schedule = None - if lr_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. 
+ if lr_schedule is None or self.config.get("_enable_learner_api", False): self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False) else: self._lr_schedule = PiecewiseSchedule( @@ -78,7 +80,11 @@ class EntropyCoeffSchedule: @DeveloperAPI def __init__(self, entropy_coeff, entropy_coeff_schedule): self._entropy_coeff_schedule = None - if entropy_coeff_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if entropy_coeff_schedule is None or ( + self.config.get("_enable_learner_api", False) + ): self.entropy_coeff = get_variable( entropy_coeff, framework="tf", tf_name="entropy_coeff", trainable=False ) @@ -208,37 +214,32 @@ class TargetNetworkMixin: """ def __init__(self): - if self.config.get("_enable_rl_module_api", False): - # In order to access the variables for rl modules, we need to - # use the underlying keras api model.trainable_variables. - model_vars = self.model.trainable_variables - target_model_vars = self.target_model.trainable_variables - else: + if not self.config.get("_enable_rl_module_api", False): model_vars = self.model.trainable_variables() target_model_vars = self.target_model.trainable_variables() - @make_tf_callable(self.get_session()) - def update_target_fn(tau): - tau = tf.convert_to_tensor(tau, dtype=tf.float32) - update_target_expr = [] - assert len(model_vars) == len(target_model_vars), ( - model_vars, - target_model_vars, - ) - for var, var_target in zip(model_vars, target_model_vars): - update_target_expr.append( - var_target.assign(tau * var + (1.0 - tau) * var_target) + @make_tf_callable(self.get_session()) + def update_target_fn(tau): + tau = tf.convert_to_tensor(tau, dtype=tf.float32) + update_target_expr = [] + assert len(model_vars) == len(target_model_vars), ( + model_vars, + target_model_vars, ) - logger.debug("Update target op {}".format(var_target)) - return tf.group(*update_target_expr) - - # Hard initial update. 
- self._do_update = update_target_fn - # TODO: The previous SAC implementation does an update(1.0) here. - # If this is changed to tau != 1.0 the sac_loss_function test fails. Why? - # Also the test is not very maintainable, we need to change that unittest - # anyway. - self.update_target(tau=1.0) # self.config.get("tau", 1.0)) + for var, var_target in zip(model_vars, target_model_vars): + update_target_expr.append( + var_target.assign(tau * var + (1.0 - tau) * var_target) + ) + logger.debug("Update target op {}".format(var_target)) + return tf.group(*update_target_expr) + + # Hard initial update. + self._do_update = update_target_fn + # TODO: The previous SAC implementation does an update(1.0) here. + # If this is changed to tau != 1.0 the sac_loss_function test fails. Why? + # Also the test is not very maintainable, we need to change that unittest + # anyway. + self.update_target(tau=1.0) # self.config.get("tau", 1.0)) @property def q_func_vars(self): @@ -276,7 +277,8 @@ def set_weights(self, weights): EagerTFPolicyV2.set_weights(self, weights) elif isinstance(self, EagerTFPolicy): # Handle TF2 policies. EagerTFPolicy.set_weights(self, weights) - self.update_target(self.config.get("tau", 1.0)) + if not self.config.get("_enable_rl_module_api", False): + self.update_target(self.config.get("tau", 1.0)) class ValueNetworkMixin: diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index 159c993e7826..b258c1d74560 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -8,8 +8,6 @@ torch, nn = try_import_torch() -# TODO: (sven) Unify hyperparam annealing procedures across RLlib (tf/torch) -# and for all possible hyperparams, not just lr. 
@DeveloperAPI class LearningRateSchedule: """Mixin for TorchPolicy that adds a learning rate schedule.""" @@ -17,6 +15,8 @@ class LearningRateSchedule: @DeveloperAPI def __init__(self, lr, lr_schedule): self._lr_schedule = None + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. if lr_schedule is None: self.cur_lr = lr else: @@ -28,7 +28,7 @@ def __init__(self, lr, lr_schedule): @override(Policy) def on_global_var_update(self, global_vars): super().on_global_var_update(global_vars) - if self._lr_schedule: + if self._lr_schedule and not self.config.get("_enable_learner_api", False): self.cur_lr = self._lr_schedule.value(global_vars["timestep"]) for opt in self._optimizers: for p in opt.param_groups: @@ -42,7 +42,11 @@ class EntropyCoeffSchedule: @DeveloperAPI def __init__(self, entropy_coeff, entropy_coeff_schedule): self._entropy_coeff_schedule = None - if entropy_coeff_schedule is None: + # Disable any scheduling behavior related to learning if Learner API is active. + # Schedules are handled by Learner class. + if entropy_coeff_schedule is None or ( + self.config.get("_enable_learner_api", False) + ): self.entropy_coeff = entropy_coeff else: # Allows for custom schedule similar to lr_schedule format @@ -172,7 +176,7 @@ class TargetNetworkMixin: - Adds the `update_target` method to the policy. Calling `update_target` updates all target Q-networks' weights from their - respective "main" Q-metworks, based on tau (smooth, partial updating). + respective "main" Q-networks, based on tau (smooth, partial updating). """ def __init__(self): @@ -184,17 +188,32 @@ def update_target(self, tau=None): # Update_target_fn will be called periodically to copy Q network to # target Q network, using (soft) tau-synching. tau = tau or self.config.get("tau", 1.0) + model_state_dict = self.model.state_dict() + # Support partial (soft) synching. # If tau == 1.0: Full sync from Q-model to target Q-model. 
- target_state_dict = next(iter(self.target_models.values())).state_dict() - model_state_dict = { - k: tau * model_state_dict[k] + (1 - tau) * v - for k, v in target_state_dict.items() - } - for target in self.target_models.values(): - target.load_state_dict(model_state_dict) + if self.config.get("_enable_rl_module_api", False): + target_current_network_pairs = self.model.get_target_network_pairs() + for target_network, current_network in target_current_network_pairs: + current_state_dict = current_network.state_dict() + new_state_dict = { + k: tau * current_state_dict[k] + (1 - tau) * v + for k, v in target_network.state_dict().items() + } + target_network.load_state_dict(new_state_dict) + else: + # Support partial (soft) synching. + # If tau == 1.0: Full sync from Q-model to target Q-model. + target_state_dict = next(iter(self.target_models.values())).state_dict() + model_state_dict = { + k: tau * model_state_dict[k] + (1 - tau) * v + for k, v in target_state_dict.items() + } + + for target in self.target_models.values(): + target.load_state_dict(model_state_dict) @override(TorchPolicy) def set_weights(self, weights): diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index f78b13858390..9c2d573635e3 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -775,7 +775,13 @@ def set_state(self, state: PolicyState) -> None: if optimizer_vars: assert len(optimizer_vars) == len(self._optimizers) for o, s in zip(self._optimizers, optimizer_vars): - optim_state_dict = convert_to_torch_tensor(s, device=self.device) + # Torch optimizer param_groups include things like beta, etc. These + # parameters should be left as scalar and not converted to tensors. + # otherwise, torch.optim.step() will start to complain. + optim_state_dict = {"param_groups": s["param_groups"]} + optim_state_dict["state"] = convert_to_torch_tensor( + s["state"], device=self.device + ) o.load_state_dict(optim_state_dict) # Set exploration's state. 
if hasattr(self, "exploration") and "_exploration_state" in state: diff --git a/rllib/policy/torch_policy_v2.py b/rllib/policy/torch_policy_v2.py index 79546b0623ba..e4db0da342df 100644 --- a/rllib/policy/torch_policy_v2.py +++ b/rllib/policy/torch_policy_v2.py @@ -12,6 +12,7 @@ import tree # pip install dm_tree import ray +from ray.rllib.core.models.base import STATE_OUT from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper @@ -182,29 +183,32 @@ def __init__( self.exploration = None else: self.exploration = self._create_exploration() - self._optimizers = force_list(self.optimizer()) - - # Backward compatibility workaround so Policy will call self.loss() directly. - # TODO(jungong): clean up after all policies are migrated to new sub-class - # implementation. - self._loss = None - - # Store, which params (by index within the model's list of - # parameters) should be updated per optimizer. - # Maps optimizer idx to set or param indices. - self.multi_gpu_param_groups: List[Set[int]] = [] - main_params = {p: i for i, p in enumerate(self.model.parameters())} - for o in self._optimizers: - param_indices = [] - for pg_idx, pg in enumerate(o.param_groups): - for p in pg["params"]: - param_indices.append(main_params[p]) - self.multi_gpu_param_groups.append(set(param_indices)) - - # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each - # one with m towers (num_gpus). - num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) - self._loaded_batches = [[] for _ in range(num_buffers)] + + if not self.config.get("_enable_learner_api", False): + self._optimizers = force_list(self.optimizer()) + + # Backward compatibility workaround so Policy will call self.loss() + # directly. + # TODO (jungong): clean up after all policies are migrated to new sub-class + # implementation. 
+ self._loss = None + + # Store, which params (by index within the model's list of + # parameters) should be updated per optimizer. + # Maps optimizer idx to set or param indices. + self.multi_gpu_param_groups: List[Set[int]] = [] + main_params = {p: i for i, p in enumerate(self.model.parameters())} + for o in self._optimizers: + param_indices = [] + for pg_idx, pg in enumerate(o.param_groups): + for p in pg["params"]: + param_indices.append(main_params[p]) + self.multi_gpu_param_groups.append(set(param_indices)) + + # Create n sample-batch buffers (num_multi_gpu_tower_stacks), each + # one with m towers (num_gpus). + num_buffers = self.config.get("num_multi_gpu_tower_stacks", 1) + self._loaded_batches = [[] for _ in range(num_buffers)] # If set, means we are using distributed allreduce during learning. self.distributed_world_size = None @@ -993,7 +997,13 @@ def set_state(self, state: PolicyState) -> None: if optimizer_vars: assert len(optimizer_vars) == len(self._optimizers) for o, s in zip(self._optimizers, optimizer_vars): - optim_state_dict = convert_to_torch_tensor(s, device=self.device) + # Torch optimizer param_groups include things like beta, etc. These + # parameters should be left as scalar and not converted to tensors. + # otherwise, torch.optim.step() will start to complain. + optim_state_dict = {"param_groups": s["param_groups"]} + optim_state_dict["state"] = convert_to_torch_tensor( + s["state"], device=self.device + ) o.load_state_dict(optim_state_dict) # Set exploration's state. 
if hasattr(self, "exploration") and "_exploration_state" in state: @@ -1098,24 +1108,30 @@ def _compute_action_helper( if self.model: self.model.eval() - extra_fetches = {} + extra_fetches = None if isinstance(self.model, RLModule): if explore: + action_dist_class = self.model.get_exploration_action_dist_cls() fwd_out = self.model.forward_exploration(input_dict) - else: - fwd_out = self.model.forward_inference(input_dict) - # anything but action_dist and state_out is an extra fetch - action_dist = fwd_out.pop("action_dist") - - if explore: + action_dist = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) actions = action_dist.sample() logp = action_dist.logp(actions) else: + action_dist_class = self.model.get_inference_action_dist_cls() + fwd_out = self.model.forward_inference(input_dict) + action_dist = action_dist_class.from_logits( + fwd_out[SampleBatch.ACTION_DIST_INPUTS] + ) + action_dist = action_dist.to_deterministic() actions = action_dist.sample() logp = None - state_out = fwd_out.pop("state_out", {}) + + # Anything but actions and state_out is an extra fetch. + state_out = fwd_out.pop(STATE_OUT, {}) extra_fetches = fwd_out - dist_inputs = None + dist_inputs = fwd_out[SampleBatch.ACTION_DIST_INPUTS] elif is_overridden(self.action_sampler_fn): action_dist = None actions, logp, dist_inputs, state_out = self.action_sampler_fn( @@ -1160,7 +1176,7 @@ def _compute_action_helper( ) # Add default and custom fetches. - if not extra_fetches: + if extra_fetches is None: extra_fetches = self.extra_action_out( input_dict, state_batches, self.model, action_dist ) @@ -1179,7 +1195,6 @@ def _compute_action_helper( return convert_to_numpy((actions, state_out, extra_fetches)) def _lazy_tensor_dict(self, postprocessed_batch: SampleBatch, device=None): - # TODO: (sven): Keep for a while to ensure backward compatibility. 
if not isinstance(postprocessed_batch, SampleBatch): postprocessed_batch = SampleBatch(postprocessed_batch) postprocessed_batch.set_get_interceptor( diff --git a/rllib/tests/run_regression_tests.py b/rllib/tests/run_regression_tests.py index f9809bebb3ba..70827e279f0a 100644 --- a/rllib/tests/run_regression_tests.py +++ b/rllib/tests/run_regression_tests.py @@ -114,7 +114,9 @@ # for overriding the episode reward mean for tf2 tests for off policy # long learning tests such as sac and ddpg on the pendulum environment. if args.override_mean_reward != 0.0: - exp["stop"]["episode_reward_mean"] = args.override_mean_reward + exp["stop"][ + "sampler_results/episode_reward_mean" + ] = args.override_mean_reward # QMIX does not support tf yet -> skip. if exp["run"] == "QMIX" and args.framework != "torch": @@ -158,9 +160,17 @@ # we evaluate against an actual environment. check_eval = exp["config"].get("evaluation_interval", None) is not None reward_mean = ( - t.last_result["evaluation"]["episode_reward_mean"] + t.last_result["evaluation"]["sampler_results"][ + "episode_reward_mean" + ] if check_eval - else t.last_result["episode_reward_mean"] + else ( + # Some algos don't store sampler results under `sampler_results` + # e.g. ARS. Need to keep this logic around for now. + t.last_result["sampler_results"]["episode_reward_mean"] + if "sampler_results" in t.last_result + else t.last_result["episode_reward_mean"] + ) ) # If we are using evaluation workers, we may have @@ -168,12 +178,14 @@ # not, use `episode_reward_mean`. if check_eval: min_reward = t.stopping_criterion.get( - "evaluation/episode_reward_mean", - t.stopping_criterion.get("episode_reward_mean"), + "evaluation/sampler_results/episode_reward_mean", + t.stopping_criterion.get("sampler_results/episode_reward_mean"), ) # Otherwise, expect `episode_reward_mean` to be set. 
else: - min_reward = t.stopping_criterion.get("episode_reward_mean") + min_reward = t.stopping_criterion.get( + "sampler_results/episode_reward_mean" + ) # If min reward not defined, always pass. if min_reward is None or reward_mean >= min_reward: diff --git a/rllib/tests/test_algorithm_checkpoint_restore.py b/rllib/tests/test_algorithm_checkpoint_restore.py index 2c60ec2a9c58..af74dfc55dd3 100644 --- a/rllib/tests/test_algorithm_checkpoint_restore.py +++ b/rllib/tests/test_algorithm_checkpoint_restore.py @@ -1,11 +1,8 @@ #!/usr/bin/env python - -import numpy as np import unittest import ray -from ray.rllib.utils.test_utils import check, framework_iterator from ray.rllib.algorithms.apex_ddpg import ApexDDPGConfig from ray.rllib.algorithms.sac import SACConfig from ray.rllib.algorithms.simple_q import SimpleQConfig @@ -15,14 +12,8 @@ from ray.rllib.algorithms.ddpg import DDPGConfig from ray.rllib.algorithms.ars import ARSConfig from ray.rllib.algorithms.a3c import A3CConfig -from ray.tune.registry import get_trainable_cls - - -def get_mean_action(alg, obs): - out = [] - for _ in range(5000): - out.append(float(alg.compute_single_action(obs))) - return np.mean(out) +from ray.rllib.utils.test_utils import test_ckpt_restore +import os # As we transition things to RLModule API the explore=False will get @@ -32,7 +23,12 @@ def get_mean_action(alg, obs): # explore=None if we compare the mean of the distribution of actions for the # same observation to be the same. 
algorithms_and_configs = { - "A3C": (A3CConfig().exploration(explore=False).rollouts(num_rollout_workers=1)), + "A3C": ( + A3CConfig() + .exploration(explore=False) + .rollouts(num_rollout_workers=1) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) + ), "APEX_DDPG": ( ApexDDPGConfig() .exploration(explore=False) @@ -42,29 +38,34 @@ def get_mean_action(alg, obs): optimizer={"num_replay_buffer_shards": 1}, num_steps_sampled_before_learning_starts=0, ) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "ARS": ( ARSConfig() .exploration(explore=False) .rollouts(num_rollout_workers=2, observation_filter="MeanStdFilter") .training(num_rollouts=10, noise_size=2500000) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "DDPG": ( DDPGConfig() .exploration(explore=False) .reporting(min_sample_timesteps_per_iteration=100) .training(num_steps_sampled_before_learning_starts=0) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "DQN": ( DQNConfig() .exploration(explore=False) .training(num_steps_sampled_before_learning_starts=0) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "ES": ( ESConfig() .exploration(explore=False) .training(episodes_per_batch=10, train_batch_size=100, noise_size=2500000) .rollouts(observation_filter="MeanStdFilter", num_rollout_workers=2) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "PPO": ( # See the comment before the `algorithms_and_configs` dict. 
@@ -72,166 +73,94 @@ def get_mean_action(alg, obs): PPOConfig() .training(num_sgd_iter=5, train_batch_size=1000) .rollouts(num_rollout_workers=2) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "SimpleQ": ( SimpleQConfig() .exploration(explore=False) .training(num_steps_sampled_before_learning_starts=0) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), "SAC": ( SACConfig() .exploration(explore=False) .training(num_steps_sampled_before_learning_starts=0) + .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0"))) ), } -def ckpt_restore_test(algo_name, tf2=False, object_store=False, replay_buffer=False): - config = algorithms_and_configs[algo_name].to_dict() - # If required, store replay buffer data in checkpoints as well. - if replay_buffer: - config["store_buffer_in_checkpoints"] = True - - frameworks = (["tf2"] if tf2 else []) + ["torch", "tf"] - for fw in framework_iterator(config, frameworks=frameworks): - for use_object_store in [False, True] if object_store else [False]: - print("use_object_store={}".format(use_object_store)) - cls = get_trainable_cls(algo_name) - if "DDPG" in algo_name or "SAC" in algo_name: - alg1 = cls(config=config, env="Pendulum-v1") - alg2 = cls(config=config, env="Pendulum-v1") - else: - alg1 = cls(config=config, env="CartPole-v1") - alg2 = cls(config=config, env="CartPole-v1") - - policy1 = alg1.get_policy() - - res = alg1.train() - print("current status: " + str(res)) - - # Check optimizer state as well. - optim_state = policy1.get_state().get("_optimizer_variables") - - if use_object_store: - checkpoint = alg1.save_to_object() - else: - checkpoint = alg1.save() - - # Test if we can restore multiple times (at least twice, assuming failure - # would mainly stem from improperly reused variables) - for num_restores in range(2): - # Sync the models - if use_object_store: - alg2.restore_from_object(checkpoint) - else: - alg2.restore(checkpoint) - - # Compare optimizer state with re-loaded one. 
- if optim_state: - s2 = alg2.get_policy().get_state().get("_optimizer_variables") - # Tf -> Compare states 1:1. - if fw in ["tf2", "tf"]: - check(s2, optim_state) - # For torch, optimizers have state_dicts with keys=params, - # which are different for the two models (ignore these - # different keys, but compare all values nevertheless). - else: - for i, s2_ in enumerate(s2): - check( - list(s2_["state"].values()), - list(optim_state[i]["state"].values()), - ) - - # Compare buffer content with restored one. - if replay_buffer: - data = alg1.local_replay_buffer.replay_buffers[ - "default_policy" - ]._storage[42 : 42 + 42] - new_data = alg2.local_replay_buffer.replay_buffers[ - "default_policy" - ]._storage[42 : 42 + 42] - check(data, new_data) - - for _ in range(1): - if "DDPG" in algo_name or "SAC" in algo_name: - obs = np.clip( - np.random.uniform(size=3), - policy1.observation_space.low, - policy1.observation_space.high, - ) - else: - obs = np.clip( - np.random.uniform(size=4), - policy1.observation_space.low, - policy1.observation_space.high, - ) - a1 = get_mean_action(alg1, obs) - a2 = get_mean_action(alg2, obs) - print("Checking computed actions", alg1, obs, a1, a2) - if abs(a1 - a2) > 0.1: - raise AssertionError( - "algo={} [a1={} a2={}]".format(algo_name, a1, a2) - ) - # Stop both algos. - alg1.stop() - alg2.stop() - - class TestCheckpointRestorePG(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(num_cpus=5) + ray.init() @classmethod def tearDownClass(cls): ray.shutdown() def test_a3c_checkpoint_restore(self): - ckpt_restore_test("A3C") + # TODO(Kourosh) A3C cannot run a restored algorithm for some reason. 
+ test_ckpt_restore( + algorithms_and_configs["A3C"], "CartPole-v1", run_restored_algorithm=False + ) def test_ppo_checkpoint_restore(self): - ckpt_restore_test("PPO", object_store=True) + test_ckpt_restore( + algorithms_and_configs["PPO"], "CartPole-v1", object_store=True + ) class TestCheckpointRestoreOffPolicy(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(num_cpus=5) + ray.init() @classmethod def tearDownClass(cls): ray.shutdown() def test_apex_ddpg_checkpoint_restore(self): - ckpt_restore_test("APEX_DDPG") + test_ckpt_restore(algorithms_and_configs["APEX_DDPG"], "Pendulum-v1") def test_ddpg_checkpoint_restore(self): - ckpt_restore_test("DDPG", replay_buffer=True) + test_ckpt_restore( + algorithms_and_configs["DDPG"], "Pendulum-v1", replay_buffer=True + ) def test_dqn_checkpoint_restore(self): - ckpt_restore_test("DQN", object_store=True, replay_buffer=True) + test_ckpt_restore( + algorithms_and_configs["DQN"], + "CartPole-v1", + object_store=True, + replay_buffer=True, + ) def test_sac_checkpoint_restore(self): - ckpt_restore_test("SAC", replay_buffer=True) + test_ckpt_restore( + algorithms_and_configs["SAC"], "Pendulum-v1", replay_buffer=True + ) def test_simpleq_checkpoint_restore(self): - ckpt_restore_test("SimpleQ", replay_buffer=True) + test_ckpt_restore( + algorithms_and_configs["SimpleQ"], "CartPole-v1", replay_buffer=True + ) class TestCheckpointRestoreEvolutionAlgos(unittest.TestCase): @classmethod def setUpClass(cls): - ray.init(num_cpus=5) + ray.init() @classmethod def tearDownClass(cls): ray.shutdown() def test_ars_checkpoint_restore(self): - ckpt_restore_test("ARS") + test_ckpt_restore(algorithms_and_configs["ARS"], "CartPole-v1") def test_es_checkpoint_restore(self): - ckpt_restore_test("ES") + test_ckpt_restore(algorithms_and_configs["ES"], "CartPole-v1") if __name__ == "__main__": diff --git a/rllib/tests/test_algorithm_save_load_checkpoint_learner.py b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py new file mode 
100644 index 000000000000..211d0dac10c7 --- /dev/null +++ b/rllib/tests/test_algorithm_save_load_checkpoint_learner.py @@ -0,0 +1,128 @@ +import tempfile +import unittest + +import ray +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig +from ray.rllib.utils.test_utils import check, framework_iterator +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY + + +algorithms_and_configs = { + "PPO": (PPOConfig().training(train_batch_size=2, sgd_minibatch_size=2)) +} + + +@ray.remote +def save_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): + """Create an algo, checkpoint it, then train for 2 iterations. + + Note: This function uses a seeded algorithm that can modify the global random state. + Running it multiple times in the same process can affect other algorithms. + Making it a Ray task runs it in a separate process and prevents it from + affecting other algorithms' random state. + + Args: + algo_cfg: The algorithm config to build the algo from. + env: The gym genvironment to train on. + tmpdir: The temporary directory to save the checkpoint to. + + Returns: + The learner stats after 2 iterations of training. 
+ """ + algo_cfg = ( + algo_cfg.training(_enable_learner_api=True) + .rl_module(_enable_rl_module_api=True) + .rollouts(num_rollout_workers=0) + # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 + # to make sure that we get results as soon as sampling/training is done at + # least once + .reporting(min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=1) + .debugging(seed=10) + ) + algo = algo_cfg.environment(env).build() + + tmpdir = str(tmpdir) + algo.save_checkpoint(tmpdir) + for _ in range(2): + results = algo.train() + return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY] + + +@ray.remote +def load_and_train(algo_cfg: AlgorithmConfig, env: str, tmpdir): + """Loads the checkpoint saved by save_and_train and trains for 2 iterations. + + Note: This function uses a seeded algorithm that can modify the global random state. + Running it multiple times in the same process can affect other algorithms. + Making it a Ray task runs it in a separate process and prevents it from + affecting other algorithms' random state. + + Args: + algo_cfg: The algorithm config to build the algo from. + env: The gym genvironment to train on. + tmpdir: The temporary directory to save the checkpoint to. + + Returns: + The learner stats after 2 iterations of training. 
+ + """ + algo_cfg = ( + algo_cfg.training(_enable_learner_api=True) + .rl_module(_enable_rl_module_api=True) + .rollouts(num_rollout_workers=0) + # setting min_time_s_per_iteration=0 and min_sample_timesteps_per_iteration=1 + # to make sure that we get results as soon as sampling/training is done at + # least once + .reporting(min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=1) + .debugging(seed=10) + ) + algo = algo_cfg.environment(env).build() + tmpdir = str(tmpdir) + algo.load_checkpoint(tmpdir) + for _ in range(2): + results = algo.train() + return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY] + + +class TestAlgorithmWithLearnerSaveAndRestore(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + ray.init() + + @classmethod + def tearDowClass(cls) -> None: + ray.shutdown() + + def test_save_and_restore(self): + for algo_name in algorithms_and_configs: + config = algorithms_and_configs[algo_name] + for _ in framework_iterator(config, frameworks=["torch", "tf2"]): + with tempfile.TemporaryDirectory() as tmpdir: + # create an algorithm, checkpoint it, then train for 2 iterations + ray.get(save_and_train.remote(config, "CartPole-v1", tmpdir)) + # load that checkpoint into a new algorithm and train for 2 + # iterations + results_algo_2 = ray.get( + load_and_train.remote(config, "CartPole-v1", tmpdir) + ) + + # load that checkpoint into another new algorithm and train for 2 + # iterations + results_algo_3 = ray.get( + load_and_train.remote(config, "CartPole-v1", tmpdir) + ) + + # check that the results are the same across loaded algorithms + # they won't be the same as the first algorithm since the random + # state that is used for each algorithm is not preserved across + # checkpoints. 
+ check(results_algo_3, results_algo_2) + + +if __name__ == "__main__": + import sys + import pytest + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/tests/test_node_failure.py b/rllib/tests/test_node_failure.py index ab1671b66704..3787eaa62a52 100644 --- a/rllib/tests/test_node_failure.py +++ b/rllib/tests/test_node_failure.py @@ -5,7 +5,7 @@ import ray from ray._private.test_utils import get_other_nodes from ray.cluster_utils import Cluster -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray.rllib.algorithms.ppo import PPO, PPOConfig diff --git a/rllib/tests/test_rllib_train_and_evaluate.py b/rllib/tests/test_rllib_train_and_evaluate.py index 172c86b1da96..0d540abb6a1d 100644 --- a/rllib/tests/test_rllib_train_and_evaluate.py +++ b/rllib/tests/test_rllib_train_and_evaluate.py @@ -96,12 +96,23 @@ def learn_test_plus_evaluate(algo: str, env="CartPole-v1"): print("Saving results to {}".format(tmp_dir)) rllib_dir = str(Path(__file__).parent.parent.absolute()) + + # This is only supported without RLModule API. See AlgorithmConfig for + # more info. We need to prefetch the default config that will be used when we + # call rllib train here to see if the RLModule API is enabled. 
+ algo_cls = get_trainable_cls(algo) + config = algo_cls.get_default_config() + if config._enable_rl_module_api: + eval_ = ', \\"evaluation_config\\": {}' + else: + eval_ = ', \\"evaluation_config\\": {\\"explore\\": false}' + print("RLlib dir = {}\nexists={}".format(rllib_dir, os.path.exists(rllib_dir))) os.system( "python {}/train.py --local-dir={} --run={} " "--checkpoint-freq=1 --checkpoint-at-end ".format(rllib_dir, tmp_dir, algo) - + '--config="{\\"num_gpus\\": 0, \\"num_workers\\": 1, ' - '\\"evaluation_config\\": {\\"explore\\": false}' + + '--config="{\\"num_gpus\\": 0, \\"num_workers\\": 1' + + eval_ + fw_ + '}" ' + '--stop="{\\"episode_reward_mean\\": 100.0}"' @@ -182,7 +193,8 @@ def policy_fn(agent_id, episode, **kwargs): policy_mapping_fn=policy_fn, ) .resources(num_gpus=0) - .evaluation(evaluation_config=AlgorithmConfig.overrides(explore=False)) + .evaluation(evaluation_config=AlgorithmConfig.overrides(explore=True)) + .evaluation(evaluation_config=AlgorithmConfig.overrides(explore=True)) .rl_module( rl_module_spec=MultiAgentRLModuleSpec( module_specs={ diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 7d7a975bd248..00d290a7677c 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -1,10 +1,6 @@ import logging -import time import unittest -import numpy as np -from gymnasium.spaces import Box, Dict, Discrete, Tuple, MultiDiscrete, MultiBinary - import ray from ray.rllib.algorithms.a3c import A3CConfig from ray.rllib.algorithms.appo import APPOConfig @@ -15,176 +11,10 @@ from ray.rllib.algorithms.impala import ImpalaConfig from ray.rllib.algorithms.ppo import PPOConfig from ray.rllib.algorithms.sac import SACConfig -from ray.rllib.examples.env.random_env import RandomEnv -from ray.rllib.models.tf.complex_input_net import ComplexInputNetwork as ComplexNet -from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNet -from ray.rllib.models.tf.visionnet import 
VisionNetwork as VisionNet -from ray.rllib.models.torch.complex_input_net import ( - ComplexInputNetwork as TorchComplexNet, -) -from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet -from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNet -from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.utils.test_utils import framework_iterator +from ray.rllib.utils.test_utils import check_supported_spaces -logger = logging.getLogger(__name__) -ACTION_SPACES_TO_TEST = { - # Test discrete twice here until we support multi_binary action spaces - "discrete": Discrete(5), - "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), - "int_actions": Box(0, 3, (2, 3), dtype=np.int32), - "multidiscrete": MultiDiscrete([1, 2, 3, 4]), - "tuple": Tuple([Discrete(2), Discrete(3), Box(-1.0, 1.0, (5,), dtype=np.float32)]), - "dict": Dict( - { - "action_choice": Discrete(3), - "parameters": Box(-1.0, 1.0, (1,), dtype=np.float32), - "yet_another_nested_dict": Dict({"a": Tuple([Discrete(2), Discrete(3)])}), - } - ), -} - -OBSERVATION_SPACES_TO_TEST = { - "multi_binary": MultiBinary([3, 10, 10]), - "discrete": Discrete(5), - "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), - "vector2d": Box(-1.0, 1.0, (5, 5), dtype=np.float32), - "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32), - "vizdoomgym": Box(-1.0, 1.0, (240, 320, 3), dtype=np.float32), - "tuple": Tuple([Discrete(10), Box(-1.0, 1.0, (5,), dtype=np.float32)]), - "dict": Dict( - { - "task": Discrete(10), - "position": Box(-1.0, 1.0, (5,), dtype=np.float32), - } - ), -} - -# TODO(Artur): Add back tf2 once we CNNs there -RLMODULE_SUPPORTED_FRAMEWORKS = {"torch"} - -# The action spaces that we test RLModules with -RLMODULE_SUPPORTED_ACTION_SPACES = ["discrete", "continuous"] - -# The observation spaces that we test RLModules with -RLMODULE_SUPPORTED_OBSERVATION_SPACES = [ - "multi_binary", - "discrete", - "continuous", - "image", - "vizdoomgym", - "tuple", - "dict", -] 
- -DEFAULT_OBSERVATION_SPACE = DEFAULT_ACTION_SPACE = "discrete" - - -def check_support(alg, config, train=True, check_bounds=False, tf2=False): - config["log_level"] = "ERROR" - config["env"] = RandomEnv - - def _do_check(alg, config, a_name, o_name): - # We need to copy here so that this validation does not affect the actual - # validation method call further down the line. - config_copy = config.copy() - config_copy.validate() - # If RLModules are enabled, we need to skip a few tests for now: - if config_copy._enable_rl_module_api: - # Skip PPO cases in which RLModules don't support the given spaces yet. - if o_name not in RLMODULE_SUPPORTED_OBSERVATION_SPACES: - logger.warning( - "Skipping PPO test with RLModules for obs space {}".format(o_name) - ) - return - if a_name not in RLMODULE_SUPPORTED_ACTION_SPACES: - logger.warning( - "Skipping PPO test with RLModules for action space {}".format( - a_name - ) - ) - return - - fw = config["framework"] - action_space = ACTION_SPACES_TO_TEST[a_name] - obs_space = OBSERVATION_SPACES_TO_TEST[o_name] - print( - "=== Testing {} (fw={}) action_space={} obs_space={} ===".format( - alg, fw, action_space, obs_space - ) - ) - t0 = time.time() - config.update_from_dict( - dict( - env_config=dict( - action_space=action_space, - observation_space=obs_space, - reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32), - p_terminated=1.0, - check_action_bounds=check_bounds, - ) - ) - ) - stat = "ok" - - try: - algo = config.build() - except ray.exceptions.RayActorError as e: - if len(e.args) >= 2 and isinstance(e.args[2], UnsupportedSpaceException): - stat = "unsupported" - elif isinstance(e.args[0].args[2], UnsupportedSpaceException): - stat = "unsupported" - else: - raise - except UnsupportedSpaceException: - stat = "unsupported" - else: - if alg not in ["DDPG", "ES", "ARS", "SAC", "PPO"]: - # 2D (image) input: Expect VisionNet. 
- if o_name in ["atari", "image"]: - if fw == "torch": - assert isinstance(algo.get_policy().model, TorchVisionNet) - else: - assert isinstance(algo.get_policy().model, VisionNet) - # 1D input: Expect FCNet. - elif o_name == "continuous": - if fw == "torch": - assert isinstance(algo.get_policy().model, TorchFCNet) - else: - assert isinstance(algo.get_policy().model, FCNet) - # Could be either one: ComplexNet (if disabled Preprocessor) - # or FCNet (w/ Preprocessor). - elif o_name == "vector2d": - if fw == "torch": - assert isinstance( - algo.get_policy().model, (TorchComplexNet, TorchFCNet) - ) - else: - assert isinstance(algo.get_policy().model, (ComplexNet, FCNet)) - if train: - algo.train() - algo.stop() - print("Test: {}, ran in {}s".format(stat, time.time() - t0)) - - frameworks = {"tf", "torch"} - if tf2: - frameworks.add("tf2") - - if config._enable_rl_module_api: - # Only test the frameworks that are supported by RLModules. - frameworks = frameworks.intersection(RLMODULE_SUPPORTED_FRAMEWORKS) - - for _ in framework_iterator(config, frameworks=frameworks): - # Test all action spaces first. - for a_name in ACTION_SPACES_TO_TEST.keys(): - o_name = DEFAULT_OBSERVATION_SPACE - _do_check(alg, config, a_name, o_name) - - # Now test all observation spaces. 
- for o_name in OBSERVATION_SPACES_TO_TEST.keys(): - a_name = DEFAULT_ACTION_SPACE - _do_check(alg, config, a_name, o_name) +logger = logging.getLogger(__name__) class TestSupportedSpacesIMPALA(unittest.TestCase): @@ -197,7 +27,7 @@ def tearDownClass(cls) -> None: ray.shutdown() def test_impala(self): - check_support( + check_supported_spaces( "IMPALA", ( ImpalaConfig() @@ -222,9 +52,8 @@ def test_appo(self): .resources(num_gpus=0) .training(vtrace=False, model={"fcnet_hiddens": [10]}) ) - check_support("APPO", config, train=False) config.training(vtrace=True) - check_support("APPO", config) + check_supported_spaces("APPO", config) class TestSupportedSpacesA3C(unittest.TestCase): @@ -245,7 +74,7 @@ def test_a3c(self): model={"fcnet_hiddens": [10]}, ) ) - check_support("A3C", config, check_bounds=True) + check_supported_spaces("A3C", config, check_bounds=True) class TestSupportedSpacesPPO(unittest.TestCase): @@ -270,7 +99,7 @@ def test_ppo(self): }, ) ) - check_support("PPO", config, check_bounds=True, tf2=True) + check_supported_spaces("PPO", config, check_bounds=True) class TestSupportedSpacesPPONoPreprocessorGPU(unittest.TestCase): @@ -311,7 +140,35 @@ def test_ppo_no_preprocessors_gpu(self): _enable_learner_api=False ) - check_support("PPO", config, check_bounds=True, tf2=True) + check_supported_spaces( + "PPO", + config, + check_bounds=True, + frameworks=["torch", "tf"], + use_gpu=True, + ) + + +class TestSupportedSpacesDQN(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + ray.init() + + @classmethod + def tearDownClass(cls) -> None: + ray.shutdown() + + def test_dqn(self): + config = ( + DQNConfig() + .reporting(min_sample_timesteps_per_iteration=1) + .training( + replay_buffer_config={ + "capacity": 1000, + } + ) + ) + check_supported_spaces("DQN", config, frameworks=["tf2", "torch", "tf"]) class TestSupportedSpacesOffPolicy(unittest.TestCase): @@ -324,7 +181,7 @@ def tearDownClass(cls) -> None: ray.shutdown() def test_ddpg(self): - 
check_support( + check_supported_spaces( "DDPG", DDPGConfig() .exploration(exploration_config={"ou_base_scale": 100.0}) @@ -336,20 +193,8 @@ def test_ddpg(self): check_bounds=True, ) - def test_dqn(self): - config = ( - DQNConfig() - .reporting(min_sample_timesteps_per_iteration=1) - .training( - replay_buffer_config={ - "capacity": 1000, - } - ) - ) - check_support("DQN", config, tf2=True) - def test_sac(self): - check_support( + check_supported_spaces( "SAC", SACConfig().training(replay_buffer_config={"capacity": 1000}), check_bounds=True, @@ -366,19 +211,23 @@ def tearDownClass(cls) -> None: ray.shutdown() def test_ars(self): - check_support( + check_supported_spaces( "ARS", ARSConfig() .rollouts(num_rollout_workers=1) .training(noise_size=1500000, num_rollouts=1, rollouts_used=1), + # framework=None corresponds to numpy since ARS uses a numpy policy + frameworks=[None], ) def test_es(self): - check_support( + check_supported_spaces( "ES", ESConfig() .rollouts(num_rollout_workers=1) .training(noise_size=1500000, episodes_per_batch=1, train_batch_size=1), + # framework=None corresponds to numpy since ES uses a numpy policy + frameworks=[None], ) diff --git a/rllib/tuned_examples/a2c/atari-a2c.yaml b/rllib/tuned_examples/a2c/atari-a2c.yaml index f2e933e92d29..b46c83498c47 100644 --- a/rllib/tuned_examples/a2c/atari-a2c.yaml +++ b/rllib/tuned_examples/a2c/atari-a2c.yaml @@ -11,8 +11,11 @@ atari-a2c: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. 
env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 train_batch_size: 500 rollout_fragment_length: auto clip_rewards: True diff --git a/rllib/tuned_examples/a2c/cartpole-a2c-fake-gpus.yaml b/rllib/tuned_examples/a2c/cartpole-a2c-fake-gpus.yaml index 8eb80d49610f..6c2e1d4964fc 100644 --- a/rllib/tuned_examples/a2c/cartpole-a2c-fake-gpus.yaml +++ b/rllib/tuned_examples/a2c/cartpole-a2c-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-a2c-fake-gpus: env: CartPole-v1 run: A2C stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 200 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml b/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml index c784917ff943..1beeae3619cf 100644 --- a/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml +++ b/rllib/tuned_examples/a2c/cartpole-a2c-microbatch.yaml @@ -2,7 +2,7 @@ cartpole-a2c-microbatch: env: CartPole-v1 run: A2C stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 1000000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/a2c/cartpole-a2c.yaml b/rllib/tuned_examples/a2c/cartpole-a2c.yaml index 73a01bf0b51e..c5b67de577bc 100644 --- a/rllib/tuned_examples/a2c/cartpole-a2c.yaml +++ b/rllib/tuned_examples/a2c/cartpole-a2c.yaml @@ -2,7 +2,7 @@ cartpole-a2c: env: CartPole-v1 run: A2C stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 500000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/a3c/cartpole-a3c.yaml b/rllib/tuned_examples/a3c/cartpole-a3c.yaml index 6835453d36d6..351cf07ac74b 100644 --- a/rllib/tuned_examples/a3c/cartpole-a3c.yaml +++ b/rllib/tuned_examples/a3c/cartpole-a3c.yaml @@ -2,7 +2,7 @@ cartpole-a3c: env: CartPole-v1 run: A3C stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 200000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/a3c/pong-a3c.yaml b/rllib/tuned_examples/a3c/pong-a3c.yaml index f1af764b5719..e5b4a114904f 100644 --- a/rllib/tuned_examples/a3c/pong-a3c.yaml +++ b/rllib/tuned_examples/a3c/pong-a3c.yaml @@ -6,8 +6,11 @@ pong-a3c: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - nondeterministic: False # deterministic + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 num_workers: 16 rollout_fragment_length: 20 vf_loss_coeff: 0.5 diff --git a/rllib/tuned_examples/alpha_star/multi-agent-cartpole-alpha-star.yaml b/rllib/tuned_examples/alpha_star/multi-agent-cartpole-alpha-star.yaml index 2b46df8ea8d6..daade938b441 100644 --- a/rllib/tuned_examples/alpha_star/multi-agent-cartpole-alpha-star.yaml +++ b/rllib/tuned_examples/alpha_star/multi-agent-cartpole-alpha-star.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-alpha-star: env: ray.rllib.examples.env.multi_agent.MultiAgentCartPole run: AlphaStar stop: - episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 + sampler_results/episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 timesteps_total: 200000 config: # Works for both torch and tf. 
@@ -22,7 +22,6 @@ multi-agent-cartpole-alpha-star: num_sgd_iter: 1 vf_loss_coeff: 0.005 vtrace: true - vtrace_drop_last_ts: false model: fcnet_hiddens: [32] fcnet_activation: linear diff --git a/rllib/tuned_examples/alpha_zero/cartpole-sparse-rewards-alpha-zero.yaml b/rllib/tuned_examples/alpha_zero/cartpole-sparse-rewards-alpha-zero.yaml index e1d8768f6dbf..a18e49c669cb 100644 --- a/rllib/tuned_examples/alpha_zero/cartpole-sparse-rewards-alpha-zero.yaml +++ b/rllib/tuned_examples/alpha_zero/cartpole-sparse-rewards-alpha-zero.yaml @@ -2,7 +2,7 @@ cartpole-sparse-rewards-alpha-zero: env: ray.rllib.examples.env.cartpole_sparse_rewards.CartPoleSparseRewards run: AlphaZero stop: - episode_reward_mean: 30.0 + sampler_results/episode_reward_mean: 30.0 timesteps_total: 100000 config: # Only supported for torch right now. diff --git a/rllib/tuned_examples/apex_ddpg/mountaincarcontinuous-apex-ddpg.yaml b/rllib/tuned_examples/apex_ddpg/mountaincarcontinuous-apex-ddpg.yaml index aa8bad0fea9c..a0fc3ba2fe24 100644 --- a/rllib/tuned_examples/apex_ddpg/mountaincarcontinuous-apex-ddpg.yaml +++ b/rllib/tuned_examples/apex_ddpg/mountaincarcontinuous-apex-ddpg.yaml @@ -3,7 +3,7 @@ mountaincarcontinuous-apex-ddpg: env: MountainCarContinuous-v0 run: APEX_DDPG stop: - episode_reward_mean: 90 + sampler_results/episode_reward_mean: 90 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/apex_ddpg/pendulum-apex-ddpg.yaml b/rllib/tuned_examples/apex_ddpg/pendulum-apex-ddpg.yaml index bf7d2a83c809..d6682897de59 100644 --- a/rllib/tuned_examples/apex_ddpg/pendulum-apex-ddpg.yaml +++ b/rllib/tuned_examples/apex_ddpg/pendulum-apex-ddpg.yaml @@ -3,7 +3,7 @@ pendulum-apex-ddpg: env: Pendulum-v1 run: APEX_DDPG stop: - episode_reward_mean: -160 + sampler_results/episode_reward_mean: -160 config: # Works for both torch and tf. 
framework: torch diff --git a/rllib/tuned_examples/apex_dqn/atari-apex-dqn.yaml b/rllib/tuned_examples/apex_dqn/atari-apex-dqn.yaml index 094582854b64..d3ed5f6d3071 100644 --- a/rllib/tuned_examples/apex_dqn/atari-apex-dqn.yaml +++ b/rllib/tuned_examples/apex_dqn/atari-apex-dqn.yaml @@ -4,11 +4,14 @@ apex-breakoutnoframeskip-v5: # Minimum reward and total ts (in given time_total_s) to pass this test. stop: time_total_s: 7200 - episode_reward_mean: 20.0 + sampler_results/episode_reward_mean: 20.0 timesteps_total: 7000000 config: + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 double_q: false dueling: false num_atoms: 1 diff --git a/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn-fake-gpus.yaml b/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn-fake-gpus.yaml index 2c67e2b6814b..64f2cf6e78b3 100644 --- a/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn-fake-gpus.yaml +++ b/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn-fake-gpus.yaml @@ -9,7 +9,7 @@ cartpole-apex-dqn: env: CartPole-v1 run: APEX stop: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 250000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn.yaml b/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn.yaml index 45e2bfbb986a..4ee7c2406326 100644 --- a/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn.yaml +++ b/rllib/tuned_examples/apex_dqn/cartpole-apex-dqn.yaml @@ -9,7 +9,7 @@ cartpole-apex-dqn-training-itr: env: CartPole-v1 run: APEX stop: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 250000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/apex_dqn/pong-apex-dqn.yaml b/rllib/tuned_examples/apex_dqn/pong-apex-dqn.yaml index 22ec786a2e6a..4516d520230b 100644 --- a/rllib/tuned_examples/apex_dqn/pong-apex-dqn.yaml +++ b/rllib/tuned_examples/apex_dqn/pong-apex-dqn.yaml @@ -5,13 +5,16 @@ pong-apex: env: ALE/Pong-v5 run: APEX stop: - episode_reward_mean: 19.0 + sampler_results/episode_reward_mean: 19.0 timesteps_total: 4000000 config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 target_network_update_freq: 20000 num_workers: 4 num_envs_per_worker: 8 diff --git a/rllib/tuned_examples/appo/cartpole-appo-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-learner.yaml deleted file mode 100644 index 0400f6708dc9..000000000000 --- a/rllib/tuned_examples/appo/cartpole-appo-learner.yaml +++ /dev/null @@ -1,31 +0,0 @@ -cartpole-appo-learner: - env: CartPole-v1 - run: APPO - stop: - episode_reward_mean: 150 - timesteps_total: 200000 - config: - # Works for both torch and tf. 
- framework: tf2 - num_workers: - grid_search: - - 3 - num_gpus: 0 - observation_filter: MeanStdFilter - num_sgd_iter: - grid_search: - - 6 - vf_loss_coeff: 0.01 - vtrace: True - model: - fcnet_hiddens: [32] - fcnet_activation: linear - vf_share_layers: true - enable_connectors: True - _enable_learner_api: True - _enable_rl_module_api: True - eager_tracing: True - lr: 0.001 - entropy_coeff: 0.1 - kl_coeff: 0.01 - exploration_config: null diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml b/rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml index bf6bbed83db8..73581ac2b267 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-vtrace-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-appo-vtrace-fake-gpus: env: CartPole-v1 run: APPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 400 config: # Works for both torch and tf. @@ -13,7 +13,6 @@ cartpole-appo-vtrace-fake-gpus: num_sgd_iter: 6 vf_loss_coeff: 0.01 vtrace: true - vtrace_drop_last_ts: false # Double batch size (2 GPUs). train_batch_size: 1000 diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml b/rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml index 8f55d7ed9f6a..970c36f1de17 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-vtrace-separate-losses.yaml @@ -2,7 +2,7 @@ cartpole-appo-vtrace-separate-losses: env: CartPole-v1 run: APPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 200000 config: # Only works for tf|tf2 so far. 
diff --git a/rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml b/rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml index f5cda41a6bef..1c4a9755a214 100644 --- a/rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo-vtrace.yaml @@ -2,7 +2,7 @@ cartpole-appo-vtrace: env: CartPole-v1 run: APPO stop: - episode_reward_mean: 180 + sampler_results/episode_reward_mean: 180 timesteps_total: 200000 config: # Works for both torch and tf. @@ -14,7 +14,6 @@ cartpole-appo-vtrace: num_sgd_iter: 1 vf_loss_coeff: 0.01 vtrace: true - vtrace_drop_last_ts: false model: fcnet_hiddens: [32] fcnet_activation: linear diff --git a/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml new file mode 100644 index 000000000000..34ec96700541 --- /dev/null +++ b/rllib/tuned_examples/appo/cartpole-appo-w-rl-modules-and-learner.yaml @@ -0,0 +1,27 @@ +cartpole-appo-w-rl-modules-and-learner: + env: CartPole-v1 + run: APPO + stop: + sampler_results/episode_reward_mean: 150 + timesteps_total: 200000 + config: + # Works for both torch and tf. + framework: torch + num_envs_per_worker: 5 + num_workers: 2 + train_batch_size: 1000 + lr: 0.001 + num_gpus: 0 + observation_filter: MeanStdFilter + vf_loss_coeff: 0.1 + vtrace: true + + enable_connectors: true + _enable_learner_api: true + _enable_rl_module_api: true + num_learner_workers: 2 + num_gpus_per_learner_worker: 0 + num_cpus_per_learner_worker: 1 + # Need to unset this b/c we are using the RLModule API, which + # provides exploration control via the RLModule's `forward_exploration` method. 
+ exploration_config: {} diff --git a/rllib/tuned_examples/appo/cartpole-appo.yaml b/rllib/tuned_examples/appo/cartpole-appo.yaml index a5eb4850a3b4..7ad2cc89be11 100644 --- a/rllib/tuned_examples/appo/cartpole-appo.yaml +++ b/rllib/tuned_examples/appo/cartpole-appo.yaml @@ -2,7 +2,7 @@ cartpole-appo: env: CartPole-v1 run: APPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 200000 config: # Works for both torch and tf. @@ -18,4 +18,4 @@ cartpole-appo: fcnet_hiddens: [32] fcnet_activation: linear vf_share_layers: true - enable_connectors: True + enable_connectors: true diff --git a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml index 2214fac64b9a..52587329d163 100644 --- a/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml +++ b/rllib/tuned_examples/appo/frozenlake-appo-vtrace.yaml @@ -2,7 +2,7 @@ frozenlake-appo-vtrace: env: FrozenLake-v1 run: APPO stop: - episode_reward_mean: 0.99 + sampler_results/episode_reward_mean: 0.99 timesteps_total: 1000000 config: # Works for both torch and tf. @@ -24,7 +24,6 @@ frozenlake-appo-vtrace: rollout_fragment_length: 10 batch_mode: complete_episodes vtrace: true - vtrace_drop_last_ts: false num_envs_per_worker: 5 num_workers: 4 diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-appo.yaml b/rllib/tuned_examples/appo/multi-agent-cartpole-appo.yaml index 648cf579b71e..71e521aefe20 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-appo.yaml +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-appo.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-appo: env: ray.rllib.examples.env.multi_agent.MultiAgentCartPole run: APPO stop: - episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 + sampler_results/episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 timesteps_total: 200000 config: # Works for both torch and tf. 
@@ -22,7 +22,6 @@ multi-agent-cartpole-appo: num_sgd_iter: 1 vf_loss_coeff: 0.005 vtrace: true - vtrace_drop_last_ts: false model: fcnet_hiddens: [32] fcnet_activation: linear diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-restart-env-appo.yaml b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-restart-env-appo.yaml index 1c7e9eed5793..34132fc02c92 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-restart-env-appo.yaml +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-crashing-restart-env-appo.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-crashing-appo: env: ray.rllib.examples.env.cartpole_crashing.MultiAgentCartPoleCrashing run: APPO stop: - evaluation/episode_reward_mean: 300.0 + evaluation/sampler_results/episode_reward_mean: 300.0 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py index 551c1e578562..0af819627e9b 100644 --- a/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py +++ b/rllib/tuned_examples/appo/multi-agent-cartpole-w-100-policies-appo.py @@ -32,7 +32,6 @@ num_sgd_iter=1, vf_loss_coeff=0.005, vtrace=True, - vtrace_drop_last_ts=False, ) .multi_agent( # 2 agents per sub-env. diff --git a/rllib/tuned_examples/appo/pendulum-appo.yaml b/rllib/tuned_examples/appo/pendulum-appo.yaml index cd35451574da..dddf26c364a3 100644 --- a/rllib/tuned_examples/appo/pendulum-appo.yaml +++ b/rllib/tuned_examples/appo/pendulum-appo.yaml @@ -2,7 +2,7 @@ pendulum-appo-vtrace: env: Pendulum-v1 run: APPO stop: - episode_reward_mean: -1000 # just check it learns a bit + sampler_results/episode_reward_mean: -1000 # just check it learns a bit timesteps_total: 500000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml
new file mode 100644
index 000000000000..79a6fbf2aef5
--- /dev/null
+++ b/rllib/tuned_examples/appo/pong-appo-w-rl-modules-and-learner.yaml
@@ -0,0 +1,55 @@
+# This can reach 18.0 reward in ~10 minutes on 4x M60 GPUs
+# with 30 rollout workers, 4 learning workers, and 8 envs per rollout worker.
+pong-appo:
+  env: ALE/Pong-v5
+  run: APPO
+  stop:
+    sampler_results/episode_reward_mean: 18.0
+    timesteps_total: 20000000
+  config:
+    # Works for both torch and tf.
+    framework: torch
+    # Make analogous to old v4 + NoFrameskip.
+    env_config:
+      frameskip: 1 # no frameskip
+      full_action_space: false
+      repeat_action_probability: 0.0 # deterministic
+    vtrace: true
+    use_kl_loss: false
+    rollout_fragment_length: 50
+    train_batch_size: 4000
+    lr: 0.0005
+    # On a 32 CPU machine (g3.2xlarge), we use 30 CPUs for the rollout workers
+    # and 4 for the learner workers.
+    num_workers: 30
+    broadcast_interval: 1
+    max_sample_requests_in_flight_per_worker: 1
+    num_envs_per_worker: 8
+    num_sgd_iter: 2
+    vf_loss_coeff: 1.0
+    clip_param: 0.3
+    num_gpus: 0
+
+    model:
+      dim: 42
+      conv_filters: [[16, 4, 2], [32, 4, 2], [256, 11, 1, "valid"]]
+      conv_activation: relu
+      conv_add_final_dense: false
+      conv_flattened_dim: 256
+      use_cnn_heads: true
+
+    # Run with Learner API.
+    _enable_learner_api: true
+    grad_clip_by_global_norm: 10.0
+    # Use N Learner workers, each on one GPU.
+    num_learner_workers: 4
+    num_gpus_per_learner_worker: 1
+    # Since we are using learner workers, the driver process does not need
+    # a CPU in particular.
+    num_cpus_for_local_worker: 1
+
+    # Run with RLModule API.
+    _enable_rl_module_api: true
+    # Need to unset this b/c we are using the RLModule API, which
+    # provides exploration control via the RLModule's `forward_exploration` method.
+ exploration_config: {} diff --git a/rllib/tuned_examples/appo/pong-appo.yaml b/rllib/tuned_examples/appo/pong-appo.yaml index 12c81437d4a4..8614de9c3cfc 100644 --- a/rllib/tuned_examples/appo/pong-appo.yaml +++ b/rllib/tuned_examples/appo/pong-appo.yaml @@ -7,15 +7,18 @@ pong-appo: env: ALE/Pong-v5 run: APPO stop: - episode_reward_mean: 18.0 + sampler_results/episode_reward_mean: 18.0 timesteps_total: 5000000 config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: frameskip: 1 # no frameskip - vtrace: True - use_kl_loss: False + full_action_space: false + repeat_action_probability: 0.0 # deterministic + vtrace: true + use_kl_loss: false rollout_fragment_length: 50 train_batch_size: 750 num_workers: 32 diff --git a/rllib/tuned_examples/bandits/interest-evolution-recsim-env-bandit-linucb.yaml b/rllib/tuned_examples/bandits/interest-evolution-recsim-env-bandit-linucb.yaml index 4157d4e38ffd..5984b9b471df 100644 --- a/rllib/tuned_examples/bandits/interest-evolution-recsim-env-bandit-linucb.yaml +++ b/rllib/tuned_examples/bandits/interest-evolution-recsim-env-bandit-linucb.yaml @@ -2,7 +2,7 @@ interest-evolution-recsim-env-bandit-linucb: env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv run: BanditLinUCB stop: - episode_reward_mean: 180.0 + sampler_results/episode_reward_mean: 180.0 timesteps_total: 50000 config: framework: torch diff --git a/rllib/tuned_examples/cql/pendulum-cql.yaml b/rllib/tuned_examples/cql/pendulum-cql.yaml index 7cba06eac9c3..5ba42e4b7a8c 100644 --- a/rllib/tuned_examples/cql/pendulum-cql.yaml +++ b/rllib/tuned_examples/cql/pendulum-cql.yaml @@ -6,7 +6,7 @@ pendulum-cql: env: Pendulum-v1 run: CQL stop: - evaluation/episode_reward_mean: -700 + evaluation/sampler_results/episode_reward_mean: -700 timesteps_total: 800000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/crr/cartpole-v1-crr.yaml b/rllib/tuned_examples/crr/cartpole-v1-crr.yaml index 2501ecd933f7..744c71730f8f 100644 --- a/rllib/tuned_examples/crr/cartpole-v1-crr.yaml +++ b/rllib/tuned_examples/crr/cartpole-v1-crr.yaml @@ -2,7 +2,7 @@ cartpole_crr: env: 'CartPole-v1' run: CRR stop: - evaluation/episode_reward_mean: 200 + evaluation/sampler_results/episode_reward_mean: 200 training_iteration: 100 config: input: 'dataset' diff --git a/rllib/tuned_examples/crr/cartpole-v1-crr_expectation.yaml b/rllib/tuned_examples/crr/cartpole-v1-crr_expectation.yaml index cd752aaef7f2..95863298d9f3 100644 --- a/rllib/tuned_examples/crr/cartpole-v1-crr_expectation.yaml +++ b/rllib/tuned_examples/crr/cartpole-v1-crr_expectation.yaml @@ -2,7 +2,7 @@ cartpole_crr: env: 'CartPole-v1' run: CRR stop: - evaluation/episode_reward_mean: 200 + evaluation/sampler_results/episode_reward_mean: 200 training_iteration: 100 config: input: 'dataset' diff --git a/rllib/tuned_examples/crr/pendulum-v1-crr.yaml b/rllib/tuned_examples/crr/pendulum-v1-crr.yaml index 539b3ee886b4..89373bb009cc 100644 --- a/rllib/tuned_examples/crr/pendulum-v1-crr.yaml +++ b/rllib/tuned_examples/crr/pendulum-v1-crr.yaml @@ -3,7 +3,7 @@ pendulum_crr: run: CRR stop: # We could make this -200, but given that we have 4 cpus for our tests, we will have to settle for -300. - evaluation/episode_reward_mean: -300 + evaluation/sampler_results/episode_reward_mean: -300 timesteps_total: 2000000 config: input: 'dataset' diff --git a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml index 51b70ef776b3..878fe239ec2f 100644 --- a/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml @@ -3,7 +3,7 @@ halfcheetah-ddpg: env: HalfCheetah-v2 run: DDPG stop: - episode_reward_mean: 2000 + sampler_results/episode_reward_mean: 2000 time_total_s: 5400 # 90 minutes config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml index ef8c9995d73b..fa8c278228e7 100644 --- a/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml @@ -3,7 +3,7 @@ ddpg-halfcheetahbulletenv-v0: env: HalfCheetahBulletEnv-v0 run: DDPG stop: - episode_reward_mean: -300.0 + sampler_results/episode_reward_mean: -300.0 timesteps_total: 200000 config: actor_hiddens: [256, 256] diff --git a/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml index c436cf5f5e8d..806641eb430e 100644 --- a/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml @@ -4,7 +4,7 @@ ddpg-hopperbulletenv-v0: run: DDPG # Minimum reward and total ts (in given time_total_s) to pass this test. pass_criteria: - episode_reward_mean: 120.0 + sampler_results/episode_reward_mean: 120.0 timesteps_total: 50000 stop: time_total_s: 2000 diff --git a/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml b/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml index 6d260436c0b9..0b5ed7f90995 100644 --- a/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/mountaincarcontinuous-ddpg.yaml @@ -3,7 +3,7 @@ mountaincarcontinuous-ddpg: env: MountainCarContinuous-v0 run: DDPG stop: - episode_reward_mean: 90 + sampler_results/episode_reward_mean: 90 time_total_s: 600 # 10 minutes config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml b/rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml index fa9780dfff89..325795577b05 100644 --- a/rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml +++ b/rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml @@ -2,7 +2,7 @@ pendulum-ddpg-fake-gpus: env: Pendulum-v1 run: DDPG stop: - episode_reward_mean: -1000 + sampler_results/episode_reward_mean: -1000 timesteps_total: 40000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml b/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml index a4db86b9d235..de3239bc306b 100644 --- a/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml +++ b/rllib/tuned_examples/ddpg/pendulum-ddpg.yaml @@ -3,7 +3,7 @@ pendulum-ddpg: env: Pendulum-v1 run: DDPG stop: - episode_reward_mean: -320 + sampler_results/episode_reward_mean: -320 timesteps_total: 30000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ddppo/cartpole-ddppo.yaml b/rllib/tuned_examples/ddppo/cartpole-ddppo.yaml index 24a072c33954..3332b6c29ce1 100644 --- a/rllib/tuned_examples/ddppo/cartpole-ddppo.yaml +++ b/rllib/tuned_examples/ddppo/cartpole-ddppo.yaml @@ -2,7 +2,7 @@ cartpole-ddppo: env: CartPole-v1 run: DDPPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 100000 config: framework: torch diff --git a/rllib/tuned_examples/ddppo/pendulum-ddppo.yaml b/rllib/tuned_examples/ddppo/pendulum-ddppo.yaml index c89e5574274a..32aa9e171087 100644 --- a/rllib/tuned_examples/ddppo/pendulum-ddppo.yaml +++ b/rllib/tuned_examples/ddppo/pendulum-ddppo.yaml @@ -2,7 +2,7 @@ pendulum-ddppo: env: Pendulum-v1 run: DDPPO stop: - episode_reward_mean: -300 + sampler_results/episode_reward_mean: -300 timesteps_total: 1500000 config: framework: torch diff --git a/rllib/tuned_examples/dqn/atari-dist-dqn.yaml b/rllib/tuned_examples/dqn/atari-dist-dqn.yaml index b381dedb7293..1366fca92f96 100644 --- 
a/rllib/tuned_examples/dqn/atari-dist-dqn.yaml +++ b/rllib/tuned_examples/dqn/atari-dist-dqn.yaml @@ -7,8 +7,11 @@ atari-dist-dqn: - ALE/SpaceInvaders-v5 run: DQN config: + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 double_q: false dueling: false num_atoms: 51 diff --git a/rllib/tuned_examples/dqn/atari-dqn.yaml b/rllib/tuned_examples/dqn/atari-dqn.yaml index 2e00b269936c..4e12176bf1f0 100644 --- a/rllib/tuned_examples/dqn/atari-dqn.yaml +++ b/rllib/tuned_examples/dqn/atari-dqn.yaml @@ -11,8 +11,11 @@ atari-basic-dqn: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 double_q: false dueling: false num_atoms: 1 diff --git a/rllib/tuned_examples/dqn/atari-duel-ddqn.yaml b/rllib/tuned_examples/dqn/atari-duel-ddqn.yaml index cfb15c8813b0..361eaeca346f 100644 --- a/rllib/tuned_examples/dqn/atari-duel-ddqn.yaml +++ b/rllib/tuned_examples/dqn/atari-duel-ddqn.yaml @@ -11,8 +11,11 @@ dueling-ddqn: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 double_q: true dueling: true num_atoms: 1 diff --git a/rllib/tuned_examples/dqn/cartpole-dqn-fake-gpus.yaml b/rllib/tuned_examples/dqn/cartpole-dqn-fake-gpus.yaml index f1970f915b7f..410826fd3227 100644 --- a/rllib/tuned_examples/dqn/cartpole-dqn-fake-gpus.yaml +++ b/rllib/tuned_examples/dqn/cartpole-dqn-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-dqn-fake-gpus: env: CartPole-v1 run: DQN stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 400 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/dqn/cartpole-dqn-param-noise.yaml b/rllib/tuned_examples/dqn/cartpole-dqn-param-noise.yaml index 968c810a4159..7da16136a00d 100644 --- a/rllib/tuned_examples/dqn/cartpole-dqn-param-noise.yaml +++ b/rllib/tuned_examples/dqn/cartpole-dqn-param-noise.yaml @@ -2,7 +2,7 @@ cartpole-dqn-w-param-noise: env: CartPole-v1 run: DQN stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 300000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/dqn/cartpole-dqn-softq.yaml b/rllib/tuned_examples/dqn/cartpole-dqn-softq.yaml index 6f24ca1e55f2..112838425c4b 100644 --- a/rllib/tuned_examples/dqn/cartpole-dqn-softq.yaml +++ b/rllib/tuned_examples/dqn/cartpole-dqn-softq.yaml @@ -2,7 +2,7 @@ cartpole-dqn: env: CartPole-v1 run: DQN stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 100000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/dqn/cartpole-dqn.yaml b/rllib/tuned_examples/dqn/cartpole-dqn.yaml index a699ea00a849..7ff4ad4a950e 100644 --- a/rllib/tuned_examples/dqn/cartpole-dqn.yaml +++ b/rllib/tuned_examples/dqn/cartpole-dqn.yaml @@ -2,7 +2,7 @@ cartpole-dqn: env: CartPole-v1 run: DQN stop: - episode_reward_mean: 100 + sampler_results/episode_reward_mean: 100 timesteps_total: 100000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/dqn/pong-dqn.yaml b/rllib/tuned_examples/dqn/pong-dqn.yaml index 65b003d14747..7ac6a3b59fa2 100644 --- a/rllib/tuned_examples/dqn/pong-dqn.yaml +++ b/rllib/tuned_examples/dqn/pong-dqn.yaml @@ -3,13 +3,16 @@ pong-deterministic-dqn: env: ALE/Pong-v5 run: DQN stop: - episode_reward_mean: 20 + sampler_results/episode_reward_mean: 20 time_total_s: 7200 config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. 
env_config: - nondeterministic: False # deterministic + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 num_gpus: 1 gamma: 0.99 lr: .0001 diff --git a/rllib/tuned_examples/dqn/pong-rainbow.yaml b/rllib/tuned_examples/dqn/pong-rainbow.yaml index 95e121e49c5a..60702c062ec1 100644 --- a/rllib/tuned_examples/dqn/pong-rainbow.yaml +++ b/rllib/tuned_examples/dqn/pong-rainbow.yaml @@ -2,10 +2,13 @@ pong-deterministic-rainbow: env: ALE/Pong-v5 run: DQN stop: - episode_reward_mean: 20 + sampler_results/episode_reward_mean: 20 config: + # Make analogous to old v4 + NoFrameskip. env_config: - nondeterministic: False # deterministic + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 num_atoms: 51 noisy: True gamma: 0.99 diff --git a/rllib/tuned_examples/dt/cartpole-v1-dt.yaml b/rllib/tuned_examples/dt/cartpole-v1-dt.yaml index 845bd6d91282..a4104487d134 100644 --- a/rllib/tuned_examples/dt/cartpole-v1-dt.yaml +++ b/rllib/tuned_examples/dt/cartpole-v1-dt.yaml @@ -2,7 +2,7 @@ cartpole_dt: env: 'CartPole-v1' run: DT stop: - evaluation/episode_reward_mean: 200 + evaluation/sampler_results/episode_reward_mean: 200 training_iteration: 100 config: input: 'dataset' diff --git a/rllib/tuned_examples/dt/pendulum-v1-dt.yaml b/rllib/tuned_examples/dt/pendulum-v1-dt.yaml index 9d67afd6b06e..d5d98f6fe4bf 100644 --- a/rllib/tuned_examples/dt/pendulum-v1-dt.yaml +++ b/rllib/tuned_examples/dt/pendulum-v1-dt.yaml @@ -3,7 +3,7 @@ pendulum_dt: run: DT stop: # We could make this higher, but given that we have 4 cpus for our tests, we will have to settle for -300. 
- evaluation/episode_reward_mean: -300 + evaluation/sampler_results/episode_reward_mean: -300 timesteps_total: 20000000 config: input: 'dataset' diff --git a/rllib/tuned_examples/dt/pendulum-v1-medium-expert-dt.yaml b/rllib/tuned_examples/dt/pendulum-v1-medium-expert-dt.yaml index 30b027f9c2a5..aaa7156bf363 100644 --- a/rllib/tuned_examples/dt/pendulum-v1-medium-expert-dt.yaml +++ b/rllib/tuned_examples/dt/pendulum-v1-medium-expert-dt.yaml @@ -3,7 +3,7 @@ pendulum_medium_expert_dt: run: DT stop: # We could make this higher, but given that we have 4 cpus for our tests, we will have to settle for -350. - evaluation/episode_reward_mean: -350 + evaluation/sampler_results/episode_reward_mean: -350 timesteps_total: 20000000 config: input: 'dataset' diff --git a/rllib/tuned_examples/es/cartpole-es.yaml b/rllib/tuned_examples/es/cartpole-es.yaml index 64b68a50c1ba..a87c26d3ffe8 100644 --- a/rllib/tuned_examples/es/cartpole-es.yaml +++ b/rllib/tuned_examples/es/cartpole-es.yaml @@ -2,7 +2,7 @@ cartpole-es: env: CartPole-v1 run: ES stop: - episode_reward_mean: 100 + sampler_results/episode_reward_mean: 100 timesteps_total: 500000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/es/humanoid-es.yaml b/rllib/tuned_examples/es/humanoid-es.yaml index 90aa8ef761fe..00e9a2ba068c 100644 --- a/rllib/tuned_examples/es/humanoid-es.yaml +++ b/rllib/tuned_examples/es/humanoid-es.yaml @@ -2,7 +2,7 @@ humanoid-v2-es: env: Humanoid-v2 run: ES stop: - episode_reward_mean: 6000 + sampler_results/episode_reward_mean: 6000 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/impala/atari-impala-large.yaml b/rllib/tuned_examples/impala/atari-impala-large.yaml index cc3f7363462f..8e8a882e84c7 100644 --- a/rllib/tuned_examples/impala/atari-impala-large.yaml +++ b/rllib/tuned_examples/impala/atari-impala-large.yaml @@ -11,8 +11,11 @@ atari-impala: stop: timesteps_total: 3000000 config: + # Make analogous to old v4 + NoFrameskip. 
env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 500 num_workers: 128 diff --git a/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml b/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml index cc1940a82240..35568b1092b4 100644 --- a/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml +++ b/rllib/tuned_examples/impala/atari-impala-multi-gpu.yaml @@ -6,8 +6,11 @@ atari-impala: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 4000 num_gpus: 4 diff --git a/rllib/tuned_examples/impala/atari-impala.yaml b/rllib/tuned_examples/impala/atari-impala.yaml index ec0661af2491..5c5a4d8fed9b 100644 --- a/rllib/tuned_examples/impala/atari-impala.yaml +++ b/rllib/tuned_examples/impala/atari-impala.yaml @@ -9,8 +9,11 @@ atari-impala: - ALE/SpaceInvaders-v5 run: IMPALA config: + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 500 num_workers: 32 diff --git a/rllib/tuned_examples/impala/cartpole-impala-fake-gpus.yaml b/rllib/tuned_examples/impala/cartpole-impala-fake-gpus.yaml index 9d871487bdff..1281e75e3f94 100644 --- a/rllib/tuned_examples/impala/cartpole-impala-fake-gpus.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-impala-fake-gpus: env: CartPole-v1 run: IMPALA stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 400 config: # Works for both torch and tf. @@ -17,5 +17,3 @@ cartpole-impala-fake-gpus: # Fake 2 GPUs. 
num_gpus: 2 _fake_gpus: true - - vtrace_drop_last_ts: false diff --git a/rllib/tuned_examples/impala/cartpole-impala.yaml b/rllib/tuned_examples/impala/cartpole-impala.yaml index 02ef13fed725..1df02c4313cb 100644 --- a/rllib/tuned_examples/impala/cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/cartpole-impala.yaml @@ -2,15 +2,14 @@ cartpole-impala: env: CartPole-v1 run: IMPALA stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 500000 config: # Works for both torch and tf. framework: tf2 num_gpus: 0 - vtrace_drop_last_ts: false - _enable_rl_module_api: True - _enable_learner_api: True + _enable_rl_module_api: true + _enable_learner_api: true grad_clip: 40 num_workers: 2 num_learner_workers: 1 diff --git a/rllib/tuned_examples/impala/multi-agent-cartpole-impala.yaml b/rllib/tuned_examples/impala/multi-agent-cartpole-impala.yaml index 7f2232615c5d..56f2ac106207 100644 --- a/rllib/tuned_examples/impala/multi-agent-cartpole-impala.yaml +++ b/rllib/tuned_examples/impala/multi-agent-cartpole-impala.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-impala: env: ray.rllib.examples.env.multi_agent.MultiAgentCartPole run: IMPALA stop: - episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 + sampler_results/episode_reward_mean: 600 # 600 / 4 (==num_agents) = 150 timesteps_total: 200000 config: # Works for both torch and tf. 
@@ -22,7 +22,6 @@ multi-agent-cartpole-impala: num_sgd_iter: 1 vf_loss_coeff: 0.005 vtrace: true - vtrace_drop_last_ts: false model: fcnet_hiddens: [32] fcnet_activation: linear diff --git a/rllib/tuned_examples/impala/pendulum-impala.yaml b/rllib/tuned_examples/impala/pendulum-impala.yaml index fccb247e3127..4ce4ba03a826 100644 --- a/rllib/tuned_examples/impala/pendulum-impala.yaml +++ b/rllib/tuned_examples/impala/pendulum-impala.yaml @@ -2,5 +2,5 @@ pendulum-impala-tf: env: Pendulum-v1 run: IMPALA stop: - episode_reward_mean: -700 + sampler_results/episode_reward_mean: -700 timesteps_total: 500000 diff --git a/rllib/tuned_examples/impala/pong-impala-fast.yaml b/rllib/tuned_examples/impala/pong-impala-fast.yaml index 50840de91ba4..d038f207af6c 100644 --- a/rllib/tuned_examples/impala/pong-impala-fast.yaml +++ b/rllib/tuned_examples/impala/pong-impala-fast.yaml @@ -7,8 +7,11 @@ pong-impala-fast: env: ALE/Pong-v5 run: IMPALA config: + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 1000 num_workers: 128 diff --git a/rllib/tuned_examples/impala/pong-impala-vectorized.yaml b/rllib/tuned_examples/impala/pong-impala-vectorized.yaml index fe6912c24c07..9623bd8d1e27 100644 --- a/rllib/tuned_examples/impala/pong-impala-vectorized.yaml +++ b/rllib/tuned_examples/impala/pong-impala-vectorized.yaml @@ -5,8 +5,11 @@ pong-impala-vectorized: env: ALE/Pong-v5 run: IMPALA config: + # Make analogous to old v4 + NoFrameskip. 
env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 500 num_workers: 32 diff --git a/rllib/tuned_examples/impala/pong-impala.yaml b/rllib/tuned_examples/impala/pong-impala.yaml index 901d1e8daa8e..b003be9b850e 100644 --- a/rllib/tuned_examples/impala/pong-impala.yaml +++ b/rllib/tuned_examples/impala/pong-impala.yaml @@ -7,8 +7,11 @@ pong-impala: env: ALE/Pong-v5 run: IMPALA config: + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 rollout_fragment_length: 50 train_batch_size: 500 num_workers: 128 diff --git a/rllib/tuned_examples/maddpg/two-step-game-maddpg.yaml b/rllib/tuned_examples/maddpg/two-step-game-maddpg.yaml index 6b50cf6aaa0e..69ed92ba2bb9 100644 --- a/rllib/tuned_examples/maddpg/two-step-game-maddpg.yaml +++ b/rllib/tuned_examples/maddpg/two-step-game-maddpg.yaml @@ -2,7 +2,7 @@ two-step-game-maddpg: env: ray.rllib.examples.env.two_step_game.TwoStepGame run: MADDPG stop: - episode_reward_mean: 7.2 + sampler_results/episode_reward_mean: 7.2 timesteps_total: 20000 config: # MADDPG only supports tf for now. diff --git a/rllib/tuned_examples/mbmpo/cartpole-mbmpo.yaml b/rllib/tuned_examples/mbmpo/cartpole-mbmpo.yaml index bf2a9676225d..2d37a5a2b818 100644 --- a/rllib/tuned_examples/mbmpo/cartpole-mbmpo.yaml +++ b/rllib/tuned_examples/mbmpo/cartpole-mbmpo.yaml @@ -2,7 +2,7 @@ cartpole-mbmpo: env: ray.rllib.examples.env.mbmpo_env.CartPoleWrapper run: MBMPO stop: - episode_reward_mean: 190 + sampler_results/episode_reward_mean: 190 training_iteration: 20 config: # Only supported in torch right now. 
diff --git a/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml b/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml index 67b0b263c519..2ae964ab1e03 100644 --- a/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml +++ b/rllib/tuned_examples/mbmpo/pendulum-mbmpo.yaml @@ -2,7 +2,7 @@ pendulum-mbmpo: env: ray.rllib.examples.env.mbmpo_env.PendulumWrapper run: MBMPO stop: - episode_reward_mean: -500 + sampler_results/episode_reward_mean: -500 training_iteration: 50 config: # Only supported in torch right now. diff --git a/rllib/tuned_examples/pg/cartpole-crashing-pg.yaml b/rllib/tuned_examples/pg/cartpole-crashing-pg.yaml index 39ce012f1ae6..babfdc4368f7 100644 --- a/rllib/tuned_examples/pg/cartpole-crashing-pg.yaml +++ b/rllib/tuned_examples/pg/cartpole-crashing-pg.yaml @@ -2,7 +2,7 @@ cartpole-crashing-pg: env: ray.rllib.examples.env.cartpole_crashing.CartPoleCrashing run: PG stop: - evaluation/episode_reward_mean: 150.0 + evaluation/sampler_results/episode_reward_mean: 150.0 num_env_steps_sampled: 150000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/pg/cartpole-crashing-with-remote-envs-pg.yaml b/rllib/tuned_examples/pg/cartpole-crashing-with-remote-envs-pg.yaml index 09e8b8c6e3fd..14bd428787bb 100644 --- a/rllib/tuned_examples/pg/cartpole-crashing-with-remote-envs-pg.yaml +++ b/rllib/tuned_examples/pg/cartpole-crashing-with-remote-envs-pg.yaml @@ -2,7 +2,7 @@ cartpole-crashing-with-remote-envs-pg: env: ray.rllib.examples.env.cartpole_crashing.CartPoleCrashing run: PG stop: - evaluation/episode_reward_mean: 35.0 + evaluation/sampler_results/episode_reward_mean: 35.0 timesteps_total: 25000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/pg/cartpole-pg-fake-gpus.yaml b/rllib/tuned_examples/pg/cartpole-pg-fake-gpus.yaml index 774e646d6831..99472f649f6a 100644 --- a/rllib/tuned_examples/pg/cartpole-pg-fake-gpus.yaml +++ b/rllib/tuned_examples/pg/cartpole-pg-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-pg-fake-gpus: env: CartPole-v1 run: PG stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 600 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/pg/cartpole-pg.yaml b/rllib/tuned_examples/pg/cartpole-pg.yaml index b288f9c2143c..e17ec213ceeb 100644 --- a/rllib/tuned_examples/pg/cartpole-pg.yaml +++ b/rllib/tuned_examples/pg/cartpole-pg.yaml @@ -2,7 +2,7 @@ cartpole-pg: env: CartPole-v1 run: PG stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 100000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-restart-sub-envs-pg.yaml b/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-restart-sub-envs-pg.yaml index b24578b1a4d7..43c322637d91 100644 --- a/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-restart-sub-envs-pg.yaml +++ b/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-restart-sub-envs-pg.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-crashing-pg: env: ray.rllib.examples.env.cartpole_crashing.MultiAgentCartPoleCrashing run: PG stop: - evaluation/episode_reward_mean: 300.0 + evaluation/sampler_results/episode_reward_mean: 300.0 num_env_steps_sampled: 300000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml b/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml index 39009d9f4fc5..492c08f6e9bb 100644 --- a/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml +++ b/rllib/tuned_examples/pg/multi-agent-cartpole-crashing-with-remote-envs-pg.yaml @@ -2,7 +2,7 @@ multi-agent-cartpole-crashing-pg: env: ray.rllib.examples.env.cartpole_crashing.MultiAgentCartPoleCrashing run: PG stop: - evaluation/episode_reward_mean: 300.0 + evaluation/sampler_results/episode_reward_mean: 300.0 num_env_steps_sampled: 300000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ppo/atari-ppo.yaml b/rllib/tuned_examples/ppo/atari-ppo.yaml index 187db074d6eb..22a024da04d2 100644 --- a/rllib/tuned_examples/ppo/atari-ppo.yaml +++ b/rllib/tuned_examples/ppo/atari-ppo.yaml @@ -11,8 +11,11 @@ atari-ppo: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 lambda: 0.95 kl_coeff: 0.5 clip_rewards: True diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml index 08b47c188ff8..6263aa2c2131 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-ppo-fake-gpus: env: CartPole-v1 run: PPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 400 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml index 8bed23ee363d..25638e5ac864 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo-hyperband.yaml @@ -3,7 +3,7 @@ cartpole-ppo: run: PPO num_samples: 3 stop: - episode_reward_mean: 200 + sampler_results/episode_reward_mean: 200 time_total_s: 180 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml b/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml index a675da81eaf2..2f6afebd53ef 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo-with-rl-module.yaml @@ -2,7 +2,7 @@ cartpole-ppo: env: CartPole-v1 run: PPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 100000 config: # Both torch and tf2 works. @@ -19,5 +19,5 @@ cartpole-ppo: vf_share_layers: true enable_connectors: true _enable_rl_module_api: true - _enable_learner_api: false + _enable_learner_api: true eager_tracing: false \ No newline at end of file diff --git a/rllib/tuned_examples/ppo/cartpole-ppo.yaml b/rllib/tuned_examples/ppo/cartpole-ppo.yaml index dea271014ef6..1922f1d6256b 100644 --- a/rllib/tuned_examples/ppo/cartpole-ppo.yaml +++ b/rllib/tuned_examples/ppo/cartpole-ppo.yaml @@ -2,7 +2,7 @@ cartpole-ppo: env: CartPole-v1 run: PPO stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 100000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml index b220b615dabe..8e442f6a0492 100644 --- a/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml +++ b/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml @@ -2,7 +2,7 @@ halfcheetah-ppo: env: HalfCheetah-v2 run: PPO stop: - episode_reward_mean: 9800 + sampler_results/episode_reward_mean: 9800 time_total_s: 10800 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml index 2e76777afe20..707bab654f44 100644 --- a/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml @@ -2,7 +2,7 @@ humanoid-ppo-gae: env: Humanoid-v1 run: PPO stop: - episode_reward_mean: 6000 + sampler_results/episode_reward_mean: 6000 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/ppo/humanoid-ppo.yaml b/rllib/tuned_examples/ppo/humanoid-ppo.yaml index 0b5cf0955480..88dee3fe8b2c 100644 --- a/rllib/tuned_examples/ppo/humanoid-ppo.yaml +++ b/rllib/tuned_examples/ppo/humanoid-ppo.yaml @@ -2,7 +2,7 @@ humanoid-ppo: env: Humanoid-v1 run: PPO stop: - episode_reward_mean: 6000 + sampler_results/episode_reward_mean: 6000 config: # Works for both torch and tf. 
framework: torch diff --git a/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml b/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml index 04c98cd56cb8..98da67a36fdb 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo-with-rl-module.yaml @@ -3,11 +3,10 @@ pendulum-ppo: env: Pendulum-v1 run: PPO stop: - episode_reward_mean: -400 + sampler_results/episode_reward_mean: -400 timesteps_total: 400000 config: # Works for both torch and tf2 - # TODO (Kourosh) tf2 is way slower than torch framework: torch train_batch_size: 512 vf_clip_param: 10.0 @@ -22,6 +21,8 @@ pendulum-ppo: enable_connectors: true model: fcnet_activation: relu + _enable_learner_api: true _enable_rl_module_api: true + # Need to unset this b/c we are using the RLModule API, which + # provides exploration control via the RLModule's `forward_exploration` method. exploration_config: {} - eager_tracing: false diff --git a/rllib/tuned_examples/ppo/pendulum-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-ppo.yaml index 113f593cef72..607e736c2905 100644 --- a/rllib/tuned_examples/ppo/pendulum-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-ppo.yaml @@ -3,7 +3,7 @@ pendulum-ppo: env: Pendulum-v1 run: PPO stop: - episode_reward_mean: -400 + sampler_results/episode_reward_mean: -400 timesteps_total: 400000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml index 7be2eaa18fcd..64c15f113700 100644 --- a/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml +++ b/rllib/tuned_examples/ppo/pendulum-transformed-actions-ppo.yaml @@ -3,7 +3,7 @@ pendulum-ppo: env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum run: PPO stop: - episode_reward_mean: -500 + sampler_results/episode_reward_mean: -500 timesteps_total: 400000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/ppo/pong-ppo.yaml b/rllib/tuned_examples/ppo/pong-ppo.yaml index 2aabde87d949..3da49952e0ae 100644 --- a/rllib/tuned_examples/ppo/pong-ppo.yaml +++ b/rllib/tuned_examples/ppo/pong-ppo.yaml @@ -8,8 +8,11 @@ pong-ppo: config: # Works for both torch and tf. framework: torch + # Make analogous to old v4 + NoFrameskip. env_config: - frameskip: 1 # no frameskip + frameskip: 1 + full_action_space: false + repeat_action_probability: 0.0 lambda: 0.95 kl_coeff: 0.5 clip_rewards: True diff --git a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml b/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml index abead1cc0647..808f1955dab0 100644 --- a/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml +++ b/rllib/tuned_examples/ppo/recomm-sys001-ppo.yaml @@ -2,7 +2,7 @@ recomm-sys001-ppo: env: ray.rllib.examples.env.recommender_system_envs.RecommSys001 run: PPO stop: - #evaluation/episode_reward_mean: 48.0 + #evaluation/sampler_results/episode_reward_mean: 48.0 timesteps_total: 200000 config: framework: torch diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml index 6a4d74b8a326..00fb269ff1d0 100644 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml @@ -3,7 +3,7 @@ repeat-after-me-ppo-w-lstm: env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv run: PPO stop: - episode_reward_mean: 50 + sampler_results/episode_reward_mean: 50 timesteps_total: 100000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml index 65d314f42ea1..29b0c2c8d5db 100644 --- a/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml +++ b/rllib/tuned_examples/qmix/two-step-game-qmix-no-mixer.yaml @@ -2,7 +2,7 @@ two-step-game-qmix-without-mixer: env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents run: QMIX stop: - episode_reward_mean: 6.5 + sampler_results/episode_reward_mean: 6.5 timesteps_total: 70000 config: # QMIX only supports torch for now. diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml index 59f7560ec160..0ef024653849 100644 --- a/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml +++ b/rllib/tuned_examples/qmix/two-step-game-qmix-vdn-mixer.yaml @@ -2,7 +2,7 @@ two-step-game-qmix-with-vdn-mixer: env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents run: QMIX stop: - episode_reward_mean: 6.5 + sampler_results/episode_reward_mean: 6.5 timesteps_total: 70000 config: # QMIX only supports torch for now. diff --git a/rllib/tuned_examples/qmix/two-step-game-qmix.yaml b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml index 1bf036f7bba8..9e462559bd39 100644 --- a/rllib/tuned_examples/qmix/two-step-game-qmix.yaml +++ b/rllib/tuned_examples/qmix/two-step-game-qmix.yaml @@ -2,7 +2,7 @@ two-step-game-qmix-with-qmix-mixer: env: ray.rllib.examples.env.two_step_game.TwoStepGameWithGroupedAgents run: QMIX stop: - episode_reward_mean: 7.5 + sampler_results/episode_reward_mean: 7.5 timesteps_total: 70000 config: # QMIX only supports torch for now. 
diff --git a/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2-fake-gpus.yaml b/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2-fake-gpus.yaml index 3888d5f90811..9634d6819836 100644 --- a/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2-fake-gpus.yaml +++ b/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2-fake-gpus.yaml @@ -2,7 +2,7 @@ stateless-cartpole-r2d2: env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole run: R2D2 stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 1000000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2.yaml b/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2.yaml index 0288315e6a2d..8389ca24e08c 100644 --- a/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2.yaml +++ b/rllib/tuned_examples/r2d2/stateless-cartpole-r2d2.yaml @@ -2,7 +2,7 @@ stateless-cartpole-r2d2: env: ray.rllib.examples.env.stateless_cartpole.StatelessCartPole run: R2D2 stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 1000000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml b/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml index 86da2f1d661c..9e27a2664faa 100644 --- a/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml +++ b/rllib/tuned_examples/sac/cartpole-continuous-pybullet-sac.yaml @@ -2,7 +2,7 @@ cartpole-sac: env: CartPoleContinuousBulletEnv-v0 run: SAC stop: - episode_reward_mean: 40 + sampler_results/episode_reward_mean: 40 timesteps_total: 100000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/sac/cartpole-sac.yaml b/rllib/tuned_examples/sac/cartpole-sac.yaml index b6ab12fe630a..c599ede52e89 100644 --- a/rllib/tuned_examples/sac/cartpole-sac.yaml +++ b/rllib/tuned_examples/sac/cartpole-sac.yaml @@ -2,7 +2,7 @@ cartpole-sac: env: CartPole-v1 run: SAC stop: - episode_reward_mean: 150.0 + sampler_results/episode_reward_mean: 150.0 timesteps_total: 100000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml index 5cabaaca07eb..f48d6049a98a 100644 --- a/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml +++ b/rllib/tuned_examples/sac/halfcheetah-pybullet-sac.yaml @@ -2,7 +2,7 @@ halfcheetah-pybullet-sac: env: HalfCheetahBulletEnv-v0 run: SAC stop: - episode_reward_mean: 800.0 + sampler_results/episode_reward_mean: 800.0 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/sac/halfcheetah-sac.yaml b/rllib/tuned_examples/sac/halfcheetah-sac.yaml index aa6c312109d4..b69aabfe0536 100644 --- a/rllib/tuned_examples/sac/halfcheetah-sac.yaml +++ b/rllib/tuned_examples/sac/halfcheetah-sac.yaml @@ -3,7 +3,7 @@ halfcheetah_sac: env: HalfCheetah-v3 run: SAC stop: - episode_reward_mean: 9000 + sampler_results/episode_reward_mean: 9000 config: # Works for both torch and tf. framework: torch diff --git a/rllib/tuned_examples/sac/mspacman-sac.yaml b/rllib/tuned_examples/sac/mspacman-sac.yaml index 09e9d9386912..86ef3ca442f9 100644 --- a/rllib/tuned_examples/sac/mspacman-sac.yaml +++ b/rllib/tuned_examples/sac/mspacman-sac.yaml @@ -5,7 +5,7 @@ mspacman-sac-tf: env: ALE/MsPacman-v5 run: SAC stop: - episode_reward_mean: 800 + sampler_results/episode_reward_mean: 800 timesteps_total: 100000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml b/rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml index 96ddf12bd3dc..fb20bf925aa0 100644 --- a/rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml +++ b/rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml @@ -2,7 +2,7 @@ pendulum-sac-fake-gpus: env: Pendulum-v1 run: SAC stop: - episode_reward_mean: -270 + sampler_results/episode_reward_mean: -270 timesteps_total: 10000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/sac/pendulum-sac.yaml b/rllib/tuned_examples/sac/pendulum-sac.yaml index fcc662eaa7c4..7fedc4ecd8ac 100644 --- a/rllib/tuned_examples/sac/pendulum-sac.yaml +++ b/rllib/tuned_examples/sac/pendulum-sac.yaml @@ -4,7 +4,7 @@ pendulum-sac: env: Pendulum-v1 run: SAC stop: - episode_reward_mean: -250 + sampler_results/episode_reward_mean: -250 timesteps_total: 10000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml b/rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml index 5d98ebf7fc55..44ff5ebd1789 100644 --- a/rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml +++ b/rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml @@ -4,7 +4,7 @@ transformed-actions-pendulum-sac-dummy-torch: env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum run: SAC stop: - episode_reward_mean: -200 + sampler_results/episode_reward_mean: -200 timesteps_total: 10000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/simple_q/cartpole-simpleq-fake-gpus.yaml b/rllib/tuned_examples/simple_q/cartpole-simpleq-fake-gpus.yaml index 59e57f8ba127..55833b1abcb5 100644 --- a/rllib/tuned_examples/simple_q/cartpole-simpleq-fake-gpus.yaml +++ b/rllib/tuned_examples/simple_q/cartpole-simpleq-fake-gpus.yaml @@ -2,7 +2,7 @@ cartpole-simpleq-fake-gpus: env: CartPole-v1 run: SimpleQ stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 training_iteration: 400 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/simple_q/cartpole-simpleq-test.yaml b/rllib/tuned_examples/simple_q/cartpole-simpleq-test.yaml index bd0490687782..1464f22703e5 100644 --- a/rllib/tuned_examples/simple_q/cartpole-simpleq-test.yaml +++ b/rllib/tuned_examples/simple_q/cartpole-simpleq-test.yaml @@ -2,7 +2,7 @@ cartpole-simpleq-test: env: CartPole-v1 run: SimpleQ stop: - episode_reward_mean: 50.0 + sampler_results/episode_reward_mean: 50.0 timesteps_total: 10000 config: framework: torch diff --git a/rllib/tuned_examples/simple_q/cartpole-simpleq.yaml b/rllib/tuned_examples/simple_q/cartpole-simpleq.yaml index 3b7bc198ddde..ab507415992c 100644 --- a/rllib/tuned_examples/simple_q/cartpole-simpleq.yaml +++ b/rllib/tuned_examples/simple_q/cartpole-simpleq.yaml @@ -2,7 +2,7 @@ cartpole-simpleq: env: CartPole-v1 run: SimpleQ stop: - episode_reward_mean: 150 + sampler_results/episode_reward_mean: 150 timesteps_total: 50000 config: # Works for both torch and tf. 
diff --git a/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq-fake-gpus.yaml b/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq-fake-gpus.yaml index d5fc1ccc5c2e..8ef384ce8571 100644 --- a/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq-fake-gpus.yaml +++ b/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq-fake-gpus.yaml @@ -2,7 +2,7 @@ interest-evolution-recsim-env-slateq: env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv run: SlateQ stop: - episode_reward_mean: 160.0 + sampler_results/episode_reward_mean: 160.0 timesteps_total: 100000 config: framework: torch diff --git a/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq.yaml b/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq.yaml index 86056565bca9..1f2cdc53e574 100644 --- a/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq.yaml +++ b/rllib/tuned_examples/slateq/interest-evolution-10-candidates-recsim-env-slateq.yaml @@ -2,7 +2,7 @@ interest-evolution-recsim-env-slateq: env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv run: SlateQ stop: - episode_reward_mean: 160.0 + sampler_results/episode_reward_mean: 160.0 timesteps_total: 120000 config: framework: torch diff --git a/rllib/tuned_examples/slateq/interest-evolution-50-candidates-recsim-env-slateq.yaml b/rllib/tuned_examples/slateq/interest-evolution-50-candidates-recsim-env-slateq.yaml index b698457260ae..668cfaf95051 100644 --- a/rllib/tuned_examples/slateq/interest-evolution-50-candidates-recsim-env-slateq.yaml +++ b/rllib/tuned_examples/slateq/interest-evolution-50-candidates-recsim-env-slateq.yaml @@ -2,7 +2,7 @@ interest-evolution-recsim-env-slateq: env: ray.rllib.examples.env.recommender_system_envs_with_recsim.InterestEvolutionRecSimEnv run: SlateQ stop: - 
episode_reward_mean: 162.0 + sampler_results/episode_reward_mean: 162.0 timesteps_total: 300000 config: framework: tf2 diff --git a/rllib/tuned_examples/slateq/long-term-satisfaction-recsim-env-slateq.yaml b/rllib/tuned_examples/slateq/long-term-satisfaction-recsim-env-slateq.yaml index 9f5419517c01..a83f7186361e 100644 --- a/rllib/tuned_examples/slateq/long-term-satisfaction-recsim-env-slateq.yaml +++ b/rllib/tuned_examples/slateq/long-term-satisfaction-recsim-env-slateq.yaml @@ -5,7 +5,7 @@ long-term-satisfaction-recsim-env-slateq: # Random baseline rewards: # num_candidates=20; slate_size=2; resample=true: ~951 # num_candidates=50; slate_size=3; resample=true: ~946 - evaluation/episode_reward_mean: 1000.0 + evaluation/sampler_results/episode_reward_mean: 1000.0 timesteps_total: 200000 config: # Works for both tf and torch. diff --git a/rllib/tuned_examples/slateq/parametric-item-reco-env-slateq.yaml b/rllib/tuned_examples/slateq/parametric-item-reco-env-slateq.yaml index e2bf73023eec..fdbc25476836 100644 --- a/rllib/tuned_examples/slateq/parametric-item-reco-env-slateq.yaml +++ b/rllib/tuned_examples/slateq/parametric-item-reco-env-slateq.yaml @@ -2,7 +2,7 @@ parametric-item-reco-env-slateq: env: ray.rllib.examples.env.bandit_envs_recommender_system.ParametricItemRecoEnv run: SlateQ stop: - #evaluation/episode_reward_mean: 48.0 + #evaluation/sampler_results/episode_reward_mean: 48.0 timesteps_total: 200000 config: # SlateQ only supported for torch so far. 
diff --git a/rllib/tuned_examples/slateq/recomm-sys001-slateq.yaml b/rllib/tuned_examples/slateq/recomm-sys001-slateq.yaml index 356629592885..2fff86d59ea9 100644 --- a/rllib/tuned_examples/slateq/recomm-sys001-slateq.yaml +++ b/rllib/tuned_examples/slateq/recomm-sys001-slateq.yaml @@ -2,7 +2,7 @@ recomm-sys001-slateq: env: ray.rllib.examples.env.recommender_system_envs.RecommSys001 run: SlateQ stop: - #evaluation/episode_reward_mean: 48.0 + #evaluation/sampler_results/episode_reward_mean: 48.0 timesteps_total: 200000 config: # SlateQ only supported for torch so far. diff --git a/rllib/tuned_examples/td3/invertedpendulum-td3.yaml b/rllib/tuned_examples/td3/invertedpendulum-td3.yaml index c0b1092bd513..081a88c1d1f8 100644 --- a/rllib/tuned_examples/td3/invertedpendulum-td3.yaml +++ b/rllib/tuned_examples/td3/invertedpendulum-td3.yaml @@ -5,7 +5,7 @@ invertedpendulum-td3: env: InvertedPendulum-v2 run: TD3 stop: - episode_reward_mean: 9999.9 + sampler_results/episode_reward_mean: 9999.9 time_total_s: 900 # 15 minutes timesteps_total: 1000000 config: diff --git a/rllib/tuned_examples/td3/pendulum-td3-fake-gpus.yaml b/rllib/tuned_examples/td3/pendulum-td3-fake-gpus.yaml index 4a2a383fec86..09e135049377 100644 --- a/rllib/tuned_examples/td3/pendulum-td3-fake-gpus.yaml +++ b/rllib/tuned_examples/td3/pendulum-td3-fake-gpus.yaml @@ -2,7 +2,7 @@ pendulum-td3-fake-gpus: env: Pendulum-v1 run: TD3 stop: - episode_reward_mean: -900 + sampler_results/episode_reward_mean: -900 timesteps_total: 100000 config: # Works for both torch and tf. diff --git a/rllib/tuned_examples/td3/pendulum-td3.yaml b/rllib/tuned_examples/td3/pendulum-td3.yaml index d772f16c6ac5..038caaa7e791 100644 --- a/rllib/tuned_examples/td3/pendulum-td3.yaml +++ b/rllib/tuned_examples/td3/pendulum-td3.yaml @@ -3,7 +3,7 @@ pendulum-td3: env: Pendulum-v1 run: TD3 stop: - episode_reward_mean: -900 + sampler_results/episode_reward_mean: -900 timesteps_total: 100000 config: # Works for both torch and tf. 
diff --git a/rllib/utils/__init__.py b/rllib/utils/__init__.py index 14ecece9a2ab..402f747e1cc4 100644 --- a/rllib/utils/__init__.py +++ b/rllib/utils/__init__.py @@ -2,10 +2,15 @@ from functools import partial from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI -from ray.rllib.utils.framework import try_import_tf, try_import_tfp, try_import_torch from ray.rllib.utils.deprecation import deprecation_warning -from ray.rllib.utils.filter_manager import FilterManager from ray.rllib.utils.filter import Filter +from ray.rllib.utils.filter_manager import FilterManager +from ray.rllib.utils.framework import ( + try_import_jax, + try_import_tf, + try_import_tfp, + try_import_torch, +) from ray.rllib.utils.numpy import ( sigmoid, softmax, @@ -120,6 +125,7 @@ def __exit__(self, *args): "relu", "sigmoid", "softmax", + "try_import_jax", "try_import_tf", "try_import_tfp", "try_import_torch", diff --git a/rllib/utils/actor_manager.py b/rllib/utils/actor_manager.py index 1d06bb9cca97..d8d0a86606e5 100644 --- a/rllib/utils/actor_manager.py +++ b/rllib/utils/actor_manager.py @@ -232,8 +232,8 @@ class _ActorState: def __init__( self, actors: Optional[List[ActorHandle]] = None, - max_remote_requests_in_flight_per_actor: Optional[int] = 2, - init_id: Optional[int] = 0, + max_remote_requests_in_flight_per_actor: int = 2, + init_id: int = 0, ): """Construct a FaultTolerantActorManager. @@ -738,7 +738,7 @@ def fetch_ready_async_reqs( Automatically mark actors unhealthy if they fail to respond. Note: If tags is an empty tuple then results from all ready async requests are - returned. + returned. Args: timeout_seconds: Ray.get() timeout. 
Default is 0 (only those that are diff --git a/rllib/utils/checkpoints.py b/rllib/utils/checkpoints.py index 7bbc456ca148..19e5cc145b31 100644 --- a/rllib/utils/checkpoints.py +++ b/rllib/utils/checkpoints.py @@ -29,7 +29,11 @@ # 1.1: Same as 1.0, but has a new "format" field in the rllib_checkpoint.json file # indicating, whether the checkpoint is `cloudpickle` (default) or `msgpack`. + +# 1.2: Introduces the checkpoint for the new Learner API if the Learner api is enabled. + CHECKPOINT_VERSION = version.Version("1.1") +CHECKPOINT_VERSION_LEARNER = version.Version("1.2") @PublicAPI(stability="alpha") @@ -102,15 +106,15 @@ def get_checkpoint_info(checkpoint: Union[str, Checkpoint]) -> Dict[str, Any]: rllib_checkpoint_info["checkpoint_version"] ) info.update(rllib_checkpoint_info) - - # No rllib_checkpoint.json file present: Warn and continue trying to figure out - # checkpoint info ourselves. - if log_once("no_rllib_checkpoint_json_file"): - logger.warning( - "No `rllib_checkpoint.json` file found in checkpoint directory " - f"{checkpoint}! Trying to extract checkpoint info from other files " - f"found in that dir." - ) + else: + # No rllib_checkpoint.json file present: Warn and continue trying to figure + # out checkpoint info ourselves. + if log_once("no_rllib_checkpoint_json_file"): + logger.warning( + "No `rllib_checkpoint.json` file found in checkpoint directory " + f"{checkpoint}! Trying to extract checkpoint info from other files " + f"found in that dir." + ) # Policy checkpoint file found. for extension in ["pkl", "msgpck"]: @@ -222,7 +226,10 @@ def convert_to_msgpack_checkpoint( state["worker"]["is_policy_to_train"] = NOT_SERIALIZABLE # Add RLlib checkpoint version (as string). - state["checkpoint_version"] = str(CHECKPOINT_VERSION) + if state["config"]["_enable_learner_api"]: + state["checkpoint_version"] = str(CHECKPOINT_VERSION_LEARNER) + else: + state["checkpoint_version"] = str(CHECKPOINT_VERSION) # Write state (w/o policies) to disk. 
state_file = os.path.join(msgpack_checkpoint_dir, "algorithm_state.msgpck") diff --git a/rllib/utils/debug/deterministic.py b/rllib/utils/debug/deterministic.py index f41fdabf323b..d3696c92b54d 100644 --- a/rllib/utils/debug/deterministic.py +++ b/rllib/utils/debug/deterministic.py @@ -37,10 +37,7 @@ def update_global_seed_if_necessary( if cuda_version is not None and float(torch.version.cuda) >= 10.2: os.environ["CUBLAS_WORKSPACE_CONFIG"] = "4096:8" else: - try: - from packaging.version import Version - except ImportError: - from distutils.version import LooseVersion as Version + from packaging.version import Version if Version(torch.__version__) >= Version("1.8.0"): # Not all Operations support this. diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index 4e969347c58e..fc5830adf051 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -28,6 +28,9 @@ def do_test_explorations(config, dummy_obs, prev_a=None, expected_mean_action=No for exploration in [None, "Random"]: local_config = config.copy() if exploration == "Random": + if local_config._enable_rl_module_api: + # TODO(Artur): Support Random exploration with RL Modules. 
+ continue local_config.exploration(exploration_config={"type": "Random"}) print("exploration={}".format(exploration or "default")) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index a94a71151a96..5e6f138f13db 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -290,6 +290,8 @@ def get_variable( ) elif framework == "torch" and torch_tensor is True: torch, _ = try_import_torch() + if not isinstance(value, np.ndarray): + value = np.array(value) var_ = torch.from_numpy(value) if dtype in [torch.float32, np.float32]: var_ = var_.float() diff --git a/rllib/utils/metrics/window_stat.py b/rllib/utils/metrics/window_stat.py index 8b270b4a2206..ff6c15569797 100644 --- a/rllib/utils/metrics/window_stat.py +++ b/rllib/utils/metrics/window_stat.py @@ -2,7 +2,7 @@ class WindowStat: - """Handles/stores incoming datastream and provides window-based statistics. + """Handles/stores incoming dataset and provides window-based statistics. Examples: >>> win_stats = WindowStat("level", 3) diff --git a/rllib/utils/minibatch_utils.py b/rllib/utils/minibatch_utils.py index 2920e32e4080..f3dd7c2c19ab 100644 --- a/rllib/utils/minibatch_utils.py +++ b/rllib/utils/minibatch_utils.py @@ -61,9 +61,9 @@ def __iter__(self): if len(module_batch) == 0: raise ValueError( - f"The batch for module_id {module_id} is empty. " + f"The batch for module_id {module_id} is empty! " "This will create an infinite loop because we need to cover " - "the same number of samples for each module_id. " + "the same number of samples for each module_id." 
) s = self._start[module_id] # start n_steps = self._minibatch_size diff --git a/rllib/utils/schedules/scheduler.py b/rllib/utils/schedules/scheduler.py new file mode 100644 index 000000000000..7d349329791b --- /dev/null +++ b/rllib/utils/schedules/scheduler.py @@ -0,0 +1,157 @@ +from collections import defaultdict +from typing import List, Optional, Tuple + +from ray.rllib.core.rl_module.rl_module import ModuleID +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule +from ray.rllib.utils.typing import TensorType + + +_, tf, _ = try_import_tf() +torch, _ = try_import_torch() + + +class Scheduler: + """Class to manage a scheduled (framework-dependent) tensor variable. + + Uses the PiecewiseSchedule (for maximum configuration flexibility) + """ + + def __init__( + self, + *, + fixed_value: Optional[float] = None, + schedule: Optional[List[Tuple[int, float]]] = None, + framework: str = "torch", + device: Optional[str] = None, + ): + """Initializes a Scheduler instance. + + Args: + fixed_value: A fixed, constant value (in case no schedule should be used). + Set `schedule` to None to always just use this fixed value. + If `fixed_value` is None, `schedule` must be provided. + schedule: The schedule configuration to use. In the format of + [[timestep, value], [timestep, value], ...] + Intermediary timesteps will be assigned to interpolated values (linear + interpolation will be used). A schedule config's first entry must + start with timestep 0, i.e.: [[0, initial_value], [...]]. + framework: The framework string, for which to create the tensor variable + that hold the current value. This is the variable that can be used in + the graph, e.g. in a loss function. + device: Optional device (for torch) to place the tensor variable on. 
+ """ + self.use_schedule = schedule is not None + self.framework = framework + self.device = device + + if self.use_schedule: + # Custom schedule, based on list of + # ([ts], [value to be reached by ts])-tuples. + self.schedule_per_module = defaultdict( + lambda: PiecewiseSchedule( + schedule, + outside_value=schedule[-1][-1], + framework=None, + ) + ) + # As initial tensor valie, use the first timestep's (must be 0) value. + self.curr_value_per_module = defaultdict( + lambda: self._create_tensor_variable(initial_value=schedule[0][1]) + ) + # If no schedule, pin (fix) given value. + else: + self.curr_value_per_module = defaultdict(lambda: fixed_value) + + @staticmethod + def validate( + schedule: Optional[List[Tuple[int, float]]], + schedule_name: str, + value_name: str, + ) -> None: + """Performs checking of a certain schedule configuration. + + The first entry in `schedule` must have a timestep of 0. + + Args: + schedule: The schedule configuration to check. In the format of + [[timestep, value], [timestep, value], ...] + Intermediary timesteps will be assigned to interpolated values (linear + interpolation will be used). A schedule config's first entry must + start with timestep 0, i.e.: [[0, initial_value], [...]]. + schedule_name: The name of the schedule, e.g. `lr_schedule`. + value_name: A full text description of the variable that's being scheduled, + e.g. `learning rate`. + + Raises: + ValueError: In case, errors are found in the schedule's format. + """ + if schedule is not None: + if not isinstance(schedule, (list, tuple)) or (len(schedule) < 2): + raise ValueError( + f"Invalid `{schedule_name}` ({schedule}) specified! Must be a " + "list of at least 2 tuples, each of the form " + f"(`timestep`, `{value_name} to reach`), e.g. " + "`[(0, 0.001), (1e6, 0.0001), (2e6, 0.00005)]`." 
+ ) + elif schedule[0][0] != 0: + raise ValueError( + f"When providing a `{schedule_name}`, the first timestep must be 0 " + f"and the corresponding lr value is the initial {value_name}! You " + f"provided ts={schedule[0][0]} {value_name}={schedule[0][1]}." + ) + + def get_current_value(self, module_id: ModuleID) -> TensorType: + """Returns the current value (as a tensor variable), given a ModuleID. + + Args: + module_id: The module ID, for which to retrueve the current tensor value. + + Returns: + The tensor variable (holding the current value to be used). + """ + return self.curr_value_per_module[module_id] + + def update(self, module_id: ModuleID, timestep: int) -> float: + """Updates the underlying (framework specific) tensor variable. + + Args: + module_id: The module ID, for which to update the tensor variable. + timestep: The current timestep. + + Returns: + The current value of the tensor variable as a python float. + """ + if self.use_schedule: + python_value = self.schedule_per_module[module_id].value(t=timestep) + if self.framework == "torch": + self.curr_value_per_module[module_id].data = torch.tensor(python_value) + else: + self.curr_value_per_module[module_id].assign(python_value) + else: + python_value = self.curr_value_per_module[module_id] + + return python_value + + def _create_tensor_variable(self, initial_value: float) -> TensorType: + """Creates a framework-specific tensor variable to be scheduled. + + Args: + initial_value: The initial (float) value for the variable to hold. + + Returns: + The created framework-specific tensor variable. 
+ """ + if self.framework == "torch": + return torch.tensor( + initial_value, + requires_grad=False, + dtype=torch.float32, + device=self.device, + ) + else: + return tf.Variable( + initial_value, + trainable=False, + dtype=tf.float32, + ) diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py index c60cbd5ae633..fd57836fb47b 100644 --- a/rllib/utils/test_utils.py +++ b/rllib/utils/test_utils.py @@ -1,7 +1,9 @@ from collections import Counter import copy import gymnasium as gym -from gymnasium.spaces import Box +from gymnasium.spaces import Box, Discrete, MultiDiscrete, MultiBinary +from gymnasium.spaces import Dict as GymDict +from gymnasium.spaces import Tuple as GymTuple import logging import numpy as np import os @@ -34,6 +36,9 @@ ) from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import PartialAlgorithmConfigDict, ResultDict +from ray.rllib.utils.error import UnsupportedSpaceException + + from ray.tune import CLIReporter, run_experiments @@ -781,9 +786,9 @@ def should_check_eval(experiment): check_eval = should_check_eval(e) episode_reward_key = ( - "episode_reward_mean" + "sampler_results/episode_reward_mean" if not check_eval - else "evaluation/episode_reward_mean" + else "evaluation/sampler_results/episode_reward_mean" ) # For smoke-tests, we just run for n min. @@ -899,14 +904,18 @@ def should_check_eval(experiment): if check_eval: episode_reward_mean = np.mean( [ - t.metric_analysis["evaluation/episode_reward_mean"]["max"] + t.metric_analysis[ + "evaluation/sampler_results/episode_reward_mean" + ]["max"] for t in trials_for_experiment ] ) else: episode_reward_mean = np.mean( [ - t.metric_analysis["episode_reward_mean"]["max"] + t.metric_analysis["sampler_results/episode_reward_mean"][ + "max" + ] for t in trials_for_experiment ] ) @@ -1261,3 +1270,323 @@ def check(self, rtol=None): # same input and all nets have the same (dummy) weight values. 
for v in self.output_values.values(): check(v, self.output_values[main_key], rtol=rtol or 0.002) + + +def _get_mean_action_from_algorithm(alg: "Algorithm", obs: np.ndarray) -> np.ndarray: + """Returns the mean action computed by the given algorithm. + + Note: This makes calls to `Algorithm.compute_single_action` + + Args: + alg: The constructed algorithm to run inference on. + obs: The observation to compute the action for. + + Returns: + The mean action computed by the algorithm over 5000 samples. + + """ + out = [] + for _ in range(5000): + out.append(float(alg.compute_single_action(obs))) + return np.mean(out) + + +def test_ckpt_restore( + config: "AlgorithmConfig", + env_name: str, + tf2=False, + object_store=False, + replay_buffer=False, + run_restored_algorithm=True, +): + """Test that after an algorithm is trained, its checkpoint can be restored. + + Check the replay buffers of the algorithm to see if they have identical data. + Check the optimizer weights of the policy on the algorithm to see if they're + identical. + + Args: + config: The config of the algorithm to be trained. + env_name: The name of the gymansium environment to be trained on. + tf2: Whether to test the algorithm with the tf2 framework or not. + object_store: Whether to test checkpointing with objects from the object store. + replay_buffer: Whether to test checkpointing with replay buffers. + run_restored_algorithm: Whether to run the restored algorithm after restoring. + + """ + # config = algorithms_and_configs[algo_name].to_dict() + # If required, store replay buffer data in checkpoints as well. 
+ if replay_buffer: + config["store_buffer_in_checkpoints"] = True + + frameworks = (["tf2"] if tf2 else []) + ["torch", "tf"] + for fw in framework_iterator(config, frameworks=frameworks): + for use_object_store in [False, True] if object_store else [False]: + print("use_object_store={}".format(use_object_store)) + env = gym.make(env_name) + alg1 = config.environment(env_name).framework(fw).build() + alg2 = config.environment(env_name).build() + + policy1 = alg1.get_policy() + + res = alg1.train() + print("current status: " + str(res)) + + # Check optimizer state as well. + optim_state = policy1.get_state().get("_optimizer_variables") + + if use_object_store: + checkpoint = alg1.save_to_object() + else: + checkpoint = alg1.save() + + # Test if we can restore multiple times (at least twice, assuming failure + # would mainly stem from improperly reused variables) + for num_restores in range(2): + # Sync the models + if use_object_store: + alg2.restore_from_object(checkpoint) + else: + alg2.restore(checkpoint) + + # Compare optimizer state with re-loaded one. + if optim_state: + s2 = alg2.get_policy().get_state().get("_optimizer_variables") + # Tf -> Compare states 1:1. + if fw in ["tf2", "tf"]: + check(s2, optim_state) + # For torch, optimizers have state_dicts with keys=params, + # which are different for the two models (ignore these + # different keys, but compare all values nevertheless). + else: + for i, s2_ in enumerate(s2): + check( + list(s2_["state"].values()), + list(optim_state[i]["state"].values()), + ) + + # Compare buffer content with restored one. 
+ if replay_buffer: + data = alg1.local_replay_buffer.replay_buffers[ + "default_policy" + ]._storage[42 : 42 + 42] + new_data = alg2.local_replay_buffer.replay_buffers[ + "default_policy" + ]._storage[42 : 42 + 42] + check(data, new_data) + + for _ in range(1): + obs = env.observation_space.sample() + a1 = _get_mean_action_from_algorithm(alg1, obs) + a2 = _get_mean_action_from_algorithm(alg2, obs) + print("Checking computed actions", alg1, obs, a1, a2) + if abs(a1 - a2) > 0.1: + raise AssertionError( + "algo={} [a1={} a2={}]".format(str(alg1.__class__), a1, a2) + ) + # Stop algo 1. + alg1.stop() + + if run_restored_algorithm: + # Check that algo 2 can still run. + print("Starting second run on Algo 2...") + alg2.train() + alg2.stop() + + +def check_supported_spaces( + alg: str, + config: "AlgorithmConfig", + train: bool = True, + check_bounds: bool = False, + frameworks: List = None, + use_gpu: bool = False, +): + """Checks whether the given algorithm supports different action and obs spaces. + + Performs the checks by constructing an rllib algorithm from the config and + checking to see that the model inside the policy is the correct one given + the action and obs spaces. For example if the action space is discrete and + the obs space is an image, then the model should be a vision network with + a categorical action distribution. + + Args: + alg: The name of the algorithm to test. + config: The config to use for the algorithm. + train: Whether to train the algorithm for a few iterations. + check_bounds: Whether to check the bounds of the action space. + frameworks: The frameworks to test the algorithm with. + use_gpu: Whether to check support for training on a gpu. 
+ + + """ + # do these imports here because otherwise we have circular imports + from ray.rllib.examples.env.random_env import RandomEnv + from ray.rllib.models.tf.complex_input_net import ComplexInputNetwork as ComplexNet + from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNet + from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNet + from ray.rllib.models.torch.complex_input_net import ( + ComplexInputNetwork as TorchComplexNet, + ) + from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet + from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNet + + action_spaces_to_test = { + # Test discrete twice here until we support multi_binary action spaces + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "int_actions": Box(0, 3, (2, 3), dtype=np.int32), + "multidiscrete": MultiDiscrete([1, 2, 3, 4]), + "tuple": GymTuple( + [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5,), dtype=np.float32)] + ), + "dict": GymDict( + { + "action_choice": Discrete(3), + "parameters": Box(-1.0, 1.0, (1,), dtype=np.float32), + "yet_another_nested_dict": GymDict( + {"a": GymTuple([Discrete(2), Discrete(3)])} + ), + } + ), + } + + observation_spaces_to_test = { + "multi_binary": MultiBinary([3, 10, 10]), + "discrete": Discrete(5), + "continuous": Box(-1.0, 1.0, (5,), dtype=np.float32), + "vector2d": Box(-1.0, 1.0, (5, 5), dtype=np.float32), + "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32), + "vizdoomgym": Box(-1.0, 1.0, (240, 320, 3), dtype=np.float32), + "tuple": GymTuple([Discrete(10), Box(-1.0, 1.0, (5,), dtype=np.float32)]), + "dict": GymDict( + { + "task": Discrete(10), + "position": Box(-1.0, 1.0, (5,), dtype=np.float32), + } + ), + } + + # The observation spaces that we test RLModules with + rlmodule_supported_observation_spaces = [ + "multi_binary", + "discrete", + "continuous", + "image", + "vizdoomgym", + "tuple", + "dict", + ] + + # TODO(Artur): Add back tf2 once we CNNs 
there + rlmodule_supported_frameworks = {"torch"} + + # The action spaces that we test RLModules with + rlmodule_supported_action_spaces = ["discrete", "continuous"] + + default_observation_space = default_action_space = "discrete" + + config["log_level"] = "ERROR" + config["env"] = RandomEnv + + def _do_check(alg, config, a_name, o_name): + + # We need to copy here so that this validation does not affect the actual + # validation method call further down the line. + config_copy = config.copy() + config_copy.validate() + # If RLModules are enabled, we need to skip a few tests for now: + if config_copy._enable_rl_module_api: + # Skip PPO cases in which RLModules don't support the given spaces yet. + if o_name not in rlmodule_supported_observation_spaces: + logger.warning( + "Skipping PPO test with RLModules for obs space {}".format(o_name) + ) + return + if a_name not in rlmodule_supported_action_spaces: + logger.warning( + "Skipping PPO test with RLModules for action space {}".format( + a_name + ) + ) + return + + fw = config["framework"] + action_space = action_spaces_to_test[a_name] + obs_space = observation_spaces_to_test[o_name] + print( + "=== Testing {} (fw={}) action_space={} obs_space={} ===".format( + alg, fw, action_space, obs_space + ) + ) + t0 = time.time() + config.update_from_dict( + dict( + env_config=dict( + action_space=action_space, + observation_space=obs_space, + reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32), + p_terminated=1.0, + check_action_bounds=check_bounds, + ) + ) + ) + stat = "ok" + + try: + algo = config.build() + except ray.exceptions.RayActorError as e: + if len(e.args) >= 2 and isinstance(e.args[2], UnsupportedSpaceException): + stat = "unsupported" + elif isinstance(e.args[0].args[2], UnsupportedSpaceException): + stat = "unsupported" + else: + raise + except UnsupportedSpaceException: + stat = "unsupported" + else: + if alg not in ["DDPG", "ES", "ARS", "SAC", "PPO"]: + # 2D (image) input: Expect VisionNet. 
+ if o_name in ["atari", "image"]: + if fw == "torch": + assert isinstance(algo.get_policy().model, TorchVisionNet) + else: + assert isinstance(algo.get_policy().model, VisionNet) + # 1D input: Expect FCNet. + elif o_name == "continuous": + if fw == "torch": + assert isinstance(algo.get_policy().model, TorchFCNet) + else: + assert isinstance(algo.get_policy().model, FCNet) + # Could be either one: ComplexNet (if disabled Preprocessor) + # or FCNet (w/ Preprocessor). + elif o_name == "vector2d": + if fw == "torch": + assert isinstance( + algo.get_policy().model, (TorchComplexNet, TorchFCNet) + ) + else: + assert isinstance(algo.get_policy().model, (ComplexNet, FCNet)) + if train: + algo.train() + algo.stop() + print("Test: {}, ran in {}s".format(stat, time.time() - t0)) + + if config._enable_rl_module_api: + # Only test the frameworks that are supported by RLModules. + frameworks = frameworks.intersection(rlmodule_supported_frameworks) + + if not frameworks: + frameworks = ["tf2", "torch", "tf"] + _do_check_remote = ray.remote(_do_check) + _do_check_remote = _do_check_remote.options(num_gpus=1 if use_gpu else 0) + for _ in framework_iterator(config, frameworks=frameworks): + # Test all action spaces first. + for a_name in action_spaces_to_test.keys(): + o_name = default_observation_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) + + # Now test all observation spaces. 
+ for o_name in observation_spaces_to_test.keys(): + a_name = default_action_space + ray.get(_do_check_remote.remote(alg, config, a_name, o_name)) diff --git a/rllib/utils/tests/test_actor_manager.py b/rllib/utils/tests/test_actor_manager.py index f65f925d0508..cf11efde0d00 100644 --- a/rllib/utils/tests/test_actor_manager.py +++ b/rllib/utils/tests/test_actor_manager.py @@ -7,7 +7,7 @@ import unittest import ray -from ray.experimental.state.api import list_actors +from ray.util.state import list_actors from ray.rllib.utils.actor_manager import FaultAwareApply, FaultTolerantActorManager diff --git a/rllib/utils/tests/test_torch_utils.py b/rllib/utils/tests/test_torch_utils.py index ee4d70b643c5..94f8e3a7e79a 100644 --- a/rllib/utils/tests/test_torch_utils.py +++ b/rllib/utils/tests/test_torch_utils.py @@ -4,7 +4,10 @@ import torch.cuda import ray -from ray.rllib.utils.torch_utils import convert_to_torch_tensor +from ray.rllib.utils.torch_utils import ( + convert_to_torch_tensor, + copy_torch_tensors, +) class TestTorchUtils(unittest.TestCase): @@ -43,6 +46,54 @@ def test_convert_to_torch_tensor(self): self.assertTrue(converted["b"].dtype is torch.float32) self.assertTrue(converted["c"] is None) + def test_copy_torch_tensors(self): + array = np.array([1, 2, 3], dtype=np.float32) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + tensor = torch.from_numpy(array).to(device) + tensor_2 = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float64).to(device) + + # Test single tensor + copied_tensor = copy_torch_tensors(tensor, device) + self.assertTrue(copied_tensor.device == device) + self.assertNotEqual(id(copied_tensor), id(tensor)) + self.assertTrue(all(copied_tensor == tensor)) + + # check that dtypes aren't modified + copied_tensor_2 = copy_torch_tensors(tensor_2, device) + self.assertTrue(copied_tensor_2.dtype == tensor_2.dtype) + self.assertFalse(copied_tensor_2.dtype == torch.float32) + + # Test nested structure can be converted + 
nested_structure = {"a": tensor, "b": tensor_2, "c": 1} + copied_nested_structure = copy_torch_tensors(nested_structure, device) + self.assertTrue(copied_nested_structure["a"].device == device) + self.assertTrue(copied_nested_structure["b"].device == device) + self.assertTrue(copied_nested_structure["c"] == 1) + self.assertNotEqual(id(copied_nested_structure["a"]), id(tensor)) + self.assertNotEqual(id(copied_nested_structure["b"]), id(tensor_2)) + self.assertTrue(all(copied_nested_structure["a"] == tensor)) + self.assertTrue(all(copied_nested_structure["b"] == tensor_2)) + + # if gpu is available test moving tensor from cpu to gpu and vice versa + if torch.cuda.is_available(): + tensor = torch.from_numpy(array).to("cpu") + copied_tensor = copy_torch_tensors(tensor, "cuda:0") + self.assertFalse(copied_tensor.device == torch.device("cpu")) + self.assertTrue(copied_tensor.device == torch.device("cuda:0")) + self.assertNotEqual(id(copied_tensor), id(tensor)) + self.assertTrue( + all(copied_tensor.detach().cpu().numpy() == tensor.detach().numpy()) + ) + + tensor = torch.from_numpy(array).to("cuda:0") + copied_tensor = copy_torch_tensors(tensor, "cpu") + self.assertFalse(copied_tensor.device == torch.device("cuda:0")) + self.assertTrue(copied_tensor.device == torch.device("cpu")) + self.assertNotEqual(id(copied_tensor), id(tensor)) + self.assertTrue( + all(copied_tensor.detach().numpy() == tensor.detach().cpu().numpy()) + ) + if __name__ == "__main__": import pytest diff --git a/rllib/utils/tf_utils.py b/rllib/utils/tf_utils.py index 91d559262b7f..276fa886e5e9 100644 --- a/rllib/utils/tf_utils.py +++ b/rllib/utils/tf_utils.py @@ -3,7 +3,7 @@ import logging import numpy as np import tree # pip install dm_tree -from typing import Any, Callable, List, Optional, Type, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, List, Optional, Type, TYPE_CHECKING, Union from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI from ray.rllib.utils.framework import 
try_import_tf @@ -27,6 +27,48 @@ tf1, tf, tfv = try_import_tf() +@PublicAPI +def clip_gradients( + gradients_dict: Dict[str, "tf.Tensor"], + *, + grad_clip: Optional[float] = None, + grad_clip_by: str = "value", +) -> None: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). + if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_value(v, -grad_clip, grad_clip) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = tf.clip_by_norm(v, grad_clip) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert grad_clip_by == "global_norm" + + clipped_grads, _ = tf.clip_by_global_norm( + list(gradients_dict.values()), grad_clip + ) + for k, v in zip(gradients_dict.copy().keys(), clipped_grads): + gradients_dict[k] = v + + @PublicAPI def explained_variance(y: TensorType, pred: TensorType) -> TensorType: """Computes the explained variance for a pair of labels and predictions. @@ -176,7 +218,7 @@ def get_placeholder( value: Optional[Any] = None, name: Optional[str] = None, time_axis: bool = False, - flatten: bool = True + flatten: bool = True, ) -> "tf1.placeholder": """Returns a tf1.placeholder object given optional hints, such as a space. @@ -413,6 +455,8 @@ def _create_placeholders(path, value): return make_wrapper +# TODO (sven): Deprecate this function once we have moved completely to the Learner API. +# Replaced with `clip_gradients()`. 
@PublicAPI def minimize_and_clip( optimizer: LocalOptimizer, diff --git a/rllib/utils/torch_utils.py b/rllib/utils/torch_utils.py index 4239174b0ccb..6349273732e9 100644 --- a/rllib/utils/torch_utils.py +++ b/rllib/utils/torch_utils.py @@ -1,7 +1,7 @@ import os import logging import warnings -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import Dict, List, Optional, TYPE_CHECKING, Union import gymnasium as gym import numpy as np @@ -32,12 +32,19 @@ FLOAT_MAX = 3.4e38 +# TODO (sven): Deprecate this function once we have moved completely to the Learner API. +# Replaced with `clip_gradients()`. @PublicAPI def apply_grad_clipping( policy: "TorchPolicy", optimizer: LocalOptimizer, loss: TensorType ) -> Dict[str, TensorType]: """Applies gradient clipping to already computed grads inside `optimizer`. + Note: This function does NOT perform an analogous operation as + tf.clip_by_global_norm. It merely clips by norm (per gradient tensor) and + then computes the global norm across all given tensors (but without clipping + by that global norm). + Args: policy: The TorchPolicy, which calculated `loss`. optimizer: A local torch optimizer object. @@ -83,6 +90,66 @@ def atanh(x: TensorType) -> TensorType: pass +@PublicAPI +def clip_gradients( + gradients_dict: Dict[str, "torch.Tensor"], + *, + grad_clip: Optional[float] = None, + grad_clip_by: str = "value", +) -> None: + """Performs gradient clipping on a grad-dict based on a clip value and clip mode. + + Changes the provided gradient dict in place. + + Args: + gradients_dict: The gradients dict, mapping str to gradient tensors. + grad_clip: The value to clip with. The way gradients are clipped is defined + by the `grad_clip_by` arg (see below). + grad_clip_by: One of 'value', 'norm', or 'global_norm'. + """ + # No clipping, return. + if grad_clip is None: + return + + # Clip by value (each gradient individually). 
+ if grad_clip_by == "value": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = ( + None if v is None else torch.clip(v, -grad_clip, grad_clip) + ) + + # Clip by L2-norm (per gradient tensor). + elif grad_clip_by == "norm": + for k, v in gradients_dict.copy().items(): + gradients_dict[k] = ( + None if v is None else v * torch.clamp(grad_clip / v.norm(2), max=1.0) + ) + + # Clip by global L2-norm (across all gradient tensors). + else: + assert ( + grad_clip_by == "global_norm" + ), f"`grad_clip_by` ({grad_clip_by}) must be one of [value|norm|global_norm]!" + + # Compute the global L2-norm of all the gradient tensors. + total_l2_norm = sum( + # `.norm()` is the square root of the sum of all squares. + # We need to "undo" the square root b/c we want to compute the global + # norm afterwards -> `** 2`. + t.norm(2) ** 2 + for t in gradients_dict.values() + if t is not None + ) + # Now we do the square root. + total_l2_norm = torch.sqrt(total_l2_norm) + + # Clip all the gradients. + if total_l2_norm > grad_clip: + for tensor in gradients_dict.values(): + if tensor is not None: + tensor.mul_(grad_clip / total_l2_norm) + + @PublicAPI +def concat_multi_gpu_td_errors( + policy: Union["TorchPolicy", "TorchPolicyV2"] @@ -172,6 +239,37 @@ def mapping(item): return tree.map_structure(mapping, x) +@PublicAPI +def copy_torch_tensors(x: TensorStructType, device: Optional[str] = None): + """Creates a copy of `x` and makes deep copies of torch.Tensors in x. + + Also moves the copied tensors to the specified device (if not None). + + Note if an object in x is not a torch.Tensor, it will be shallow-copied. + + Args: + x : Any (possibly nested) struct possibly containing torch.Tensors. + device : The device to move the tensors to. + + Returns: + Any: A new struct with the same structure as `x`, but with all + torch.Tensors deep-copied and moved to the specified device.
+ + """ + + def mapping(item): + if isinstance(item, torch.Tensor): + return ( + torch.clone(item.detach()) + if device is None + else item.detach().to(device) + ) + else: + return item + + return tree.map_structure(mapping, x) + + @PublicAPI def explained_variance(y: TensorType, pred: TensorType) -> TensorType: """Computes the explained variance for a pair of labels and predictions. diff --git a/rllib/utils/typing.py b/rllib/utils/typing.py index af6a31b1b8da..1e490174ffbe 100644 --- a/rllib/utils/typing.py +++ b/rllib/utils/typing.py @@ -23,14 +23,18 @@ from ray.rllib.policy.policy import PolicySpec from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch from ray.rllib.policy.view_requirement import ViewRequirement - from ray.rllib.utils import try_import_tf, try_import_torch + from ray.rllib.utils import try_import_jax, try_import_tf, try_import_torch _, tf, _ = try_import_tf() torch, _ = try_import_torch() + jax, _ = try_import_jax() + jnp = None + if jax is not None: + jnp = jax.numpy # Represents a generic tensor type. # This could be an np.ndarray, tf.Tensor, or a torch.Tensor. -TensorType = Union[np.array, "tf.Tensor", "torch.Tensor"] +TensorType = Union[np.array, "jnp.ndarray", "tf.Tensor", "torch.Tensor"] # Either a plain tensor, or a dict or tuple of tensors (or StructTensors). TensorStructType = Union[TensorType, dict, tuple] diff --git a/rllib_contrib/README.md b/rllib_contrib/README.md new file mode 100644 index 000000000000..1cc2e0e775ea --- /dev/null +++ b/rllib_contrib/README.md @@ -0,0 +1,30 @@ +# RLlib-Contrib + +RLlib-Contrib is a directory for more experimental community contributions to RLlib including contributed algorithms. **This directory has a more relaxed bar for contributions than Ray or RLlib.** If you are interested in contributing to RLlib-Contrib, please see the [contributing guide](CONTRIBUTING.md). 
+ +## Getting Started and Installation +Navigate to the algorithm sub-directory you are interested in and see the README.md for installation instructions and example scripts to help you get started! + +## Maintenance + +**Any issues that are filed in `rllib_contrib` will be solved best-effort by the community and there is no expectation of maintenance by the RLlib team.** + +**The API surface between algorithms in `rllib_contrib` and current versions of Ray / RLlib is not guaranteed. This means that any APIs that are used in rllib_contrib could potentially become modified/removed in newer version of Ray/RLlib.** + +We will generally accept contributions to this directory that meet any of the following criteria: + +1. Updating dependencies. +2. Submitting community contributed algorithms that have been tested and are ready for use. +3. Enabling algorithms to be run in different environments (ex. adding support for a new type of gymnasium environment). +4. Updating algorithms for use with the newer RLlib APIs. +5. General bug fixes. + +We will not accept contributions that generally add a significant maintenance burden. In this case users should instead make their own repo with their contribution, using the same guidelines as this directory, and the RLlib team can help to market/promote it in the Ray docs. + +## Getting Involved + +| Platform | Purpose | Support Level | +| --- | --- | --- | +| [Discuss Forum](https://discuss.ray.io) | For discussions about development and questions about usage. | Community | +| [GitHub Issues](https://github.com/ray-project/rllib-contrib-maml/issues) | For reporting bugs and filing feature requests. | Community | +| [Slack](https://forms.gle/9TSdDYUgxYs8SA9e8) | For collaborating with other Ray users. 
| Community | diff --git a/rllib_contrib/a3c/README.rst b/rllib_contrib/a3c/README.rst new file mode 100644 index 000000000000..df3665c1408e --- /dev/null +++ b/rllib_contrib/a3c/README.rst @@ -0,0 +1,21 @@ +A3C (Asynchronous Advantage Actor-Critic) +----------------------------------------- + +`A3C ` is the asynchronous version of A2C, where gradients are computed on the workers directly after trajectory rollouts, and only then shipped to a central learner to accumulate these gradients on the central model. After the central model update, parameters are broadcast back to all workers. Similar to A2C, A3C scales to 16-32+ worker processes depending on the environment. + + +Installation +------------ + +.. code-block:: bash + + conda create -n rllib-a3c python=3.10 + conda activate rllib-a3c + pip install -r requirements.txt + pip install -e '.[development]' + + +Usage +----- + +.. literalinclude:: examples/a3c_cartpole_v1.py \ No newline at end of file diff --git a/rllib_contrib/a3c/examples/a3c_cartpole_v1.py b/rllib_contrib/a3c/examples/a3c_cartpole_v1.py new file mode 100644 index 000000000000..2f57ff71e105 --- /dev/null +++ b/rllib_contrib/a3c/examples/a3c_cartpole_v1.py @@ -0,0 +1,29 @@ +from rllib_a3c.a3c import A3C, A3CConfig + +import ray +from ray import air, tune + +if __name__ == "__main__": + ray.init() + + config = ( + A3CConfig() + .rollouts(num_rollout_workers=1) + .framework("torch") + .environment("CartPole-v1") + .training( + gamma=0.95, + ) + ) + + num_iterations = 100 + + tuner = tune.Tuner( + A3C, + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={"episode_reward_mean": 150, "timesteps_total": 200000}, + failure_config=air.FailureConfig(fail_fast="raise"), + ), + ) + results = tuner.fit() diff --git a/rllib_contrib/a3c/pyproject.toml b/rllib_contrib/a3c/pyproject.toml new file mode 100644 index 000000000000..173999a039a8 --- /dev/null +++ b/rllib_contrib/a3c/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +requires = 
["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "rllib-a3c" +authors = [{name = "Anyscale Inc."}] +version = "0.1.0" +description = "" +readme = "README.md" +requires-python = ">=3.7, <3.11" +dependencies = ["gym[accept-rom-license]", "gymnasium[mujoco]==0.26.3", "higher", "ray[rllib]==2.3.1"] + +[project.optional-dependencies] +development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0"] diff --git a/rllib_contrib/a3c/requirements.txt b/rllib_contrib/a3c/requirements.txt new file mode 100644 index 000000000000..f1191ef52412 --- /dev/null +++ b/rllib_contrib/a3c/requirements.txt @@ -0,0 +1,2 @@ +tensorflow==2.11.0 +torch==1.12.0 diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py new file mode 100644 index 000000000000..3b050de0dca5 --- /dev/null +++ b/rllib_contrib/a3c/src/rllib_a3c/a3c/__init__.py @@ -0,0 +1,7 @@ +from rllib_a3c.a3c.a3c import A3C, A3CConfig + +from ray.tune.registry import register_trainable + +__all__ = ["A3CConfig", "A3C"] + +register_trainable("rllib-contrib-a3c", A3C) diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py new file mode 100644 index 000000000000..7f5a661cb94d --- /dev/null +++ b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c.py @@ -0,0 +1,261 @@ +import logging +from typing import Any, Dict, List, Optional, Type, Union + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.policy.policy import Policy +from ray.rllib.utils.annotations import override +from ray.rllib.utils.metrics import ( + APPLY_GRADS_TIMER, + GRAD_WAIT_TIMER, + NUM_AGENT_STEPS_SAMPLED, + NUM_AGENT_STEPS_TRAINED, + NUM_ENV_STEPS_SAMPLED, + NUM_ENV_STEPS_TRAINED, + 
SYNCH_WORKER_WEIGHTS_TIMER, +) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.typing import ResultDict + +logger = logging.getLogger(__name__) + + +class A3CConfig(AlgorithmConfig): + """Defines a configuration class from which a A3C Algorithm can be built. + + Example: + >>> from ray import tune + >>> from ray.rllib.algorithms.a3c import A3CConfig + >>> config = A3CConfig() # doctest: +SKIP + >>> config = config.training(lr=0.01, grad_clip=30.0) # doctest: +SKIP + >>> config = config.resources(num_gpus=0) # doctest: +SKIP + >>> config = config.rollouts(num_rollout_workers=4) # doctest: +SKIP + >>> config = config.environment("CartPole-v1") # doctest: +SKIP + >>> print(config.to_dict()) # doctest: +SKIP + >>> # Build a Algorithm object from the config and run 1 training iteration. + >>> algo = config.build() # doctest: +SKIP + >>> algo.train() # doctest: +SKIP + + Example: + >>> from ray.rllib.algorithms.a3c import A3CConfig + >>> config = A3CConfig() + >>> # Print out some default values. + >>> print(config.sample_async) # doctest: +SKIP + >>> # Update the config object. + >>> config = config.training( # doctest: +SKIP + ... lr=tune.grid_search([0.001, 0.0001]), use_critic=False) + >>> # Set the config object's env. + >>> config = config.environment(env="CartPole-v1") # doctest: +SKIP + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.Tuner( # doctest: +SKIP + ... "A3C", + ... stop={"episode_reward_mean": 200}, + ... param_space=config.to_dict(), + ... ).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a A3CConfig instance.""" + super().__init__(algo_class=algo_class or A3C) + + # fmt: off + # __sphinx_doc_begin__ + # + # A3C specific settings. 
+ self.use_critic = True + self.use_gae = True + self.lambda_ = 1.0 + self.grad_clip = 40.0 + self.lr_schedule = None + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.01 + self.entropy_coeff_schedule = None + self.sample_async = True + + # Override some of AlgorithmConfig's default values with A3C-specific values. + self.num_rollout_workers = 2 + self.rollout_fragment_length = 10 + self.lr = 0.0001 + # Min time (in seconds) per reporting. + # This causes not every call to `training_iteration` to be reported, + # but to wait until n seconds have passed and then to summarize the + # thus far collected results. + self.min_time_s_per_iteration = 5 + self.exploration_config = { + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. + # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). + } + # __sphinx_doc_end__ + # fmt: on + + @override(AlgorithmConfig) + def training( + self, + *, + lr_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + use_critic: Optional[bool] = NotProvided, + use_gae: Optional[bool] = NotProvided, + lambda_: Optional[float] = NotProvided, + grad_clip: Optional[float] = NotProvided, + vf_loss_coeff: Optional[float] = NotProvided, + entropy_coeff: Optional[float] = NotProvided, + entropy_coeff_schedule: Optional[List[List[Union[int, float]]]] = NotProvided, + sample_async: Optional[bool] = NotProvided, + **kwargs, + ) -> "A3CConfig": + """Sets the training related configuration. + + Args: + lr_schedule: Learning rate schedule. In the format of + [[timestep, lr-value], [timestep, lr-value], ...] + Intermediary timesteps will be assigned to interpolated learning rate + values. A schedule should normally start from timestep 0.
+ use_critic: Should use a critic as a baseline (otherwise don't use value + baseline; required for using GAE). + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + lambda_: GAE(gamma) parameter. + grad_clip: Max global norm for each gradient calculated by worker. + vf_loss_coeff: Value Function Loss coefficient. + entropy_coeff: Coefficient of the entropy regularizer. + entropy_coeff_schedule: Decay schedule for the entropy regularizer. + sample_async: Whether workers should sample async. Note that this + increases the effective rollout_fragment_length by up to 5x due + to async buffering of batches. + + Returns: + This updated AlgorithmConfig object. + """ + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if lr_schedule is not NotProvided: + self.lr_schedule = lr_schedule + if use_critic is not NotProvided: + self.use_critic = use_critic + if use_gae is not NotProvided: + self.use_gae = use_gae + if lambda_ is not NotProvided: + self.lambda_ = lambda_ + if grad_clip is not NotProvided: + self.grad_clip = grad_clip + if vf_loss_coeff is not NotProvided: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not NotProvided: + self.entropy_coeff = entropy_coeff + if entropy_coeff_schedule is not NotProvided: + self.entropy_coeff_schedule = entropy_coeff_schedule + if sample_async is not NotProvided: + self.sample_async = sample_async + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call super's validation method.
+ super().validate() + + if self.entropy_coeff < 0: + raise ValueError("`entropy_coeff` must be >= 0.0!") + if self.num_rollout_workers <= 0 and self.sample_async: + raise ValueError("`num_workers` for A3C must be >= 1!") + + +class A3C(Algorithm): + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return A3CConfig() + + @classmethod + @override(Algorithm) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + from ray.rllib.algorithms.a3c.a3c_torch_policy import A3CTorchPolicy + + return A3CTorchPolicy + elif config["framework"] == "tf": + from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTF1Policy + + return A3CTF1Policy + else: + from ray.rllib.algorithms.a3c.a3c_tf_policy import A3CTF2Policy + + return A3CTF2Policy + + def training_step(self) -> ResultDict: + # Shortcut. + local_worker = self.workers.local_worker() + + # Define the function executed in parallel by all RolloutWorkers to collect + # samples + compute and return gradients (and other information). + + def sample_and_compute_grads(worker: RolloutWorker) -> Dict[str, Any]: + """Call sample() and compute_gradients() remotely on workers.""" + samples = worker.sample() + grads, infos = worker.compute_gradients(samples) + return { + "grads": grads, + "infos": infos, + "agent_steps": samples.agent_steps(), + "env_steps": samples.env_steps(), + } + + # Perform rollouts and gradient calculations asynchronously. + with self._timers[GRAD_WAIT_TIMER]: + # Results are a mapping from ActorHandle (RolloutWorker) to their + # returned gradient calculation results. + self.workers.foreach_worker_async( + func=sample_and_compute_grads, + healthy_only=True, + ) + async_results = self.workers.fetch_ready_async_reqs() + + # Loop through all fetched worker-computed gradients (if any) + # and apply them - one by one - to the local worker's model. 
+ # After each apply step (one step per worker that returned some gradients), + # update that particular worker's weights. + global_vars = None + learner_info_builder = LearnerInfoBuilder(num_devices=1) + to_sync_workers = set() + for worker_id, result in async_results: + # Apply gradients to local worker. + with self._timers[APPLY_GRADS_TIMER]: + local_worker.apply_gradients(result["grads"]) + self._timers[APPLY_GRADS_TIMER].push_units_processed(result["agent_steps"]) + + # Update all step counters. + self._counters[NUM_AGENT_STEPS_SAMPLED] += result["agent_steps"] + self._counters[NUM_ENV_STEPS_SAMPLED] += result["env_steps"] + self._counters[NUM_AGENT_STEPS_TRAINED] += result["agent_steps"] + self._counters[NUM_ENV_STEPS_TRAINED] += result["env_steps"] + + learner_info_builder.add_learn_on_batch_results_multi_agent(result["infos"]) + + # Create current global vars. + global_vars = { + "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED], + } + + # Add this worker to be synced. + to_sync_workers.add(worker_id) + + # Synch updated weights back to the particular worker + # (only those policies that are trainable). 
+ with self._timers[SYNCH_WORKER_WEIGHTS_TIMER]: + self.workers.sync_weights( + policies=local_worker.get_policies_to_train(), + to_worker_indices=list(to_sync_workers), + global_vars=global_vars, + ) + + return learner_info_builder.finalize() diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py new file mode 100644 index 000000000000..bdc77f5790ae --- /dev/null +++ b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_tf_policy.py @@ -0,0 +1,183 @@ +"""Note: Keep in sync with changes to VTraceTFPolicy.""" +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import ( + Postprocessing, + compute_gae_for_sample_batch, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + ValueNetworkMixin, + compute_gradients, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.tf_utils import explained_variance +from ray.rllib.utils.typing import ( + AgentID, + LocalOptimizer, + ModelGradients, + TensorType, + TFPolicyV2Type, +) + +tf1, tf, tfv = try_import_tf() + + +# We need this builder function because we want to share the same +# custom logics between TF1 dynamic and TF2 eager policies. +def get_a3c_tf_policy(name: str, base: TFPolicyV2Type) -> TFPolicyV2Type: + """Construct a A3CTFPolicy inheriting either dynamic or eager base policies. + + Args: + base: Base class for this policy. DynamicTFPolicyV2 or EagerTFPolicyV2. + + Returns: + A TF Policy to be used with MAML. 
+ """ + + class A3CTFPolicy( + ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, base + ): + def __init__( + self, + observation_space, + action_space, + config, + existing_model=None, + existing_inputs=None, + ): + # First thing first, enable eager execution if necessary. + base.enable_eager_execution_if_necessary() + + # Initialize base class. + base.__init__( + self, + observation_space, + action_space, + config, + existing_inputs=existing_inputs, + existing_model=existing_model, + ) + + ValueNetworkMixin.__init__(self, self.config) + LearningRateSchedule.__init__( + self, self.config["lr"], self.config["lr_schedule"] + ) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. + self.maybe_initialize_optimizer_and_loss() + + @override(base) + def loss( + self, + model: Union[ModelV2, "tf.keras.Model"], + dist_class: Type[TFActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + model_out, _ = model(train_batch) + action_dist = dist_class(model_out, model) + if self.is_recurrent(): + max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) + valid_mask = tf.sequence_mask( + train_batch[SampleBatch.SEQ_LENS], max_seq_len + ) + valid_mask = tf.reshape(valid_mask, [-1]) + else: + valid_mask = tf.ones_like(train_batch[SampleBatch.REWARDS]) + + log_prob = action_dist.logp(train_batch[SampleBatch.ACTIONS]) + vf = model.value_function() + + # The "policy gradients" loss + self.pi_loss = -tf.reduce_sum( + tf.boolean_mask( + log_prob * train_batch[Postprocessing.ADVANTAGES], valid_mask + ) + ) + + delta = tf.boolean_mask( + vf - train_batch[Postprocessing.VALUE_TARGETS], valid_mask + ) + + # Compute a value function loss. 
+ if self.config.get("use_critic", True): + self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta)) + # Ignore the value function. + else: + self.vf_loss = tf.constant(0.0) + + self.entropy_loss = tf.reduce_sum( + tf.boolean_mask(action_dist.entropy(), valid_mask) + ) + + self.total_loss = ( + self.pi_loss + + self.vf_loss * self.config["vf_loss_coeff"] + - self.entropy_loss * self.entropy_coeff + ) + + return self.total_loss + + @override(base) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + return { + "cur_lr": tf.cast(self.cur_lr, tf.float64), + "entropy_coeff": tf.cast(self.entropy_coeff, tf.float64), + "policy_loss": self.pi_loss, + "policy_entropy": self.entropy_loss, + "var_gnorm": tf.linalg.global_norm( + list(self.model.trainable_variables()) + ), + "vf_loss": self.vf_loss, + } + + @override(base) + def grad_stats_fn( + self, train_batch: SampleBatch, grads: ModelGradients + ) -> Dict[str, TensorType]: + return { + "grad_gnorm": tf.linalg.global_norm(grads), + "vf_explained_var": explained_variance( + train_batch[Postprocessing.VALUE_TARGETS], + self.model.value_function(), + ), + } + + @override(base) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None, + episode: Optional[Episode] = None, + ): + sample_batch = super().postprocess_trajectory(sample_batch) + return compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + + @override(base) + def compute_gradients_fn( + self, optimizer: LocalOptimizer, loss: TensorType + ) -> ModelGradients: + return compute_gradients(self, optimizer, loss) + + A3CTFPolicy.__name__ = name + A3CTFPolicy.__qualname__ = name + + return A3CTFPolicy + + +A3CTF1Policy = get_a3c_tf_policy("A3CTF1Policy", DynamicTFPolicyV2) +A3CTF2Policy = get_a3c_tf_policy("A3CTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py 
b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py new file mode 100644 index 000000000000..e702254cd16c --- /dev/null +++ b/rllib_contrib/a3c/src/rllib_a3c/a3c/a3c_torch_policy.py @@ -0,0 +1,152 @@ +from typing import Dict, List, Optional, Type, Union + +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import ( + Postprocessing, + compute_gae_for_sample_batch, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_mixins import ( + EntropyCoeffSchedule, + LearningRateSchedule, + ValueNetworkMixin, +) +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.torch_utils import apply_grad_clipping, sequence_mask +from ray.rllib.utils.typing import AgentID, TensorType + +torch, nn = try_import_torch() + + +class A3CTorchPolicy( + ValueNetworkMixin, LearningRateSchedule, EntropyCoeffSchedule, TorchPolicyV2 +): + """PyTorch Policy class used with A3C.""" + + def __init__(self, observation_space, action_space, config): + TorchPolicyV2.__init__( + self, + observation_space, + action_space, + config, + max_seq_len=config["model"]["max_seq_len"], + ) + ValueNetworkMixin.__init__(self, config) + LearningRateSchedule.__init__(self, config["lr"], config["lr_schedule"]) + EntropyCoeffSchedule.__init__( + self, config["entropy_coeff"], config["entropy_coeff_schedule"] + ) + + # TODO: Don't require users to call this manually. + self._initialize_loss_from_dummy_batch() + + @override(TorchPolicyV2) + def loss( + self, + model: ModelV2, + dist_class: Type[TorchDistributionWrapper], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + """Constructs the loss function. 
+ + Args: + model: The Model to calculate the loss for. + dist_class: The action distr. class. + train_batch: The training data. + + Returns: + The A3C loss tensor given the input batch. + """ + logits, _ = model(train_batch) + values = model.value_function() + + if self.is_recurrent(): + B = len(train_batch[SampleBatch.SEQ_LENS]) + max_seq_len = logits.shape[0] // B + mask_orig = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) + valid_mask = torch.reshape(mask_orig, [-1]) + else: + valid_mask = torch.ones_like(values, dtype=torch.bool) + + dist = dist_class(logits, model) + log_probs = dist.logp(train_batch[SampleBatch.ACTIONS]).reshape(-1) + pi_err = -torch.sum( + torch.masked_select( + log_probs * train_batch[Postprocessing.ADVANTAGES], valid_mask + ) + ) + + # Compute a value function loss. + if self.config["use_critic"]: + value_err = 0.5 * torch.sum( + torch.pow( + torch.masked_select( + values.reshape(-1) - train_batch[Postprocessing.VALUE_TARGETS], + valid_mask, + ), + 2.0, + ) + ) + # Ignore the value function. + else: + value_err = 0.0 + + entropy = torch.sum(torch.masked_select(dist.entropy(), valid_mask)) + + total_loss = ( + pi_err + + value_err * self.config["vf_loss_coeff"] + - entropy * self.entropy_coeff + ) + + # Store values for stats function in model (tower), such that for + # multi-GPU, we do not override them during the parallel loss phase. 
+ model.tower_stats["entropy"] = entropy + model.tower_stats["pi_err"] = pi_err + model.tower_stats["value_err"] = value_err + + return total_loss + + @override(TorchPolicyV2) + def optimizer( + self, + ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: + """Returns a torch optimizer (Adam) for A3C.""" + return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"]) + + @override(TorchPolicyV2) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + return convert_to_numpy( + { + "cur_lr": self.cur_lr, + "entropy_coeff": self.entropy_coeff, + "policy_entropy": torch.mean( + torch.stack(self.get_tower_stats("entropy")) + ), + "policy_loss": torch.mean(torch.stack(self.get_tower_stats("pi_err"))), + "vf_loss": torch.mean(torch.stack(self.get_tower_stats("value_err"))), + } + ) + + @override(TorchPolicyV2) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None, + episode: Optional[Episode] = None, + ): + sample_batch = super().postprocess_trajectory(sample_batch) + return compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) diff --git a/rllib_contrib/a3c/tests/test_a3c.py b/rllib_contrib/a3c/tests/test_a3c.py new file mode 100644 index 000000000000..66984eb1e4ae --- /dev/null +++ b/rllib_contrib/a3c/tests/test_a3c.py @@ -0,0 +1,100 @@ +import unittest + +from rllib_a3c.a3c import A3CConfig + +import ray +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO, LEARNER_STATS_KEY +from ray.rllib.utils.test_utils import ( + check_compute_single_action, + check_train_results, + framework_iterator, +) + + +class TestA3C(unittest.TestCase): + """Sanity tests for 
A3C exec impl.""" + + def setUp(self): + ray.init(num_cpus=4) + + def tearDown(self): + ray.shutdown() + + def test_a3c_compilation(self): + """Test whether an A3C can be built with both frameworks.""" + config = A3CConfig().rollouts(num_rollout_workers=2, num_envs_per_worker=2) + + num_iterations = 2 + + # Test against all frameworks. + for _ in framework_iterator(config, with_eager_tracing=False): + for env in ["CartPole-v1", "Pendulum-v1"]: + print("env={}".format(env)) + config.model["use_lstm"] = env == "CartPole-v1" + algo = config.build(env=env) + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action( + algo, include_state=config.model["use_lstm"] + ) + algo.stop() + + def test_a3c_entropy_coeff_schedule(self): + """Test A3C entropy coeff schedule support.""" + config = A3CConfig().rollouts( + num_rollout_workers=1, + num_envs_per_worker=1, + batch_mode="truncate_episodes", + rollout_fragment_length=10, + ) + # Initial entropy coeff, doesn't really matter because of the schedule below. + config.training( + train_batch_size=20, + entropy_coeff=0.01, + entropy_coeff_schedule=[ + [0, 0.01], + [120, 0.0001], + ], + ) + # 0 metrics reporting delay, this makes sure timestep, + # which entropy coeff depends on, is updated after each worker rollout. + config.reporting( + min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=20 + ) + + def _step_n_times(trainer, n: int): + """Step trainer n times. + + Returns: + entropy coefficient at the end of the execution. + """ + for _ in range(n): + results = trainer.train() + return results["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][ + "entropy_coeff" + ] + + # Test against all frameworks.
+ for _ in framework_iterator(config): + algo = config.build(env="CartPole-v1") + + coeff = _step_n_times(algo, 1) # 20 timesteps + # Should be close to the starting coeff of 0.01 + self.assertGreaterEqual(coeff, 0.005) + + coeff = _step_n_times(algo, 10) # 200 timesteps + # Should have annealed to the final coeff of 0.0001. + self.assertLessEqual(coeff, 0.00011) + + algo.stop() + + +if __name__ == "__main__": + import sys + + import pytest + + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib_contrib/maml/README.rst b/rllib_contrib/maml/README.rst new file mode 100644 index 000000000000..912fca39ed35 --- /dev/null +++ b/rllib_contrib/maml/README.rst @@ -0,0 +1,27 @@ +MAML (Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks) +------------------------------------------------------------------------ + +`MAML ` is an on-policy meta RL algorithm. Unlike standard RL algorithms, which aim to maximize the sum of rewards into the future for a single task (e.g. HalfCheetah), meta RL algorithms seek to maximize the sum of rewards for *a given distribution of tasks*. + +On a high level, MAML seeks to learn quick adaptation across different tasks (e.g. different velocities for HalfCheetah). Quick adaptation is defined by the number of gradient steps it takes to adapt. MAML aims to maximize the RL objective for each task after `X` gradient steps. Doing this requires partitioning the algorithm into two steps. The first step is data collection. This involves collecting data for each task for each step of adaptation (from `1, 2, ..., X`). The second step is the meta-update step. This second step takes all the aggregated data from the first step and computes the meta-gradient. + +Code here is adapted from `https://github.com/jonasrothfuss`, which outperforms vanilla MAML and avoids computation of the higher order gradients during the meta-update step. MAML is evaluated on custom environments that are described in greater detail here. 
+ +MAML uses additional metrics to measure performance; episode_reward_mean measures the agent’s returns before adaptation, episode_reward_mean_adapt_N measures the agent’s returns after N gradient steps of inner adaptation, and adaptation_delta measures the difference in performance before and after adaptation. + + +Installation +------------ + +.. code-block:: bash + + conda create -n rllib-maml python=3.10 + conda activate rllib-maml + pip install -r requirements.txt + pip install -e '.[development]' + + +Usage +----- + +.. literalinclude:: examples/cartpole_mass_maml.py \ No newline at end of file diff --git a/rllib_contrib/maml/examples/cartpole_mass_maml.py b/rllib_contrib/maml/examples/cartpole_mass_maml.py new file mode 100644 index 000000000000..72c27f83056c --- /dev/null +++ b/rllib_contrib/maml/examples/cartpole_mass_maml.py @@ -0,0 +1,52 @@ +from gymnasium.wrappers import TimeLimit +from rllib_maml.maml import MAML, MAMLConfig + +import ray +from ray import air, tune +from ray.rllib.examples.env.cartpole_mass import CartPoleMassEnv +from ray.tune.registry import register_env + +if __name__ == "__main__": + ray.init() + register_env( + "cartpole", + lambda env_cfg: TimeLimit(CartPoleMassEnv(), max_episode_steps=200), + ) + + rollout_fragment_length = 32 + + config = ( + MAMLConfig() + .rollouts( + num_rollout_workers=4, rollout_fragment_length=rollout_fragment_length + ) + .framework("torch") + .environment("cartpole", clip_actions=False) + .training( + inner_adaptation_steps=1, + maml_optimizer_steps=5, + gamma=0.99, + lambda_=1.0, + lr=0.001, + vf_loss_coeff=0.5, + inner_lr=0.03, + use_meta_env=False, + clip_param=0.3, + kl_target=0.01, + kl_coeff=0.001, + model=dict(fcnet_hiddens=[64, 64]), + train_batch_size=rollout_fragment_length, + ) + ) + + num_iterations = 100 + + tuner = tune.Tuner( + MAML, + param_space=config.to_dict(), + run_config=air.RunConfig( + stop={"training_iteration": num_iterations}, + 
failure_config=air.FailureConfig(fail_fast="raise"), + ), + ) + results = tuner.fit() diff --git a/rllib_contrib/maml/pyproject.toml b/rllib_contrib/maml/pyproject.toml new file mode 100644 index 000000000000..bf6df70018fe --- /dev/null +++ b/rllib_contrib/maml/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "rllib-maml" +authors = [{name = "Anyscale Inc."}] +version = "0.1.0" +description = "" +readme = "README.md" +requires-python = ">=3.7, <3.11" +dependencies = ["gymnasium[mujoco]==0.26.3", "higher", "ray[rllib]==2.3.1"] + +[project.optional-dependencies] +development = ["pytest>=7.2.2", "pre-commit==2.21.0", "tensorflow==2.11.0", "torch==1.12.0"] diff --git a/rllib_contrib/maml/requirements.txt b/rllib_contrib/maml/requirements.txt new file mode 100644 index 000000000000..f1191ef52412 --- /dev/null +++ b/rllib_contrib/maml/requirements.txt @@ -0,0 +1,2 @@ +tensorflow==2.11.0 +torch==1.12.0 diff --git a/rllib_contrib/maml/src/rllib_maml/__init__.py b/rllib_contrib/maml/src/rllib_maml/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rllib_contrib/maml/src/rllib_maml/envs/__init__.py b/rllib_contrib/maml/src/rllib_maml/envs/__init__.py new file mode 100644 index 000000000000..1796db67d13e --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/envs/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2023-onwards Anyscale, Inc. The use of this library is subject to the +# included LICENSE file. 
+from rllib_maml.envs.ant_rand_goal import AntRandGoalEnv +from rllib_maml.envs.cartpole_mass import CartPoleMassEnv +from rllib_maml.envs.pendulum_mass import PendulumMassEnv + +__all__ = [ + "AntRandGoalEnv", + "CartPoleMassEnv", + "PendulumMassEnv", +] diff --git a/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py b/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py new file mode 100644 index 000000000000..5dd2f3c8e026 --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/envs/ant_rand_goal.py @@ -0,0 +1,86 @@ +import numpy as np +from gymnasium.envs.mujoco.mujoco_env import MujocoEnv +from gymnasium.utils import EzPickle + +from ray.rllib.env.apis.task_settable_env import TaskSettableEnv + + +class AntRandGoalEnv(EzPickle, MujocoEnv, TaskSettableEnv): + """Ant Environment that randomizes goals as tasks + + Goals are randomly sampled 2D positions + """ + + def __init__(self): + self.set_task(self.sample_tasks(1)[0]) + MujocoEnv.__init__(self, "ant.xml", 5) + EzPickle.__init__(self) + + def sample_tasks(self, n_tasks): + # Samples a goal position (2x1 position vector) + a = np.random.random(n_tasks) * 2 * np.pi + r = 3 * np.random.random(n_tasks) ** 0.5 + return np.stack((r * np.cos(a), r * np.sin(a)), axis=-1) + + def set_task(self, task): + """ + Args: + task: task of the meta-learning environment + """ + self.goal_pos = task + + def get_task(self): + """ + Returns: + task: task of the meta-learning environment + """ + return self.goal_pos + + def step(self, a): + self.do_simulation(a, self.frame_skip) + xposafter = self.get_body_com("torso") + goal_reward = -np.sum( + np.abs(xposafter[:2] - self.goal_pos) + ) # make it happy, not suicidal + ctrl_cost = 0.1 * np.square(a).sum() + contact_cost = ( + 0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) + ) + # survive_reward = 1.0 + survive_reward = 0.0 + reward = goal_reward - ctrl_cost - contact_cost + survive_reward + # notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
+ # done = not notdone + done = False + ob = self._get_obs() + return ( + ob, + reward, + done, + dict( + reward_forward=goal_reward, + reward_ctrl=-ctrl_cost, + reward_contact=-contact_cost, + reward_survive=survive_reward, + ), + ) + + def _get_obs(self): + return np.concatenate( + [ + self.sim.data.qpos.flat, + self.sim.data.qvel.flat, + np.clip(self.sim.data.cfrc_ext, -1, 1).flat, + ] + ) + + def reset_model(self): + qpos = self.init_qpos + self.np_random.uniform( + size=self.model.nq, low=-0.1, high=0.1 + ) + qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 + self.set_state(qpos, qvel) + return self._get_obs() + + def viewer_setup(self): + self.viewer.cam.distance = self.model.stat.extent * 0.5 diff --git a/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py b/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py new file mode 100644 index 000000000000..bfd481402eb7 --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/envs/cartpole_mass.py @@ -0,0 +1,31 @@ +import numpy as np +from gymnasium.envs.classic_control.cartpole import CartPoleEnv +from gymnasium.utils import EzPickle + +from ray.rllib.env.apis.task_settable_env import TaskSettableEnv + + +class CartPoleMassEnv(CartPoleEnv, EzPickle, TaskSettableEnv): + """CartPoleMassEnv varies the weights of the cart and the pole.""" + + def sample_tasks(self, n_tasks): + # Sample new cart- and pole masses (random floats between 0.5 and 2.0 + # (cart) and between 0.05 and 0.2 (pole)). + cart_masses = np.random.uniform(low=0.5, high=2.0, size=(n_tasks, 1)) + pole_masses = np.random.uniform(low=0.05, high=0.2, size=(n_tasks, 1)) + return np.concatenate([cart_masses, pole_masses], axis=-1) + + def set_task(self, task): + """ + Args: + task (Tuple[float]): Masses of the cart and the pole. + """ + self.masscart = task[0] + self.masspole = task[1] + + def get_task(self): + """ + Returns: + Tuple[float]: The current mass of the cart- and pole. 
+ """ + return np.array([self.masscart, self.masspole]) diff --git a/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py b/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py new file mode 100644 index 000000000000..2b4abdf20107 --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/envs/pendulum_mass.py @@ -0,0 +1,33 @@ +import numpy as np +from gymnasium.envs.classic_control.pendulum import PendulumEnv +from gymnasium.utils import EzPickle + +from ray.rllib.env.apis.task_settable_env import TaskSettableEnv + + +class PendulumMassEnv(PendulumEnv, EzPickle, TaskSettableEnv): + """PendulumMassEnv varies the weight of the pendulum + + Tasks are defined to be weight uniformly sampled between [0.5,2] + """ + + def sample_tasks(self, n_tasks): + # Sample new pendulum masses (random floats between 0.5 and 2). + return np.random.uniform(low=0.5, high=2.0, size=(n_tasks,)) + + def set_task(self, task): + """ + Args: + task: Task of the meta-learning environment (here: mass of + the pendulum). + """ + # self.m is the mass property of the pendulum. + self.m = task + + def get_task(self): + """ + Returns: + float: The current mass of the pendulum (self.m in the PendulumEnv + object). + """ + return self.m diff --git a/rllib_contrib/maml/src/rllib_maml/maml/__init__.py b/rllib_contrib/maml/src/rllib_maml/maml/__init__.py new file mode 100644 index 000000000000..1ec07956fabd --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/maml/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2023-onwards Anyscale, Inc. The use of this library is subject to the +# included LICENSE file. 
+from rllib_maml.maml.maml import MAML, MAMLConfig + +from ray.tune.registry import register_trainable + +__all__ = [ + "MAML", + "MAMLConfig", +] + +register_trainable("rllib-contrib-maml", MAML) diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml.py b/rllib_contrib/maml/src/rllib_maml/maml/maml.py new file mode 100644 index 000000000000..e03a7ff3f6ca --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/maml/maml.py @@ -0,0 +1,388 @@ +import logging +from typing import Optional, Type + +import numpy as np + +from ray.rllib.algorithms.algorithm import Algorithm +from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided +from ray.rllib.evaluation.metrics import collect_metrics, get_learner_stats +from ray.rllib.evaluation.worker_set import WorkerSet +from ray.rllib.execution.common import ( + STEPS_SAMPLED_COUNTER, + STEPS_TRAINED_COUNTER, + STEPS_TRAINED_THIS_ITER_COUNTER, + _get_shared_metrics, +) +from ray.rllib.execution.metric_ops import CollectMetrics +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.sample_batch import ( + concat_samples, + convert_ma_batch_to_sample_batch, +) +from ray.rllib.utils.annotations import override +from ray.rllib.utils.deprecation import DEPRECATED_VALUE +from ray.rllib.utils.metrics.learner_info import LEARNER_INFO +from ray.rllib.utils.sgd import standardized +from ray.util.iter import LocalIterator, from_actors + +logger = logging.getLogger(__name__) + + +class MAMLConfig(AlgorithmConfig): + """Defines a configuration class from which a MAML Algorithm can be built. + + Example: + >>> from ray.rllib.algorithms.maml import MAMLConfig + >>> config = MAMLConfig().training(use_gae=False).resources(num_gpus=1) + >>> print(config.to_dict()) # doctest: +SKIP + >>> # Build a Algorithm object from the config and run 1 training iteration. 
+ >>> algo = config.build(env="CartPole-v1") # doctest: +SKIP + >>> algo.train() # doctest: +SKIP + + Example: + >>> from ray.rllib.algorithms.maml import MAMLConfig + >>> from ray import air + >>> from ray import tune + >>> config = MAMLConfig() + >>> # Print out some default values. + >>> print(config.lr) # doctest: +SKIP + >>> # Update the config object. + >>> config = config.training( # doctest: +SKIP + ... grad_clip=tune.grid_search([10.0, 40.0])) + >>> # Set the config object's env. + >>> config = config.environment(env="CartPole-v1") + >>> # Use to_dict() to get the old-style python config dict + >>> # when running with tune. + >>> tune.Tuner( # doctest: +SKIP + ... "MAML", + ... run_config=air.RunConfig(stop={"episode_reward_mean": 200}), + ... param_space=config.to_dict(), + ... ).fit() + """ + + def __init__(self, algo_class=None): + """Initializes a PGConfig instance.""" + super().__init__(algo_class=algo_class or MAML) + + # fmt: off + # __sphinx_doc_begin__ + # MAML-specific config settings. + self.use_gae = True + self.lambda_ = 1.0 + self.kl_coeff = 0.0005 + self.vf_loss_coeff = 0.5 + self.entropy_coeff = 0.0 + self.clip_param = 0.3 + self.vf_clip_param = 10.0 + self.grad_clip = None + self.kl_target = 0.01 + self.inner_adaptation_steps = 1 + self.maml_optimizer_steps = 5 + self.inner_lr = 0.1 + self.use_meta_env = True + + # Override some of AlgorithmConfig's default values with MAML-specific values. + self.num_rollout_workers = 2 + self.rollout_fragment_length = 200 + self.create_env_on_local_worker = True + self.lr = 1e-3 + + # Share layers for value function. + self.model.update({ + "vf_share_layers": False, + }) + + self.batch_mode = "complete_episodes" + self._disable_execution_plan_api = False + self.exploration_config = { + # The Exploration class to use. In the simplest case, this is the name + # (str) of any class present in the `rllib.utils.exploration` package. 
+ # You can also provide the python class directly or the full location + # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy. + # EpsilonGreedy"). + "type": "StochasticSampling", + # Add constructor kwargs here (if any). + } + # __sphinx_doc_end__ + # fmt: on + + # Deprecated keys: + self.vf_share_layers = DEPRECATED_VALUE + + def training( + self, + *, + use_gae: Optional[bool] = NotProvided, + lambda_: Optional[float] = NotProvided, + kl_coeff: Optional[float] = NotProvided, + vf_loss_coeff: Optional[float] = NotProvided, + entropy_coeff: Optional[float] = NotProvided, + clip_param: Optional[float] = NotProvided, + vf_clip_param: Optional[float] = NotProvided, + grad_clip: Optional[float] = NotProvided, + kl_target: Optional[float] = NotProvided, + inner_adaptation_steps: Optional[int] = NotProvided, + maml_optimizer_steps: Optional[int] = NotProvided, + inner_lr: Optional[float] = NotProvided, + use_meta_env: Optional[bool] = NotProvided, + **kwargs, + ) -> "MAMLConfig": + """Sets the training related configuration. + + Args: + use_gae: If true, use the Generalized Advantage Estimator (GAE) + with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + lambda_: The GAE (lambda) parameter. + kl_coeff: Initial coefficient for KL divergence. + vf_loss_coeff: Coefficient of the value function loss. + entropy_coeff: Coefficient of the entropy regularizer. + clip_param: PPO clip parameter. + vf_clip_param: Clip param for the value function. Note that this is + sensitive to the scale of the rewards. If your expected V is large, + increase this. + grad_clip: If specified, clip the global norm of gradients by this amount. + kl_target: Target value for KL divergence. + inner_adaptation_steps: Number of Inner adaptation steps for the MAML + algorithm. + maml_optimizer_steps: Number of MAML steps per meta-update iteration + (PPO steps). + inner_lr: Inner Adaptation Step size. + use_meta_env: Use Meta Env Template. 
+ + Returns: + This updated AlgorithmConfig object. + """ + # Pass kwargs onto super's `training()` method. + super().training(**kwargs) + + if use_gae is not NotProvided: + self.use_gae = use_gae + if lambda_ is not NotProvided: + self.lambda_ = lambda_ + if kl_coeff is not NotProvided: + self.kl_coeff = kl_coeff + if vf_loss_coeff is not NotProvided: + self.vf_loss_coeff = vf_loss_coeff + if entropy_coeff is not NotProvided: + self.entropy_coeff = entropy_coeff + if clip_param is not NotProvided: + self.clip_param = clip_param + if vf_clip_param is not NotProvided: + self.vf_clip_param = vf_clip_param + if grad_clip is not NotProvided: + self.grad_clip = grad_clip + if kl_target is not NotProvided: + self.kl_target = kl_target + if inner_adaptation_steps is not NotProvided: + self.inner_adaptation_steps = inner_adaptation_steps + if maml_optimizer_steps is not NotProvided: + self.maml_optimizer_steps = maml_optimizer_steps + if inner_lr is not NotProvided: + self.inner_lr = inner_lr + if use_meta_env is not NotProvided: + self.use_meta_env = use_meta_env + + return self + + @override(AlgorithmConfig) + def validate(self) -> None: + # Call super's validation method. + super().validate() + + if self.num_gpus > 1: + raise ValueError("`num_gpus` > 1 not yet supported for MAML!") + if self.inner_adaptation_steps <= 0: + raise ValueError("Inner Adaptation Steps must be >=1!") + if self.maml_optimizer_steps <= 0: + raise ValueError("PPO steps for meta-update needs to be >=0!") + if self.entropy_coeff < 0: + raise ValueError("`entropy_coeff` must be >=0.0!") + if self.batch_mode != "complete_episodes": + raise ValueError("`batch_mode`=truncate_episodes not supported!") + if self.num_rollout_workers <= 0: + raise ValueError("Must have at least 1 worker/task!") + if self.create_env_on_local_worker is False: + raise ValueError( + "Must have an actual Env created on the driver " + "(local) worker! Try setting `config.environment(" + "create_env_on_local_worker=True)`." 
+ ) + + +# @mluo: TODO +def set_worker_tasks(workers, use_meta_env): + if use_meta_env: + n_tasks = len(workers.remote_workers()) + tasks = workers.local_worker().foreach_env(lambda x: x)[0].sample_tasks(n_tasks) + for i, worker in enumerate(workers.remote_workers()): + worker.foreach_env.remote(lambda env: env.set_task(tasks[i])) + + +class MetaUpdate: + def __init__(self, workers, maml_steps, metric_gen, use_meta_env): + self.workers = workers + self.maml_optimizer_steps = maml_steps + self.metric_gen = metric_gen + self.use_meta_env = use_meta_env + + def __call__(self, data_tuple): + # Metaupdate Step + samples = data_tuple[0] + adapt_metrics_dict = data_tuple[1] + + # Metric Updating + metrics = _get_shared_metrics() + metrics.counters[STEPS_SAMPLED_COUNTER] += samples.count + fetches = None + for i in range(self.maml_optimizer_steps): + fetches = self.workers.local_worker().learn_on_batch(samples) + learner_stats = get_learner_stats(fetches) + + # Sync workers with meta policy + self.workers.sync_weights() + + # Set worker tasks + set_worker_tasks(self.workers, self.use_meta_env) + + # Update KLS + def update(pi, pi_id): + assert "inner_kl" not in learner_stats, ( + "inner_kl should be nested under policy id key", + learner_stats, + ) + if pi_id in learner_stats: + assert "inner_kl" in learner_stats[pi_id], (learner_stats, pi_id) + pi.update_kls(learner_stats[pi_id]["inner_kl"]) + else: + logger.warning("No data for {}, not updating kl".format(pi_id)) + + self.workers.local_worker().foreach_policy_to_train(update) + + # Modify Reporting Metrics + metrics = _get_shared_metrics() + metrics.info[LEARNER_INFO] = fetches + metrics.counters[STEPS_TRAINED_THIS_ITER_COUNTER] = samples.count + metrics.counters[STEPS_TRAINED_COUNTER] += samples.count + + res = self.metric_gen.__call__(None) + res.update(adapt_metrics_dict) + + return res + + +def post_process_metrics(adapt_iter, workers, metrics): + # Obtain Current Dataset Metrics and filter out + name = "_adapt_" + 
str(adapt_iter) if adapt_iter > 0 else "" + + # Only workers are collecting data + res = collect_metrics(workers=workers) + + metrics["episode_reward_max" + str(name)] = res["episode_reward_max"] + metrics["episode_reward_mean" + str(name)] = res["episode_reward_mean"] + metrics["episode_reward_min" + str(name)] = res["episode_reward_min"] + + return metrics + + +def inner_adaptation(workers, samples): + # Each worker performs one gradient descent + for i, e in enumerate(workers.remote_workers()): + e.learn_on_batch.remote(samples[i]) + + +class MAML(Algorithm): + @classmethod + @override(Algorithm) + def get_default_config(cls) -> AlgorithmConfig: + return MAMLConfig() + + @classmethod + @override(Algorithm) + def get_default_policy_class( + cls, config: AlgorithmConfig + ) -> Optional[Type[Policy]]: + if config["framework"] == "torch": + from ray.rllib.algorithms.maml.maml_torch_policy import MAMLTorchPolicy + + return MAMLTorchPolicy + elif config["framework"] == "tf": + from ray.rllib.algorithms.maml.maml_tf_policy import MAMLTF1Policy + + return MAMLTF1Policy + else: + from ray.rllib.algorithms.maml.maml_tf_policy import MAMLTF2Policy + + return MAMLTF2Policy + + @staticmethod + @override(Algorithm) + def execution_plan( + workers: WorkerSet, config: AlgorithmConfig, **kwargs + ) -> LocalIterator[dict]: + assert ( + len(kwargs) == 0 + ), "MAML execution_plan does NOT take any additional parameters" + + # Sync workers with meta policy + workers.sync_weights() + + # Samples and sets worker tasks + use_meta_env = config.use_meta_env + set_worker_tasks(workers, use_meta_env) + + # Metric Collector + metric_collect = CollectMetrics( + workers, + min_history=config.metrics_num_episodes_for_smoothing, + timeout_seconds=config.metrics_episode_collection_timeout_s, + ) + + # Iterator for Inner Adaptation Data gathering (from pre->post + # adaptation) + inner_steps = config.inner_adaptation_steps + + def inner_adaptation_steps(itr): + buf = [] + split = [] + metrics = 
{} + for samples in itr: + # Processing Samples (Standardize Advantages) + split_lst = [] + for sample in samples: + sample = convert_ma_batch_to_sample_batch(sample) + sample["advantages"] = standardized(sample["advantages"]) + split_lst.append(sample.count) + buf.append(sample) + + split.append(split_lst) + + adapt_iter = len(split) - 1 + metrics = post_process_metrics(adapt_iter, workers, metrics) + if len(split) > inner_steps: + out = concat_samples(buf) + out["split"] = np.array(split) + buf = [] + split = [] + + # Reporting Adaptation Rew Diff + ep_rew_pre = metrics["episode_reward_mean"] + ep_rew_post = metrics[ + "episode_reward_mean_adapt_" + str(inner_steps) + ] + metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre + yield out, metrics + metrics = {} + else: + inner_adaptation(workers, samples) + + rollouts = from_actors(workers.remote_workers()) + rollouts = rollouts.batch_across_shards() + rollouts = rollouts.transform(inner_adaptation_steps) + + # Metaupdate Step + train_op = rollouts.for_each( + MetaUpdate( + workers, config.maml_optimizer_steps, metric_collect, use_meta_env + ) + ) + return train_op diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py b/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py new file mode 100644 index 000000000000..d81bf8d834ec --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/maml/maml_tf_policy.py @@ -0,0 +1,520 @@ +import logging +from typing import Dict, List, Type, Union + +from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config +from ray.rllib.evaluation.postprocessing import ( + Postprocessing, + compute_gae_for_sample_batch, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.models.utils import get_activation_fn +from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 +from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2 +from ray.rllib.policy.sample_batch import 
SampleBatch +from ray.rllib.policy.tf_mixins import ( + LocalOptimizer, + ModelGradients, + ValueNetworkMixin, + compute_gradients, +) +from ray.rllib.utils import try_import_tf +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import TensorType + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +def PPOLoss( + dist_class, + actions, + curr_logits, + behaviour_logits, + advantages, + value_fn, + value_targets, + vf_preds, + cur_kl_coeff, + entropy_coeff, + clip_param, + vf_clip_param, + vf_loss_coeff, + clip_loss=False, +): + def surrogate_loss( + actions, curr_dist, prev_dist, advantages, clip_param, clip_loss + ): + pi_new_logp = curr_dist.logp(actions) + pi_old_logp = prev_dist.logp(actions) + + logp_ratio = tf.math.exp(pi_new_logp - pi_old_logp) + if clip_loss: + return tf.minimum( + advantages * logp_ratio, + advantages + * tf.clip_by_value(logp_ratio, 1 - clip_param, 1 + clip_param), + ) + return advantages * logp_ratio + + def kl_loss(curr_dist, prev_dist): + return prev_dist.kl(curr_dist) + + def entropy_loss(dist): + return dist.entropy() + + def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1): + # GAE Value Function Loss + vf_loss1 = tf.math.square(value_fn - value_targets) + vf_clipped = vf_preds + tf.clip_by_value( + value_fn - vf_preds, -vf_clip_param, vf_clip_param + ) + vf_loss2 = tf.math.square(vf_clipped - value_targets) + vf_loss = tf.maximum(vf_loss1, vf_loss2) + return vf_loss + + pi_new_dist = dist_class(curr_logits, None) + pi_old_dist = dist_class(behaviour_logits, None) + + surr_loss = tf.reduce_mean( + surrogate_loss( + actions, pi_new_dist, pi_old_dist, advantages, clip_param, clip_loss + ) + ) + kl_loss = tf.reduce_mean(kl_loss(pi_new_dist, pi_old_dist)) + vf_loss = tf.reduce_mean(vf_loss(value_fn, value_targets, vf_preds, vf_clip_param)) + entropy_loss = tf.reduce_mean(entropy_loss(pi_new_dist)) + + total_loss = -surr_loss + cur_kl_coeff * kl_loss + total_loss += 
vf_loss_coeff * vf_loss - entropy_coeff * entropy_loss + return total_loss, surr_loss, kl_loss, vf_loss, entropy_loss + + +# This is the computation graph for workers (inner adaptation steps) +class WorkerLoss(object): + def __init__( + self, + dist_class, + actions, + curr_logits, + behaviour_logits, + advantages, + value_fn, + value_targets, + vf_preds, + cur_kl_coeff, + entropy_coeff, + clip_param, + vf_clip_param, + vf_loss_coeff, + clip_loss=False, + ): + self.loss, surr_loss, kl_loss, vf_loss, ent_loss = PPOLoss( + dist_class=dist_class, + actions=actions, + curr_logits=curr_logits, + behaviour_logits=behaviour_logits, + advantages=advantages, + value_fn=value_fn, + value_targets=value_targets, + vf_preds=vf_preds, + cur_kl_coeff=cur_kl_coeff, + entropy_coeff=entropy_coeff, + clip_param=clip_param, + vf_clip_param=vf_clip_param, + vf_loss_coeff=vf_loss_coeff, + clip_loss=clip_loss, + ) + self.loss = tf1.Print(self.loss, ["Worker Adapt Loss", self.loss]) + + +# This is the Meta-Update computation graph for main (meta-update step) +class MAMLLoss(object): + def __init__( + self, + model, + config, + dist_class, + value_targets, + advantages, + actions, + behaviour_logits, + vf_preds, + cur_kl_coeff, + policy_vars, + obs, + num_tasks, + split, + inner_adaptation_steps=1, + entropy_coeff=0, + clip_param=0.3, + vf_clip_param=0.1, + vf_loss_coeff=1.0, + use_gae=True, + ): + self.config = config + self.num_tasks = num_tasks + self.inner_adaptation_steps = inner_adaptation_steps + self.clip_param = clip_param + self.dist_class = dist_class + self.cur_kl_coeff = cur_kl_coeff + + # Split episode tensors into [inner_adaptation_steps+1, num_tasks, -1] + self.obs = self.split_placeholders(obs, split) + self.actions = self.split_placeholders(actions, split) + self.behaviour_logits = self.split_placeholders(behaviour_logits, split) + self.advantages = self.split_placeholders(advantages, split) + self.value_targets = self.split_placeholders(value_targets, split) + 
self.vf_preds = self.split_placeholders(vf_preds, split) + + # Construct name to tensor dictionary for easier indexing + self.policy_vars = {} + for var in policy_vars: + self.policy_vars[var.name] = var + + # Calculate pi_new for PPO + pi_new_logits, current_policy_vars, value_fns = [], [], [] + for i in range(self.num_tasks): + pi_new, value_fn = self.feed_forward( + self.obs[0][i], self.policy_vars, policy_config=config["model"] + ) + pi_new_logits.append(pi_new) + value_fns.append(value_fn) + current_policy_vars.append(self.policy_vars) + + inner_kls = [] + inner_ppo_loss = [] + + # Recompute weights for inner-adaptation (same weights as workers) + for step in range(self.inner_adaptation_steps): + kls = [] + for i in range(self.num_tasks): + # PPO Loss Function (only Surrogate) + ppo_loss, _, kl_loss, _, _ = PPOLoss( + dist_class=dist_class, + actions=self.actions[step][i], + curr_logits=pi_new_logits[i], + behaviour_logits=self.behaviour_logits[step][i], + advantages=self.advantages[step][i], + value_fn=value_fns[i], + value_targets=self.value_targets[step][i], + vf_preds=self.vf_preds[step][i], + cur_kl_coeff=0.0, + entropy_coeff=entropy_coeff, + clip_param=clip_param, + vf_clip_param=vf_clip_param, + vf_loss_coeff=vf_loss_coeff, + clip_loss=False, + ) + adapted_policy_vars = self.compute_updated_variables( + ppo_loss, current_policy_vars[i] + ) + pi_new_logits[i], value_fns[i] = self.feed_forward( + self.obs[step + 1][i], + adapted_policy_vars, + policy_config=config["model"], + ) + current_policy_vars[i] = adapted_policy_vars + kls.append(kl_loss) + inner_ppo_loss.append(ppo_loss) + + self.kls = kls + inner_kls.append(kls) + + mean_inner_kl = tf.stack( + [tf.reduce_mean(tf.stack(inner_kl)) for inner_kl in inner_kls] + ) + self.mean_inner_kl = mean_inner_kl + + ppo_obj = [] + for i in range(self.num_tasks): + ppo_loss, surr_loss, kl_loss, val_loss, entropy_loss = PPOLoss( + dist_class=dist_class, + actions=self.actions[self.inner_adaptation_steps][i], + 
curr_logits=pi_new_logits[i], + behaviour_logits=self.behaviour_logits[self.inner_adaptation_steps][i], + advantages=self.advantages[self.inner_adaptation_steps][i], + value_fn=value_fns[i], + value_targets=self.value_targets[self.inner_adaptation_steps][i], + vf_preds=self.vf_preds[self.inner_adaptation_steps][i], + cur_kl_coeff=0.0, + entropy_coeff=entropy_coeff, + clip_param=clip_param, + vf_clip_param=vf_clip_param, + vf_loss_coeff=vf_loss_coeff, + clip_loss=True, + ) + ppo_obj.append(ppo_loss) + self.mean_policy_loss = surr_loss + self.mean_kl = kl_loss + self.mean_vf_loss = val_loss + self.mean_entropy = entropy_loss + self.inner_kl_loss = tf.reduce_mean( + tf.multiply(self.cur_kl_coeff, mean_inner_kl) + ) + self.loss = tf.reduce_mean(tf.stack(ppo_obj, axis=0)) + self.inner_kl_loss + self.loss = tf1.Print( + self.loss, ["Meta-Loss", self.loss, "Inner KL", self.mean_inner_kl] + ) + + def feed_forward(self, obs, policy_vars, policy_config): + # Hacky for now, reconstruct FC network with adapted weights + # @mluo: TODO for any network + def fc_network( + inp, network_vars, hidden_nonlinearity, output_nonlinearity, policy_config + ): + bias_added = False + x = inp + for name, param in network_vars.items(): + if "kernel" in name: + x = tf.matmul(x, param) + elif "bias" in name: + x = tf.add(x, param) + bias_added = True + else: + raise NameError + + if bias_added: + if "out" not in name: + x = hidden_nonlinearity(x) + elif "out" in name: + x = output_nonlinearity(x) + else: + raise NameError + bias_added = False + return x + + policyn_vars = {} + valuen_vars = {} + log_std = None + for name, param in policy_vars.items(): + if "value" in name: + valuen_vars[name] = param + elif "log_std" in name: + log_std = param + else: + policyn_vars[name] = param + + output_nonlinearity = tf.identity + hidden_nonlinearity = get_activation_fn(policy_config["fcnet_activation"]) + + pi_new_logits = fc_network( + obs, policyn_vars, hidden_nonlinearity, output_nonlinearity, 
policy_config + ) + if log_std is not None: + pi_new_logits = tf.concat([pi_new_logits, 0.0 * pi_new_logits + log_std], 1) + value_fn = fc_network( + obs, valuen_vars, hidden_nonlinearity, output_nonlinearity, policy_config + ) + + return pi_new_logits, tf.reshape(value_fn, [-1]) + + def compute_updated_variables(self, loss, network_vars): + grad = tf.gradients(loss, list(network_vars.values())) + adapted_vars = {} + for i, tup in enumerate(network_vars.items()): + name, var = tup + if grad[i] is None: + adapted_vars[name] = var + else: + adapted_vars[name] = var - self.config["inner_lr"] * grad[i] + return adapted_vars + + def split_placeholders(self, placeholder, split): + inner_placeholder_list = tf.split( + placeholder, tf.math.reduce_sum(split, axis=1), axis=0 + ) + placeholder_list = [] + for index, split_placeholder in enumerate(inner_placeholder_list): + placeholder_list.append(tf.split(split_placeholder, split[index], axis=0)) + return placeholder_list + + +class KLCoeffMixin: + def __init__(self, config): + self.kl_coeff_val = [config["kl_coeff"]] * config["inner_adaptation_steps"] + self.kl_target = self.config["kl_target"] + self.kl_coeff = tf1.get_variable( + initializer=tf.keras.initializers.Constant(self.kl_coeff_val), + name="kl_coeff", + shape=(config["inner_adaptation_steps"]), + trainable=False, + dtype=tf.float32, + ) + + def update_kls(self, sampled_kls): + for i, kl in enumerate(sampled_kls): + if kl < self.kl_target / 1.5: + self.kl_coeff_val[i] *= 0.5 + elif kl > 1.5 * self.kl_target: + self.kl_coeff_val[i] *= 2.0 + print(self.kl_coeff_val) + self.kl_coeff.load(self.kl_coeff_val, session=self.get_session()) + return self.kl_coeff_val + + +# We need this builder function because we want to share the same +# custom logics between TF1 dynamic and TF2 eager policies. +def get_maml_tf_policy(name: str, base: type) -> type: + """Construct a MAMLTFPolicy inheriting either dynamic or eager base policies. + + Args: + base: Base class for this policy. 
DynamicTFPolicyV2 or EagerTFPolicyV2. + + Returns: + A TF Policy to be used with MAML. + """ + + class MAMLTFPolicy(KLCoeffMixin, ValueNetworkMixin, base): + def __init__( + self, + observation_space, + action_space, + config, + existing_model=None, + existing_inputs=None, + ): + # First thing first, enable eager execution if necessary. + base.enable_eager_execution_if_necessary() + + validate_config(config) + + # Initialize base class. + base.__init__( + self, + observation_space, + action_space, + config, + existing_inputs=existing_inputs, + existing_model=existing_model, + ) + + KLCoeffMixin.__init__(self, config) + ValueNetworkMixin.__init__(self, config) + + # Create the `split` placeholder before initialize loss. + if self.framework == "tf": + self._loss_input_dict["split"] = tf1.placeholder( + tf.int32, + name="Meta-Update-Splitting", + shape=( + self.config["inner_adaptation_steps"] + 1, + self.config["num_workers"], + ), + ) + + # Note: this is a bit ugly, but loss and optimizer initialization must + # happen after all the MixIns are initialized. 
+ self.maybe_initialize_optimizer_and_loss() + + @override(base) + def loss( + self, + model: Union[ModelV2, "tf.keras.Model"], + dist_class: Type[TFActionDistribution], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + logits, state = model(train_batch) + self.cur_lr = self.config["lr"] + + if self.config["worker_index"]: + self.loss_obj = WorkerLoss( + dist_class=dist_class, + actions=train_batch[SampleBatch.ACTIONS], + curr_logits=logits, + behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], + advantages=train_batch[Postprocessing.ADVANTAGES], + value_fn=model.value_function(), + value_targets=train_batch[Postprocessing.VALUE_TARGETS], + vf_preds=train_batch[SampleBatch.VF_PREDS], + cur_kl_coeff=0.0, + entropy_coeff=self.config["entropy_coeff"], + clip_param=self.config["clip_param"], + vf_clip_param=self.config["vf_clip_param"], + vf_loss_coeff=self.config["vf_loss_coeff"], + clip_loss=False, + ) + else: + self.var_list = tf1.get_collection( + tf1.GraphKeys.TRAINABLE_VARIABLES, tf1.get_variable_scope().name + ) + self.loss_obj = MAMLLoss( + model=model, + dist_class=dist_class, + value_targets=train_batch[Postprocessing.VALUE_TARGETS], + advantages=train_batch[Postprocessing.ADVANTAGES], + actions=train_batch[SampleBatch.ACTIONS], + behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], + vf_preds=train_batch[SampleBatch.VF_PREDS], + cur_kl_coeff=self.kl_coeff, + policy_vars=self.var_list, + obs=train_batch[SampleBatch.CUR_OBS], + num_tasks=self.config["num_workers"], + split=train_batch["split"], + config=self.config, + inner_adaptation_steps=self.config["inner_adaptation_steps"], + entropy_coeff=self.config["entropy_coeff"], + clip_param=self.config["clip_param"], + vf_clip_param=self.config["vf_clip_param"], + vf_loss_coeff=self.config["vf_loss_coeff"], + use_gae=self.config["use_gae"], + ) + + return self.loss_obj.loss + + @override(base) + def optimizer( + self, + ) -> Union[ + "tf.keras.optimizers.Optimizer", 
List["tf.keras.optimizers.Optimizer"] + ]: + """ + Workers use simple SGD for inner adaptation + Meta-Policy uses Adam optimizer for meta-update + """ + if not self.config["worker_index"]: + return tf1.train.AdamOptimizer(learning_rate=self.config["lr"]) + return tf1.train.GradientDescentOptimizer( + learning_rate=self.config["inner_lr"] + ) + + @override(base) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + if self.config["worker_index"]: + return {"worker_loss": self.loss_obj.loss} + else: + return { + "cur_kl_coeff": tf.cast(self.kl_coeff, tf.float64), + "cur_lr": tf.cast(self.cur_lr, tf.float64), + "total_loss": self.loss_obj.loss, + "policy_loss": self.loss_obj.mean_policy_loss, + "vf_loss": self.loss_obj.mean_vf_loss, + "kl": self.loss_obj.mean_kl, + "inner_kl": self.loss_obj.mean_inner_kl, + "entropy": self.loss_obj.mean_entropy, + } + + @override(base) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + sample_batch = super().postprocess_trajectory(sample_batch) + return compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) + + @override(base) + def compute_gradients_fn( + self, optimizer: LocalOptimizer, loss: TensorType + ) -> ModelGradients: + return compute_gradients(self, optimizer, loss) + + MAMLTFPolicy.__name__ = name + MAMLTFPolicy.__qualname__ = name + + return MAMLTFPolicy + + +MAMLTF1Policy = get_maml_tf_policy("MAMLTF1Policy", DynamicTFPolicyV2) +MAMLTF2Policy = get_maml_tf_policy("MAMLTF2Policy", EagerTFPolicyV2) diff --git a/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py b/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py new file mode 100644 index 000000000000..4a16f5eb950a --- /dev/null +++ b/rllib_contrib/maml/src/rllib_maml/maml/maml_torch_policy.py @@ -0,0 +1,449 @@ +import logging +from typing import Dict, List, Type, Union + +import ray +from ray.rllib.algorithms.ppo.ppo_tf_policy import validate_config +from 
ray.rllib.evaluation.postprocessing import ( + Postprocessing, + compute_gae_for_sample_batch, +) +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.torch.torch_action_dist import TorchDistributionWrapper +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_mixins import ValueNetworkMixin +from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.numpy import convert_to_numpy +from ray.rllib.utils.torch_utils import apply_grad_clipping +from ray.rllib.utils.typing import TensorType + +torch, nn = try_import_torch() +logger = logging.getLogger(__name__) + +try: + import higher +except (ImportError, ModuleNotFoundError): + raise ImportError( + ( + "The MAML and MB-MPO algorithms require the `higher` module to be " + "installed! However, there was no installation found. You can install it " + "via `pip install higher`." 
+ ) + ) + + +def PPOLoss( + dist_class, + actions, + curr_logits, + behaviour_logits, + advantages, + value_fn, + value_targets, + vf_preds, + cur_kl_coeff, + entropy_coeff, + clip_param, + vf_clip_param, + vf_loss_coeff, + clip_loss=False, +): + def surrogate_loss( + actions, curr_dist, prev_dist, advantages, clip_param, clip_loss + ): + pi_new_logp = curr_dist.logp(actions) + pi_old_logp = prev_dist.logp(actions) + + logp_ratio = torch.exp(pi_new_logp - pi_old_logp) + if clip_loss: + return torch.min( + advantages * logp_ratio, + advantages * torch.clamp(logp_ratio, 1 - clip_param, 1 + clip_param), + ) + return advantages * logp_ratio + + def kl_loss(curr_dist, prev_dist): + return prev_dist.kl(curr_dist) + + def entropy_loss(dist): + return dist.entropy() + + def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1): + # GAE Value Function Loss + vf_loss1 = torch.pow(value_fn - value_targets, 2.0) + vf_clipped = vf_preds + torch.clamp( + value_fn - vf_preds, -vf_clip_param, vf_clip_param + ) + vf_loss2 = torch.pow(vf_clipped - value_targets, 2.0) + vf_loss = torch.max(vf_loss1, vf_loss2) + return vf_loss + + pi_new_dist = dist_class(curr_logits, None) + pi_old_dist = dist_class(behaviour_logits, None) + + surr_loss = torch.mean( + surrogate_loss( + actions, pi_new_dist, pi_old_dist, advantages, clip_param, clip_loss + ) + ) + kl_loss = torch.mean(kl_loss(pi_new_dist, pi_old_dist)) + vf_loss = torch.mean(vf_loss(value_fn, value_targets, vf_preds, vf_clip_param)) + entropy_loss = torch.mean(entropy_loss(pi_new_dist)) + + total_loss = -surr_loss + cur_kl_coeff * kl_loss + total_loss += vf_loss_coeff * vf_loss + total_loss -= entropy_coeff * entropy_loss + return total_loss, surr_loss, kl_loss, vf_loss, entropy_loss + + +# This is the computation graph for workers (inner adaptation steps) +class WorkerLoss(object): + def __init__( + self, + model, + dist_class, + actions, + curr_logits, + behaviour_logits, + advantages, + value_fn, + value_targets, + 
vf_preds, + cur_kl_coeff, + entropy_coeff, + clip_param, + vf_clip_param, + vf_loss_coeff, + clip_loss=False, + ): + self.loss, surr_loss, kl_loss, vf_loss, ent_loss = PPOLoss( + dist_class=dist_class, + actions=actions, + curr_logits=curr_logits, + behaviour_logits=behaviour_logits, + advantages=advantages, + value_fn=value_fn, + value_targets=value_targets, + vf_preds=vf_preds, + cur_kl_coeff=cur_kl_coeff, + entropy_coeff=entropy_coeff, + clip_param=clip_param, + vf_clip_param=vf_clip_param, + vf_loss_coeff=vf_loss_coeff, + clip_loss=clip_loss, + ) + + +# This is the Meta-Update computation graph for main (meta-update step) +class MAMLLoss(object): + def __init__( + self, + model, + config, + dist_class, + value_targets, + advantages, + actions, + behaviour_logits, + vf_preds, + cur_kl_coeff, + policy_vars, + obs, + num_tasks, + split, + meta_opt, + inner_adaptation_steps=1, + entropy_coeff=0, + clip_param=0.3, + vf_clip_param=0.1, + vf_loss_coeff=1.0, + use_gae=True, + ): + self.config = config + self.num_tasks = num_tasks + self.inner_adaptation_steps = inner_adaptation_steps + self.clip_param = clip_param + self.dist_class = dist_class + self.cur_kl_coeff = cur_kl_coeff + self.model = model + self.vf_clip_param = vf_clip_param + self.vf_loss_coeff = vf_loss_coeff + self.entropy_coeff = entropy_coeff + + # Split episode tensors into [inner_adaptation_steps+1, num_tasks, -1] + self.obs = self.split_placeholders(obs, split) + self.actions = self.split_placeholders(actions, split) + self.behaviour_logits = self.split_placeholders(behaviour_logits, split) + self.advantages = self.split_placeholders(advantages, split) + self.value_targets = self.split_placeholders(value_targets, split) + self.vf_preds = self.split_placeholders(vf_preds, split) + + inner_opt = torch.optim.SGD(model.parameters(), lr=config["inner_lr"]) + surr_losses = [] + val_losses = [] + kl_losses = [] + entropy_losses = [] + meta_losses = [] + kls = [] + + meta_opt.zero_grad() + for i in 
range(self.num_tasks): + with higher.innerloop_ctx(model, inner_opt, copy_initial_weights=False) as ( + fnet, + diffopt, + ): + inner_kls = [] + for step in range(self.inner_adaptation_steps): + ppo_loss, _, inner_kl_loss, _, _ = self.compute_losses( + fnet, step, i + ) + diffopt.step(ppo_loss) + inner_kls.append(inner_kl_loss) + kls.append(inner_kl_loss.detach()) + + # Meta Update + ppo_loss, s_loss, kl_loss, v_loss, ent = self.compute_losses( + fnet, self.inner_adaptation_steps - 1, i, clip_loss=True + ) + + inner_loss = torch.mean( + torch.stack( + [ + a * b + for a, b in zip( + self.cur_kl_coeff[ + i + * self.inner_adaptation_steps : (i + 1) + * self.inner_adaptation_steps + ], + inner_kls, + ) + ] + ) + ) + meta_loss = (ppo_loss + inner_loss) / self.num_tasks + meta_loss.backward() + + surr_losses.append(s_loss.detach()) + kl_losses.append(kl_loss.detach()) + val_losses.append(v_loss.detach()) + entropy_losses.append(ent.detach()) + meta_losses.append(meta_loss.detach()) + + meta_opt.step() + + # Stats Logging + self.mean_policy_loss = torch.mean(torch.stack(surr_losses)) + self.mean_kl_loss = torch.mean(torch.stack(kl_losses)) + self.mean_vf_loss = torch.mean(torch.stack(val_losses)) + self.mean_entropy = torch.mean(torch.stack(entropy_losses)) + self.mean_inner_kl = kls + self.loss = torch.sum(torch.stack(meta_losses)) + # Hacky, needed to bypass RLlib backend + self.loss.requires_grad = True + + def compute_losses(self, model, inner_adapt_iter, task_iter, clip_loss=False): + obs = self.obs[inner_adapt_iter][task_iter] + obs_dict = {"obs": obs, "obs_flat": obs} + curr_logits, _ = model.forward(obs_dict, None, None) + value_fns = model.value_function() + ppo_loss, surr_loss, kl_loss, val_loss, ent_loss = PPOLoss( + dist_class=self.dist_class, + actions=self.actions[inner_adapt_iter][task_iter], + curr_logits=curr_logits, + behaviour_logits=self.behaviour_logits[inner_adapt_iter][task_iter], + advantages=self.advantages[inner_adapt_iter][task_iter], + 
value_fn=value_fns, + value_targets=self.value_targets[inner_adapt_iter][task_iter], + vf_preds=self.vf_preds[inner_adapt_iter][task_iter], + cur_kl_coeff=0.0, + entropy_coeff=self.entropy_coeff, + clip_param=self.clip_param, + vf_clip_param=self.vf_clip_param, + vf_loss_coeff=self.vf_loss_coeff, + clip_loss=clip_loss, + ) + return ppo_loss, surr_loss, kl_loss, val_loss, ent_loss + + def split_placeholders(self, placeholder, split): + inner_placeholder_list = torch.split( + placeholder, torch.sum(split, dim=1).tolist(), dim=0 + ) + placeholder_list = [] + for index, split_placeholder in enumerate(inner_placeholder_list): + placeholder_list.append( + torch.split(split_placeholder, split[index].tolist(), dim=0) + ) + return placeholder_list + + +class KLCoeffMixin: + def __init__(self, config): + self.kl_coeff_val = ( + [config["kl_coeff"]] + * config["inner_adaptation_steps"] + * config["num_workers"] + ) + self.kl_target = self.config["kl_target"] + + def update_kls(self, sampled_kls): + for i, kl in enumerate(sampled_kls): + if kl < self.kl_target / 1.5: + self.kl_coeff_val[i] *= 0.5 + elif kl > 1.5 * self.kl_target: + self.kl_coeff_val[i] *= 2.0 + return self.kl_coeff_val + + +class MAMLTorchPolicy(ValueNetworkMixin, KLCoeffMixin, TorchPolicyV2): + """PyTorch policy class used with MAML.""" + + def __init__(self, observation_space, action_space, config): + config = dict(ray.rllib.algorithms.maml.maml.MAMLConfig(), **config) + validate_config(config) + + TorchPolicyV2.__init__( + self, + observation_space, + action_space, + config, + max_seq_len=config["model"]["max_seq_len"], + ) + + KLCoeffMixin.__init__(self, config) + ValueNetworkMixin.__init__(self, config) + + # TODO: Don't require users to call this manually. 
+ self._initialize_loss_from_dummy_batch() + + @override(TorchPolicyV2) + def loss( + self, + model: ModelV2, + dist_class: Type[TorchDistributionWrapper], + train_batch: SampleBatch, + ) -> Union[TensorType, List[TensorType]]: + """Constructs the loss function. + + Args: + model: The Model to calculate the loss for. + dist_class: The action distr. class. + train_batch: The training data. + + Returns: + The PPO loss tensor given the input batch. + """ + logits, state = model(train_batch) + self.cur_lr = self.config["lr"] + + if self.config["worker_index"]: + self.loss_obj = WorkerLoss( + model=model, + dist_class=dist_class, + actions=train_batch[SampleBatch.ACTIONS], + curr_logits=logits, + behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], + advantages=train_batch[Postprocessing.ADVANTAGES], + value_fn=model.value_function(), + value_targets=train_batch[Postprocessing.VALUE_TARGETS], + vf_preds=train_batch[SampleBatch.VF_PREDS], + cur_kl_coeff=0.0, + entropy_coeff=self.config["entropy_coeff"], + clip_param=self.config["clip_param"], + vf_clip_param=self.config["vf_clip_param"], + vf_loss_coeff=self.config["vf_loss_coeff"], + clip_loss=False, + ) + else: + self.var_list = model.named_parameters() + + # `split` may not exist yet (during test-loss call), use a dummy value. + # Cannot use get here due to train_batch being a TrackingDict. 
+ if "split" in train_batch: + split = train_batch["split"] + else: + split_shape = ( + self.config["inner_adaptation_steps"], + self.config["num_workers"], + ) + split_const = int( + train_batch["obs"].shape[0] // (split_shape[0] * split_shape[1]) + ) + split = torch.ones(split_shape, dtype=int) * split_const + self.loss_obj = MAMLLoss( + model=model, + dist_class=dist_class, + value_targets=train_batch[Postprocessing.VALUE_TARGETS], + advantages=train_batch[Postprocessing.ADVANTAGES], + actions=train_batch[SampleBatch.ACTIONS], + behaviour_logits=train_batch[SampleBatch.ACTION_DIST_INPUTS], + vf_preds=train_batch[SampleBatch.VF_PREDS], + cur_kl_coeff=self.kl_coeff_val, + policy_vars=self.var_list, + obs=train_batch[SampleBatch.CUR_OBS], + num_tasks=self.config["num_workers"], + split=split, + config=self.config, + inner_adaptation_steps=self.config["inner_adaptation_steps"], + entropy_coeff=self.config["entropy_coeff"], + clip_param=self.config["clip_param"], + vf_clip_param=self.config["vf_clip_param"], + vf_loss_coeff=self.config["vf_loss_coeff"], + use_gae=self.config["use_gae"], + meta_opt=self.meta_opt, + ) + + return self.loss_obj.loss + + @override(TorchPolicyV2) + def optimizer( + self, + ) -> Union[List["torch.optim.Optimizer"], "torch.optim.Optimizer"]: + """ + Workers use simple SGD for inner adaptation + Meta-Policy uses Adam optimizer for meta-update + """ + if not self.config["worker_index"]: + self.meta_opt = torch.optim.Adam( + self.model.parameters(), lr=self.config["lr"] + ) + return self.meta_opt + return torch.optim.SGD(self.model.parameters(), lr=self.config["inner_lr"]) + + @override(TorchPolicyV2) + def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: + if self.config["worker_index"]: + return convert_to_numpy({"worker_loss": self.loss_obj.loss}) + else: + return convert_to_numpy( + { + "cur_kl_coeff": self.kl_coeff_val, + "cur_lr": self.cur_lr, + "total_loss": self.loss_obj.loss, + "policy_loss": 
self.loss_obj.mean_policy_loss, + "vf_loss": self.loss_obj.mean_vf_loss, + "kl_loss": self.loss_obj.mean_kl_loss, + "inner_kl": self.loss_obj.mean_inner_kl, + "entropy": self.loss_obj.mean_entropy, + } + ) + + @override(TorchPolicyV2) + def extra_grad_process( + self, optimizer: "torch.optim.Optimizer", loss: TensorType + ) -> Dict[str, TensorType]: + return apply_grad_clipping(self, optimizer, loss) + + @override(TorchPolicyV2) + def postprocess_trajectory( + self, sample_batch, other_agent_batches=None, episode=None + ): + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak + # in torch (issue #6962). + # TODO: no_grad still necessary? + with torch.no_grad(): + return compute_gae_for_sample_batch( + self, sample_batch, other_agent_batches, episode + ) diff --git a/rllib_contrib/maml/tests/test_maml.py b/rllib_contrib/maml/tests/test_maml.py new file mode 100644 index 000000000000..774be4ecde41 --- /dev/null +++ b/rllib_contrib/maml/tests/test_maml.py @@ -0,0 +1,61 @@ +import unittest + +from gymnasium.wrappers import TimeLimit +from rllib_maml.envs.cartpole_mass import CartPoleMassEnv +from rllib_maml.envs.pendulum_mass import PendulumMassEnv +from rllib_maml.maml import MAMLConfig + +import ray +from ray.rllib.utils.test_utils import ( + check_compute_single_action, + check_train_results, + framework_iterator, +) +from ray.tune.registry import register_env + + +class TestMAML(unittest.TestCase): + @classmethod + def setUpClass(cls): + ray.init() + register_env( + "cartpole", + lambda env_cfg: TimeLimit(CartPoleMassEnv(), max_episode_steps=200), + ) + register_env( + "pendulum", + lambda env_cfg: TimeLimit(PendulumMassEnv(), max_episode_steps=200), + ) + + @classmethod + def tearDownClass(cls): + ray.shutdown() + + def test_maml_compilation(self): + """Test whether MAML can be built with all frameworks.""" + config = MAMLConfig().rollouts(num_rollout_workers=1) + + num_iterations = 1 + + # Test for tf framework 
(torch not implemented yet). + for fw in framework_iterator(config, frameworks=("tf", "torch")): + for env in ["cartpole", "pendulum"]: + if fw == "tf" and env.startswith("cartpole"): + continue + print("env={}".format(env)) + config.environment(env) + algo = config.build() + for i in range(num_iterations): + results = algo.train() + check_train_results(results) + print(results) + check_compute_single_action(algo, include_prev_action_reward=True) + algo.stop() + + +if __name__ == "__main__": + import sys + + import pytest + + sys.exit(pytest.main(["-v", __file__])) diff --git a/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h b/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h index 5f37fa0220d3..44bbaf0910b5 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h +++ b/src/mock/ray/gcs/gcs_server/gcs_resource_manager.h @@ -16,16 +16,20 @@ namespace ray { namespace gcs { - +static instrumented_io_context __mock_io_context_; +static ClusterResourceManager __mock_cluster_resource_manager_(__mock_io_context_); class MockGcsResourceManager : public GcsResourceManager { public: using GcsResourceManager::GcsResourceManager; explicit MockGcsResourceManager() - : GcsResourceManager( - io_context_, cluster_resource_manager_, NodeID::FromRandom(), nullptr) {} + : GcsResourceManager(__mock_io_context_, + __mock_cluster_resource_manager_, + NodeID::FromRandom(), + nullptr) {} explicit MockGcsResourceManager(ClusterResourceManager &cluster_resource_manager) : GcsResourceManager( - io_context_, cluster_resource_manager, NodeID::FromRandom(), nullptr) {} + __mock_io_context_, cluster_resource_manager, NodeID::FromRandom(), nullptr) { + } MOCK_METHOD(void, HandleGetResources, @@ -51,10 +55,6 @@ class MockGcsResourceManager : public GcsResourceManager { rpc::GetAllResourceUsageReply *reply, rpc::SendReplyCallback send_reply_callback), (override)); - - private: - instrumented_io_context io_context_; - ClusterResourceManager cluster_resource_manager_; }; } // namespace gcs 
diff --git a/src/mock/ray/pubsub/publisher.h b/src/mock/ray/pubsub/publisher.h index 77fd2fd68802..e3f5a4447999 100644 --- a/src/mock/ray/pubsub/publisher.h +++ b/src/mock/ray/pubsub/publisher.h @@ -59,7 +59,7 @@ class MockPublisherInterface : public PublisherInterface { const SubscriberID &subscriber_id, const std::optional &key_id), (override)); - MOCK_METHOD(void, Publish, (const rpc::PubMessage &pub_message), (override)); + MOCK_METHOD(void, Publish, (rpc::PubMessage pub_message), (override)); MOCK_METHOD(void, PublishFailure, (const rpc::ChannelType channel_type, const std::string &key_id), @@ -86,7 +86,7 @@ class MockPublisher : public Publisher { const SubscriberID &subscriber_id, const std::optional &key_id), (override)); - MOCK_METHOD(void, Publish, (const rpc::PubMessage &pub_message), (override)); + MOCK_METHOD(void, Publish, (rpc::PubMessage pub_message), (override)); MOCK_METHOD(void, PublishFailure, (const rpc::ChannelType channel_type, const std::string &key_id), diff --git a/src/ray/common/asio/asio_util.h b/src/ray/common/asio/asio_util.h index 0fa69d0f8e2e..1bd513c2fe4f 100644 --- a/src/ray/common/asio/asio_util.h +++ b/src/ray/common/asio/asio_util.h @@ -15,22 +15,23 @@ #pragma once #include +#include -inline std::shared_ptr execute_after_us( +template +std::shared_ptr execute_after( instrumented_io_context &io_context, std::function fn, - int64_t delay_microseconds) { + Duration delay_duration) { auto timer = std::make_shared(io_context); - timer->expires_from_now(boost::posix_time::microseconds(delay_microseconds)); + auto delay = boost::posix_time::microseconds( + std::chrono::duration_cast(delay_duration).count()); + timer->expires_from_now(delay); + timer->async_wait([timer, fn = std::move(fn)](const boost::system::error_code &error) { if (error != boost::asio::error::operation_aborted && fn) { fn(); } }); - return timer; -} -inline std::shared_ptr execute_after( - instrumented_io_context &io_context, std::function fn, int64_t milliseconds) 
{ - return execute_after_us(io_context, fn, milliseconds * 1000); + return timer; } diff --git a/src/ray/common/asio/instrumented_io_context.cc b/src/ray/common/asio/instrumented_io_context.cc index e0fadbda15fb..3e577826f519 100644 --- a/src/ray/common/asio/instrumented_io_context.cc +++ b/src/ray/common/asio/instrumented_io_context.cc @@ -41,7 +41,7 @@ void instrumented_io_context::post(std::function handler, boost::asio::io_context::post(std::move(handler)); } else { RAY_LOG(DEBUG) << "Deferring " << name << " by " << defer_us << "us"; - execute_after_us(*this, std::move(handler), defer_us); + execute_after(*this, std::move(handler), std::chrono::microseconds(defer_us)); } } @@ -65,7 +65,7 @@ void instrumented_io_context::post(std::function handler, } else { RAY_LOG(DEBUG) << "Deferring " << stats_handle->event_name << " by " << defer_us << "us"; - execute_after_us(*this, std::move(handler), defer_us); + execute_after(*this, std::move(handler), std::chrono::microseconds(defer_us)); } } diff --git a/src/ray/common/constants.h b/src/ray/common/constants.h index f7646ee0ebe5..bf83ecc5189c 100644 --- a/src/ray/common/constants.h +++ b/src/ray/common/constants.h @@ -17,6 +17,11 @@ #include #include +/// The precision of fractional resource quantity. +constexpr int kResourceUnitScaling = 10000; + +constexpr char kWorkerSetupHookKeyName[] = "FunctionsToRun"; + /// Length of Ray full-length IDs in bytes. constexpr size_t kUniqueIDSize = 28; diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 674c69ed4eeb..affb9ec7c9a5 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -476,6 +476,10 @@ RAY_CONFIG(uint64_t, task_events_max_buffer_size, 100 * 1000) /// the message size, and also the processing work on GCS. 
RAY_CONFIG(uint64_t, task_events_send_batch_size, 10 * 1000) +/// Max number of dropped task attempt info to be sent in a single rpc call to +/// GCS for task events in rpc::TaskEventsData +RAY_CONFIG(uint64_t, task_events_drop_task_attempt_batch_size, 10 * 1000) + /// Max number of profile events allowed for a single task when sent to GCS. /// NOTE: this limit only applies to the profile events per task in a single /// report gRPC call. A task could have more profile events in GCS from multiple @@ -800,7 +804,7 @@ RAY_CONFIG(bool, kill_idle_workers_of_terminated_job, true) RAY_CONFIG(std::vector, preload_python_modules, {}) // By default, raylet send a self liveness check to GCS every 60s -RAY_CONFIG(int64_t, raylet_liveness_self_check_interval_ms, 60000) +RAY_CONFIG(int64_t, raylet_liveness_self_check_interval_ms, 5000) // Instruct the CoreWorker to kill its child processes while // it exits. This prevents certain classes of resource leaks diff --git a/src/ray/common/ray_syncer/ray_syncer-inl.h b/src/ray/common/ray_syncer/ray_syncer-inl.h index 89e6758f18b0..8ec216a43f1c 100644 --- a/src/ray/common/ray_syncer/ray_syncer-inl.h +++ b/src/ray/common/ray_syncer/ray_syncer-inl.h @@ -63,6 +63,9 @@ class NodeState { return cluster_view_; } + /// Remove a node from the cluster view. 
+ bool RemoveNode(const std::string &node_id); + private: /// For local nodes std::array reporters_ = {nullptr}; diff --git a/src/ray/common/ray_syncer/ray_syncer.cc b/src/ray/common/ray_syncer/ray_syncer.cc index 3f56ffc94550..b050fc06aad0 100644 --- a/src/ray/common/ray_syncer/ray_syncer.cc +++ b/src/ray/common/ray_syncer/ray_syncer.cc @@ -16,7 +16,9 @@ #include +#include "ray/common/asio/asio_util.h" #include "ray/common/ray_config.h" + namespace ray { namespace syncer { @@ -52,6 +54,10 @@ std::optional NodeState::CreateSyncMessage(MessageType message_t return message; } +bool NodeState::RemoveNode(const std::string &node_id) { + return cluster_view_.erase(node_id) != 0; +} + bool NodeState::ConsumeSyncMessage(std::shared_ptr message) { auto ¤t = cluster_view_[message->node_id()][message->message_type()]; @@ -171,33 +177,42 @@ RaySyncer::RaySyncer(instrumented_io_context &io_context, RaySyncer::~RaySyncer() { *stopped_ = true; - io_context_.dispatch( - [reactors = sync_reactors_]() { - for (auto [_, reactor] : reactors) { - reactor->Disconnect(); + boost::asio::dispatch(io_context_.get_executor(), [reactors = sync_reactors_]() { + for (auto [_, reactor] : reactors) { + reactor->Disconnect(); + } + }); +} + +std::shared_ptr RaySyncer::GetSyncMessage( + const std::string &node_id, MessageType message_type) const { + auto task = std::packaged_task()>( + [this, &node_id, message_type]() -> std::shared_ptr { + auto &view = node_state_->GetClusterView(); + if (auto iter = view.find(node_id); iter != view.end()) { + return iter->second[message_type]; } - }, - ""); + return nullptr; + }); + + return boost::asio::dispatch(io_context_.get_executor(), std::move(task)).get(); } std::vector RaySyncer::GetAllConnectedNodeIDs() const { - std::promise> promise; - io_context_.dispatch( - [&]() { - std::vector nodes; - for (auto [node_id, _] : sync_reactors_) { - nodes.push_back(node_id); - } - promise.set_value(std::move(nodes)); - }, - ""); - return promise.get_future().get(); 
+ auto task = std::packaged_task()>([&]() { + std::vector nodes; + for (auto [node_id, _] : sync_reactors_) { + nodes.push_back(node_id); + } + return nodes; + }); + return boost::asio::dispatch(io_context_.get_executor(), std::move(task)).get(); } void RaySyncer::Connect(const std::string &node_id, std::shared_ptr channel) { - io_context_.dispatch( - [=]() { + boost::asio::dispatch( + io_context_.get_executor(), std::packaged_task([=]() { auto stub = ray::rpc::syncer::RaySyncer::NewStub(channel); auto reactor = new RayClientBidiReactor( /* remote_node_id */ node_id, @@ -208,21 +223,28 @@ void RaySyncer::Connect(const std::string &node_id, [this, channel](const std::string &node_id, bool restart) { sync_reactors_.erase(node_id); if (restart) { - RAY_LOG(INFO) << "Connection is broken. Reconnect to node: " - << NodeID::FromBinary(node_id); - Connect(node_id, channel); + execute_after( + io_context_, + [this, node_id, channel]() { + RAY_LOG(INFO) << "Connection is broken. Reconnect to node: " + << NodeID::FromBinary(node_id); + Connect(node_id, channel); + }, + /* delay_microseconds = */ std::chrono::milliseconds(2000)); + } else { + node_state_->RemoveNode(node_id); } }, /* stub */ std::move(stub)); Connect(reactor); reactor->StartCall(); - }, - ""); + })) + .get(); } void RaySyncer::Connect(RaySyncerBidiReactor *reactor) { - io_context_.dispatch( - [this, reactor]() { + boost::asio::dispatch( + io_context_.get_executor(), std::packaged_task([this, reactor]() { RAY_CHECK(sync_reactors_.find(reactor->GetRemoteNodeID()) == sync_reactors_.end()); sync_reactors_[reactor->GetRemoteNodeID()] = reactor; @@ -239,29 +261,24 @@ void RaySyncer::Connect(RaySyncerBidiReactor *reactor) { reactor->PushToSendingQueue(message); } } - }, - "RaySyncerConnect"); + })) + .get(); } void RaySyncer::Disconnect(const std::string &node_id) { - std::promise promise; - io_context_.dispatch( - [&]() { - auto iter = sync_reactors_.find(node_id); - if (iter == sync_reactors_.end()) { - 
promise.set_value(); - return; - } - - auto reactor = iter->second; - if (iter != sync_reactors_.end()) { - sync_reactors_.erase(iter); - } - reactor->Disconnect(); - promise.set_value(); - }, - "RaySyncerDisconnect"); - promise.get_future().get(); + auto task = std::packaged_task([&]() { + auto iter = sync_reactors_.find(node_id); + if (iter == sync_reactors_.end()) { + return; + } + + auto reactor = iter->second; + if (iter != sync_reactors_.end()) { + sync_reactors_.erase(iter); + } + reactor->Disconnect(); + }); + boost::asio::dispatch(io_context_.get_executor(), std::move(task)).get(); } void RaySyncer::Register(MessageType message_type, @@ -335,6 +352,7 @@ ServerBidiReactor *RaySyncerService::StartSync(grpc::CallbackServerContext *cont // No need to reconnect for server side. RAY_CHECK(!reconnect); syncer_.sync_reactors_.erase(node_id); + syncer_.node_state_->RemoveNode(node_id); }); RAY_LOG(DEBUG) << "Get connection from " << NodeID::FromBinary(reactor->GetRemoteNodeID()) << " to " diff --git a/src/ray/common/ray_syncer/ray_syncer.h b/src/ray/common/ray_syncer/ray_syncer.h index abe835c60e0a..0674ba704fcd 100644 --- a/src/ray/common/ray_syncer/ray_syncer.h +++ b/src/ray/common/ray_syncer/ray_syncer.h @@ -102,6 +102,16 @@ class RaySyncer { void Disconnect(const std::string &node_id); + /// Get the latest sync message sent from a specific node. + /// + /// \param node_id The node id where the message comes from. + /// \param message_type The message type of the component. + /// + /// \return The latest sync message sent from the node. If the node doesn't + /// have one, nullptr will be returned. + std::shared_ptr GetSyncMessage(const std::string &node_id, + MessageType message_type) const; + /// Register the components to the syncer module. Syncer will make sure eventually /// it'll have a global view of the cluster. /// @@ -129,6 +139,10 @@ class RaySyncer { /// version of message, false will be returned. 
bool OnDemandBroadcasting(MessageType message_type); + /// WARNING: DON'T USE THIS METHOD. It breaks the abstraction of the syncer. + /// Instead, register the component to the syncer and call + /// OnDemandBroadcasting. + /// /// Request trigger a broadcasting for a constructed message immediately instead of /// waiting for ray syncer to poll the message. /// diff --git a/src/ray/common/status.h b/src/ray/common/status.h index c0477e652383..bda9860ddc4a 100644 --- a/src/ray/common/status.h +++ b/src/ray/common/status.h @@ -114,6 +114,7 @@ enum class StatusCode : char { OutOfDisk = 28, ObjectUnknownOwner = 29, RpcError = 30, + OutOfResource = 31 }; #if defined(__clang__) @@ -241,6 +242,10 @@ class RAY_EXPORT Status { return Status(StatusCode::RpcError, msg, rpc_code); } + static Status OutOfResource(const std::string &msg) { + return Status(StatusCode::OutOfResource, msg); + } + static StatusCode StringToCode(const std::string &str); // Returns true iff the status indicates success. @@ -287,6 +292,8 @@ class RAY_EXPORT Status { bool IsRpcError() const { return code() == StatusCode::RpcError; } + bool IsOutOfResource() const { return code() == StatusCode::OutOfResource; } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
std::string ToString() const; diff --git a/src/ray/common/test/ray_syncer_test.cc b/src/ray/common/test/ray_syncer_test.cc index ae9391f1a6dc..de5e0617d824 100644 --- a/src/ray/common/test/ray_syncer_test.cc +++ b/src/ray/common/test/ray_syncer_test.cc @@ -593,7 +593,6 @@ TEST_F(SyncerTest, Broadcast) { // Change the resource in s2 and make sure s1 && s3 are correct s2.local_versions[0] = 1; - ASSERT_TRUE(s1.WaitUntil( [&s1, node_id = s2.syncer->GetLocalNodeID()]() mutable { return s1.received_versions[node_id][0] == 1; @@ -605,6 +604,27 @@ TEST_F(SyncerTest, Broadcast) { return s3.received_versions[node_id][0] == 1; }, 5)); + ASSERT_EQ( + 0, + s1.syncer->GetSyncMessage(s1.syncer->GetLocalNodeID(), MessageType::RESOURCE_VIEW) + ->version()); + ASSERT_EQ(nullptr, + s1.syncer->GetSyncMessage(NodeID::FromRandom().Binary(), + MessageType::RESOURCE_VIEW)); + s1.syncer->Disconnect(s3.syncer->GetLocalNodeID()); + RAY_LOG(INFO) << "s1.id=" << NodeID::FromBinary(s1.syncer->GetLocalNodeID()); + RAY_LOG(INFO) << "s3.id=" << NodeID::FromBinary(s3.syncer->GetLocalNodeID()); + + EXPECT_TRUE(s3.WaitUntil( + [&s3, node_id = s1.syncer->GetLocalNodeID()]() mutable { + return s3.syncer->node_state_->GetClusterView().count(node_id) == 0; + }, + 5)); + EXPECT_TRUE(s1.WaitUntil( + [&s1, node_id = s3.syncer->GetLocalNodeID()]() mutable { + return s1.syncer->node_state_->GetClusterView().count(node_id) == 0; + }, + 5)); } bool CompareViews(const std::vector &servers, diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 2b001719a72b..1d0f313c527e 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -20,6 +20,7 @@ #include +#include "absl/strings/str_format.h" #include "boost/fiber/all.hpp" #include "ray/common/bundle_spec.h" #include "ray/common/ray_config.h" @@ -116,7 +117,8 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ num_executed_tasks_(0), resource_ids_(new 
ResourceMappingType()), grpc_service_(io_service_, *this), - task_execution_service_work_(task_execution_service_) { + task_execution_service_work_(task_execution_service_), + exiting_detail_(std::nullopt) { RAY_LOG(DEBUG) << "Constructing CoreWorker, worker_id: " << worker_id; // Initialize task receivers. @@ -223,8 +225,8 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ // Initialize the task state event buffer. auto task_event_gcs_client = std::make_unique(options_.gcs_options); - task_event_buffer_ = - std::make_unique(std::move(task_event_gcs_client)); + task_event_buffer_ = std::make_unique( + std::move(task_event_gcs_client), worker_context_.GetCurrentJobID()); if (RayConfig::instance().task_events_report_interval_ms() > 0) { if (!task_event_buffer_->Start().ok()) { RAY_CHECK(!task_event_buffer_->Enabled()) << "TaskEventBuffer should be disabled."; @@ -242,7 +244,8 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ /*periodical_runner=*/&periodical_runner_, /*get_time_ms=*/[]() { return absl::GetCurrentTimeNanos() / 1e6; }, /*subscriber_timeout_ms=*/RayConfig::instance().subscriber_timeout_ms(), - /*publish_batch_size_=*/RayConfig::instance().publish_batch_size()); + /*publish_batch_size_=*/RayConfig::instance().publish_batch_size(), + GetWorkerID()); object_info_subscriber_ = std::make_unique( /*subscriber_id=*/GetWorkerID(), /*channels=*/ @@ -762,7 +765,11 @@ void CoreWorker::Exit( "tasks have finished" << ", exit_type=" << rpc::WorkerExitType_Name(exit_type) << ", detail=" << detail; - exiting_ = true; + { + absl::MutexLock lock(&mutex_); + RAY_CHECK_NE(detail, ""); + exiting_detail_ = std::optional{detail}; + } // Release the resources early in case draining takes a long time. 
RAY_CHECK_OK( local_raylet_client_->NotifyDirectCallTaskBlocked(/*release_resources*/ true)); @@ -782,9 +789,13 @@ void CoreWorker::Exit( exit_type, detail = std::move(detail), creation_task_exception_pb_bytes]() { - rpc::DrainAndResetServerCallExecutor(); - Disconnect(exit_type, detail, creation_task_exception_pb_bytes); + rpc::DrainServerCallExecutor(); KillChildProcs(); + // Disconnect should be put close to Shutdown + // https://github.com/ray-project/ray/pull/34883 + // TODO (iycheng) Improve the Process.h and make it able to monitor + // process liveness + Disconnect(exit_type, detail, creation_task_exception_pb_bytes); Shutdown(); }, "CoreWorker.Shutdown"); @@ -828,9 +839,13 @@ void CoreWorker::ForceExit(const rpc::WorkerExitType exit_type, const std::string &detail) { RAY_LOG(WARNING) << "Force exit the process. " << " Details: " << detail; - Disconnect(exit_type, detail); KillChildProcs(); + // Disconnect should be put close to Exit + // https://github.com/ray-project/ray/pull/34883 + // TODO (iycheng) Improve the Process.h and make it able to monitor + // process liveness + Disconnect(exit_type, detail); // NOTE(hchen): Use `QuickExit()` to force-exit this process without doing cleanup. 
// `exit()` will destruct static objects in an incorrect order, which will lead to @@ -1450,20 +1465,11 @@ void RetryObjectInPlasmaErrors(std::shared_ptr &memory_st for (auto iter = memory_object_ids.begin(); iter != memory_object_ids.end();) { auto current = iter++; const auto &mem_id = *current; - auto ready_iter = ready.find(mem_id); - if (ready_iter != ready.end()) { - std::vector> found; - RAY_CHECK_OK(memory_store->Get({mem_id}, - /*num_objects=*/1, - /*timeout=*/0, - worker_context, - /*remote_after_get=*/false, - &found)); - if (found.size() == 1 && found[0]->IsInPlasmaError()) { - plasma_object_ids.insert(mem_id); - ready.erase(ready_iter); - memory_object_ids.erase(current); - } + auto found = memory_store->GetIfExists(mem_id); + if (found != nullptr && found->IsInPlasmaError()) { + plasma_object_ids.insert(mem_id); + ready.erase(mem_id); + memory_object_ids.erase(current); } } } @@ -2174,17 +2180,27 @@ Status CoreWorker::WaitPlacementGroupReady(const PlacementGroupID &placement_gro } } -std::optional> CoreWorker::SubmitActorTask( - const ActorID &actor_id, - const RayFunction &function, - const std::vector> &args, - const TaskOptions &task_options) { +Status CoreWorker::SubmitActorTask(const ActorID &actor_id, + const RayFunction &function, + const std::vector> &args, + const TaskOptions &task_options, + std::vector &task_returns) { absl::ReleasableMutexLock lock(&actor_task_mutex_); + task_returns.clear(); + if (!direct_actor_submitter_->CheckActorExists(actor_id)) { + std::string err_msg = absl::StrFormat( + "Can't find actor %s. It might be dead or it's from a different cluster", + actor_id.Hex()); + return Status::NotFound(std::move(err_msg)); + } /// Check whether backpressure may happen at the very beginning of submitting a task. if (direct_actor_submitter_->PendingTasksFull(actor_id)) { RAY_LOG(DEBUG) << "Back pressure occurred while submitting the task to " << actor_id << ". 
" << direct_actor_submitter_->DebugString(actor_id); - return std::nullopt; + return Status::OutOfResource(absl::StrFormat( + "Too many tasks (%d) pending to be executed for actor %s. Please try later", + direct_actor_submitter_->NumPendingTasks(actor_id), + actor_id.Hex())); } auto actor_handle = actor_manager_->GetActorHandle(actor_id); @@ -2247,7 +2263,8 @@ std::optional> CoreWorker::SubmitActorTask( rpc_address_, task_spec, CurrentCallSite(), actor_handle->MaxTaskRetries()); RAY_CHECK_OK(direct_actor_submitter_->SubmitTask(task_spec)); } - return {std::move(returned_refs)}; + task_returns = std::move(returned_refs); + return Status::OK(); } Status CoreWorker::CancelTask(const ObjectID &object_id, @@ -2531,16 +2548,39 @@ Status CoreWorker::ExecuteTask( bool *is_retryable_error, std::string *application_error) { RAY_LOG(DEBUG) << "Executing task, task info = " << task_spec.DebugString(); + + // If the worker is exitted via Exit API, we shouldn't execute + // tasks anymore. + if (IsExiting()) { + absl::MutexLock lock(&mutex_); + return Status::IntentionalSystemExit( + absl::StrCat("Worker has already exited. Detail: ", exiting_detail_.value())); + } + task_queue_length_ -= 1; num_executed_tasks_ += 1; // Modify the worker's per function counters. 
std::string func_name = task_spec.FunctionDescriptor()->CallString(); + std::string actor_repr_name = ""; + { + absl::MutexLock lock(&mutex_); + actor_repr_name = actor_repr_name_; + } if (!options_.is_local_mode) { task_counter_.MovePendingToRunning(func_name, task_spec.IsRetry()); - task_manager_->RecordTaskStatusEvent( - task_spec.AttemptNumber(), task_spec, rpc::TaskStatus::RUNNING); + if (task_spec.IsActorTask() && !actor_repr_name.empty()) { + task_manager_->RecordTaskStatusEvent( + task_spec.AttemptNumber(), + task_spec, + rpc::TaskStatus::RUNNING, + /* include_task_info */ false, + worker::TaskStatusEvent::TaskStateUpdate(actor_repr_name)); + } else { + task_manager_->RecordTaskStatusEvent( + task_spec.AttemptNumber(), task_spec, rpc::TaskStatus::RUNNING); + } worker_context_.SetCurrentTask(task_spec); SetCurrentTaskId(task_spec.TaskId(), task_spec.AttemptNumber(), task_spec.GetName()); @@ -2925,6 +2965,8 @@ Status CoreWorker::GetAndPinArgsForExecutor(const TaskSpecification &task, void CoreWorker::HandlePushTask(rpc::PushTaskRequest request, rpc::PushTaskReply *reply, rpc::SendReplyCallback send_reply_callback) { + RAY_LOG(DEBUG) << "Received Handle Push Task " + << TaskID::FromBinary(request.task_spec().task_id()); if (HandleWrongRecipient(WorkerID::FromBinary(request.intended_worker_id()), send_reply_callback)) { return; @@ -2946,10 +2988,18 @@ void CoreWorker::HandlePushTask(rpc::PushTaskRequest request, // execution service. if (request.task_spec().type() == TaskType::ACTOR_TASK) { task_execution_service_.post( - [this, request, reply, send_reply_callback = std::move(send_reply_callback)] { + [this, + request, + reply, + send_reply_callback = std::move(send_reply_callback), + func_name] { // We have posted an exit task onto the main event loop, // so shouldn't bother executing any further work. 
- if (exiting_) return; + if (IsExiting()) { + RAY_LOG(INFO) << "Queued task " << func_name + << " won't be executed because the worker already exited."; + return; + } direct_task_receiver_->HandleTask(request, reply, send_reply_callback); }, "CoreWorker.HandlePushTaskActor"); @@ -2958,10 +3008,14 @@ void CoreWorker::HandlePushTask(rpc::PushTaskRequest request, // the task execution service. direct_task_receiver_->HandleTask(request, reply, send_reply_callback); task_execution_service_.post( - [=] { + [this, func_name] { // We have posted an exit task onto the main event loop, // so shouldn't bother executing any further work. - if (exiting_) return; + if (IsExiting()) { + RAY_LOG(INFO) << "Queued task " << func_name + << " won't be executed because the worker already exited."; + return; + } direct_task_receiver_->RunNormalTasksFromQueue(); }, "CoreWorker.HandlePushTask"); @@ -3121,7 +3175,7 @@ void CoreWorker::ProcessSubscribeForObjectEviction( pub_message.mutable_worker_object_eviction_message()->set_object_id( object_id.Binary()); - object_info_publisher_->Publish(pub_message); + object_info_publisher_->Publish(std::move(pub_message)); }; const auto object_id = ObjectID::FromBinary(message.object_id()); @@ -3483,6 +3537,7 @@ void CoreWorker::HandleGetCoreWorkerStats(rpc::GetCoreWorkerStatsRequest request stats->set_task_queue_length(task_queue_length_); stats->set_num_executed_tasks(num_executed_tasks_); stats->set_num_object_refs_in_scope(reference_counter_->NumObjectIDsInScope()); + stats->set_num_owned_objects(reference_counter_->NumObjectOwnedByUs()); stats->set_ip_address(rpc_address_.ip_address()); stats->set_port(rpc_address_.port()); stats->set_pid(getpid()); @@ -3804,13 +3859,19 @@ void CoreWorker::SetActorTitle(const std::string &title) { void CoreWorker::SetActorReprName(const std::string &repr_name) { RAY_CHECK(direct_task_receiver_ != nullptr); direct_task_receiver_->SetActorReprName(repr_name); + + absl::MutexLock lock(&mutex_); + actor_repr_name_ 
= repr_name; } rpc::JobConfig CoreWorker::GetJobConfig() const { return worker_context_.GetCurrentJobConfig(); } -bool CoreWorker::IsExiting() const { return exiting_; } +bool CoreWorker::IsExiting() const { + absl::MutexLock lock(&mutex_); + return exiting_detail_.has_value(); +} std::unordered_map> CoreWorker::GetActorCallStats() const { diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 088fd620644e..3ca65a09594e 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -348,6 +348,12 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const TaskID &GetCurrentTaskId() const { return worker_context_.GetCurrentTaskID(); } + int64_t GetCurrentTaskAttemptNumber() const { + return worker_context_.GetCurrentTask() != nullptr + ? worker_context_.GetCurrentTask()->AttemptNumber() + : 0; + } + JobID GetCurrentJobId() const { return worker_context_.GetCurrentJobID(); } const int64_t GetTaskDepth() const { return worker_context_.GetTaskDepth(); } @@ -806,12 +812,14 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \param[in] function The remote function to execute. /// \param[in] args Arguments of this task. /// \param[in] task_options Options for this task. - /// \return ObjectRefs returned by this task. - std::optional> SubmitActorTask( - const ActorID &actor_id, - const RayFunction &function, - const std::vector> &args, - const TaskOptions &task_options); + /// \param[out] task_returns The object returned by this task + /// + /// \return Status of this submission + Status SubmitActorTask(const ActorID &actor_id, + const RayFunction &function, + const std::vector> &args, + const TaskOptions &task_options, + std::vector &task_returns); /// Tell an actor to exit immediately, without completing outstanding work. /// @@ -1123,6 +1131,9 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Return true if the core worker is in the exit process. 
bool IsExiting() const; + /// Mark this worker is exiting. + void SetIsExiting(); + /// Retrieve the current statistics about tasks being received and executing. /// \return an unordered_map mapping function name to list of (num_received, /// num_executing, num_executed). It is a std map instead of absl due to its @@ -1150,6 +1161,20 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// \param stderr_end_offset End offset of the stderr for this task. void RecordTaskLogEnd(int64_t stdout_end_offset, int64_t stderr_end_offset) const; + /// (WORKER mode only) Gracefully exit the worker. `Graceful` means the worker will + /// exit when it drains all tasks and cleans all owned objects. + /// After this method is called, all the tasks in the queue will not be + /// executed. + /// + /// \param exit_type The reason why this worker process is disconnected. + /// \param exit_detail The detailed reason for a given exit. + /// \param creation_task_exception_pb_bytes It is given when the worker is + /// disconnected because the actor is failed due to its exception in its init method. + void Exit(const rpc::WorkerExitType exit_type, + const std::string &detail, + const std::shared_ptr &creation_task_exception_pb_bytes = + nullptr); + private: static json OverrideRuntimeEnv(json &child, const std::shared_ptr parent); @@ -1194,18 +1219,6 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Run the io_service_ event loop. This should be called in a background thread. void RunIOService(); - /// (WORKER mode only) Gracefully exit the worker. `Graceful` means the worker will - /// exit when it drains all tasks and cleans all owned objects. - /// - /// \param exit_type The reason why this worker process is disconnected. - /// \param exit_detail The detailed reason for a given exit. - /// \param creation_task_exception_pb_bytes It is given when the worker is - /// disconnected because the actor is failed due to its exception in its init method. 
- void Exit(const rpc::WorkerExitType exit_type, - const std::string &detail, - const std::shared_ptr &creation_task_exception_pb_bytes = - nullptr); - /// Forcefully exit the worker. `Force` means it will exit actor without draining /// or cleaning any resources. /// \param exit_type The reason why this worker process is disconnected. @@ -1549,6 +1562,9 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// Actor title that consists of class name, args, kwargs for actor construction. std::string actor_title_ GUARDED_BY(mutex_); + /// Actor repr name if overridden by the user, empty string if not. + std::string actor_repr_name_ GUARDED_BY(mutex_) = ""; + /// Number of tasks that have been pushed to the actor but not executed. std::atomic task_queue_length_; @@ -1599,9 +1615,9 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { ObjectID object_id, void *py_future); - /// we are shutting down and not running further tasks. - /// when exiting_ is set to true HandlePushTask becomes no-op. - std::atomic exiting_ = false; + /// The detailed reason why the core worker has exited. + /// If this value is set, it means the exit process has begun.
+ std::optional exiting_detail_ GUARDED_BY(mutex_); std::atomic is_shutdown_ = false; diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc index a5542eb700fb..13a420f16972 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc @@ -187,8 +187,8 @@ inline ActorCreationOptions ToActorCreationOptions(JNIEnv *env, max_restarts = env->GetIntField(actorCreationOptions, java_actor_creation_options_max_restarts); - max_task_retries = - env->GetIntField(actorCreationOptions, java_actor_creation_options_max_task_retries); + max_task_retries = env->GetIntField(actorCreationOptions, + java_actor_creation_options_max_task_retries); jobject java_resources = env->GetObjectField(actorCreationOptions, java_base_task_options_resources); resources = ToResources(env, java_resources); @@ -278,22 +278,21 @@ inline ActorCreationOptions ToActorCreationOptions(JNIEnv *env, placement_options.second); placement_group_scheduling_strategy->set_placement_group_capture_child_tasks(false); } - ActorCreationOptions actor_creation_options{ - max_restarts, - max_task_retries, - static_cast(max_concurrency), - resources, - resources, - dynamic_worker_options, - is_detached, - name, - ray_namespace, - is_async, - /*scheduling_strategy=*/scheduling_strategy, - serialized_runtime_env, - concurrency_groups, - /*execute_out_of_order*/ false, - max_pending_calls}; + ActorCreationOptions actor_creation_options{max_restarts, + max_task_retries, + static_cast(max_concurrency), + resources, + resources, + dynamic_worker_options, + is_detached, + name, + ray_namespace, + is_async, + /*scheduling_strategy=*/scheduling_strategy, + serialized_runtime_env, + concurrency_groups, + /*execute_out_of_order*/ false, + max_pending_calls}; return actor_creation_options; } @@ -439,10 +438,10 @@ 
Java_io_ray_runtime_task_NativeTaskSubmitter_nativeSubmitActorTask( auto task_args = ToTaskArgs(env, args); RAY_CHECK(callOptions != nullptr); auto task_options = ToTaskOptions(env, numReturns, callOptions); - - auto return_refs = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( - actor_id, ray_function, task_args, task_options); - if (!return_refs.has_value()) { + std::vector return_refs; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, ray_function, task_args, task_options, return_refs); + if (!status.ok()) { std::stringstream ss; ss << "The task " << ray_function.GetFunctionDescriptor()->ToString() << " could not be submitted to " << actor_id; @@ -456,7 +455,7 @@ Java_io_ray_runtime_task_NativeTaskSubmitter_nativeSubmitActorTask( } std::vector return_ids; - for (const auto &ref : return_refs.value()) { + for (const auto &ref : return_refs) { return_ids.push_back(ObjectID::FromBinary(ref.object_id())); } diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index 3c285cd97b4c..ba5321828207 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -251,6 +251,7 @@ bool ReferenceCounter::AddOwnedObjectInternal( if (object_id_refs_.count(object_id) != 0) { return false; } + num_objects_owned_by_us_++; RAY_LOG(DEBUG) << "Adding owned object " << object_id; // If the entry doesn't exist, we initialize the direct reference count to zero // because this corresponds to a submitted task whose return ObjectID will be created @@ -666,6 +667,9 @@ void ReferenceCounter::EraseReference(ReferenceTable::iterator it) { reconstructable_owned_objects_index_.erase(index_it); } freed_objects_.erase(it->first); + if (it->second.owned_by_us) { + num_objects_owned_by_us_--; + } object_id_refs_.erase(it); ShutdownIfNeeded(); } @@ -811,6 +815,11 @@ size_t ReferenceCounter::NumObjectIDsInScope() const { return object_id_refs_.size(); } +size_t 
ReferenceCounter::NumObjectOwnedByUs() const { + absl::MutexLock lock(&mutex_); + return num_objects_owned_by_us_; +} + std::unordered_set ReferenceCounter::GetAllInScopeObjectIDs() const { absl::MutexLock lock(&mutex_); std::unordered_set in_scope_object_ids; @@ -1169,7 +1178,7 @@ void ReferenceCounter::HandleRefRemoved(const ObjectID &object_id) { RAY_LOG(DEBUG) << "Publishing WaitForRefRemoved message for " << object_id << ", message has " << worker_ref_removed_message->borrowed_refs().size() << " borrowed references."; - object_info_publisher_->Publish(pub_message); + object_info_publisher_->Publish(std::move(pub_message)); } void ReferenceCounter::SetRefRemovedCallback( @@ -1459,7 +1468,7 @@ void ReferenceCounter::PushToLocationSubscribers(ReferenceTable::iterator it) { auto object_locations_msg = pub_message.mutable_worker_object_locations_message(); FillObjectInformationInternal(it, object_locations_msg); - object_info_publisher_->Publish(pub_message); + object_info_publisher_->Publish(std::move(pub_message)); } Status ReferenceCounter::FillObjectInformation( diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index daf79082dd9a..c16ee0392119 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -315,6 +315,8 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Returns the total number of ObjectIDs currently in scope. size_t NumObjectIDsInScope() const LOCKS_EXCLUDED(mutex_); + size_t NumObjectOwnedByUs() const LOCKS_EXCLUDED(mutex_); + /// Returns a set of all ObjectIDs currently in scope (i.e., nonzero reference count). std::unordered_set GetAllInScopeObjectIDs() const LOCKS_EXCLUDED(mutex_); @@ -1010,6 +1012,9 @@ class ReferenceCounter : public ReferenceCounterInterface, /// due to node failure. These objects are still in scope and need to be /// recovered. std::vector objects_to_recover_ GUARDED_BY(mutex_); + + /// Keep track of objects owned by this worker.
+ size_t num_objects_owned_by_us_ GUARDED_BY(mutex_) = 0; }; } // namespace core diff --git a/src/ray/core_worker/task_event_buffer.cc b/src/ray/core_worker/task_event_buffer.cc index 9bb80e5a04ea..086ecf314411 100644 --- a/src/ray/core_worker/task_event_buffer.cc +++ b/src/ray/core_worker/task_event_buffer.cc @@ -95,6 +95,10 @@ bool TaskStatusEvent::ToRpcTaskEventsOrDrop(rpc::TaskEvents *rpc_task_events) { state_update_->task_log_info_.value()); } + if (!state_update_->actor_repr_name_.empty()) { + dst_state_update->set_actor_repr_name(state_update_->actor_repr_name_); + } + return false; } @@ -134,8 +138,10 @@ bool TaskProfileEvent::ToRpcTaskEventsOrDrop(rpc::TaskEvents *rpc_task_events) { return false; } -TaskEventBufferImpl::TaskEventBufferImpl(std::unique_ptr gcs_client) - : work_guard_(boost::asio::make_work_guard(io_service_)), +TaskEventBufferImpl::TaskEventBufferImpl(std::unique_ptr gcs_client, + const JobID &job_id) + : job_id_(job_id), + work_guard_(boost::asio::make_work_guard(io_service_)), periodical_runner_(io_service_), gcs_client_(std::move(gcs_client)), buffer_() {} @@ -212,32 +218,37 @@ void TaskEventBufferImpl::AddTaskEvent(std::unique_ptr task_event) { if (!enabled_) { return; } - size_t num_profile_events_dropped = 0; - size_t num_status_events_dropped = 0; size_t num_add = 0; absl::MutexLock lock(&mutex_); size_t prev_size = buffer_.size(); + size_t num_profile_events_dropped = 0; { + if (task_attempts_dropped_.count(task_event->GetTaskAttempt())) { + // We are already dropping events for this task attempt. + // So don't add it to the buffer. + if (task_event->IsProfileEvent()) { + num_profile_events_dropped++; + } + return; + } + if (buffer_.full()) { const auto &to_evict = buffer_.front(); if (to_evict->IsProfileEvent()) { num_profile_events_dropped++; } else { - num_status_events_dropped++; + // Mark task attempt to be dropped. 
+ task_attempts_dropped_.insert(to_evict->GetTaskAttempt()); } } buffer_.push_back(std::move(task_event)); num_add = buffer_.size() - prev_size; } - + stats_counter_.Increment(TaskEventBufferCounter::kNumTaskEventsStored, num_add); stats_counter_.Increment( TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush, num_profile_events_dropped); - stats_counter_.Increment( - TaskEventBufferCounter::kNumTaskStatusEventDroppedSinceLastFlush, - num_status_events_dropped); - stats_counter_.Increment(TaskEventBufferCounter::kNumTaskEventsStored, num_add); } void TaskEventBufferImpl::FlushEvents(bool forced) { @@ -246,7 +257,7 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { } std::vector> to_send; to_send.reserve(RayConfig::instance().task_events_send_batch_size()); - + absl::flat_hash_set task_attempts_dropped; { absl::MutexLock lock(&mutex_); @@ -260,6 +271,18 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { return; } + // Get the data loss info. + size_t task_attempt_count = 0; + // iterate and erase task attempt dropped. + while (task_attempt_count < + RayConfig::instance().task_events_drop_task_attempt_batch_size() && + !task_attempts_dropped_.empty()) { + auto itr = task_attempts_dropped_.begin(); + task_attempts_dropped.insert(*itr); + task_attempts_dropped_.erase(itr); + task_attempt_count++; + } + // No data to send. if (buffer_.empty()) { return; @@ -274,9 +297,15 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { buffer_.erase(buffer_.begin(), buffer_.begin() + num_to_send); } - // Aggregate + // Aggregate data to be sent. absl::flat_hash_map agg_task_events; - auto to_rpc_event_fn = [this, &agg_task_events](std::unique_ptr &event) { + auto to_rpc_event_fn = [this, &agg_task_events, &task_attempts_dropped]( + std::unique_ptr &event) { + if (task_attempts_dropped.count(event->GetTaskAttempt())) { + // We are dropping all events from the task attempt due to data loss. 
+ return; + } + if (!agg_task_events.count(event->GetTaskAttempt())) { auto inserted = agg_task_events.insert({event->GetTaskAttempt(), rpc::TaskEvents()}); @@ -284,11 +313,16 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { } auto itr = agg_task_events.find(event->GetTaskAttempt()); - - if (event->ToRpcTaskEventsOrDrop(&(itr->second))) { - RAY_CHECK(event->IsProfileEvent()); - stats_counter_.Increment( - TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush); + if (event->IsProfileEvent()) { + if (event->ToRpcTaskEventsOrDrop(&(itr->second))) { + // We are dropping profile events since there are too many for a single task + // attempt. This happens frequently for driver task submitting many tasks. + stats_counter_.Increment( + TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush); + } + } else { + // We will not be dropping any status changes during conversion to rpc::TaskEvents. + RAY_CHECK(!event->ToRpcTaskEventsOrDrop(&(itr->second))); } }; std::for_each(to_send.begin(), to_send.end(), to_rpc_event_fn); @@ -296,39 +330,30 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { // Convert to rpc::TaskEventsData auto data = std::make_unique(); size_t num_task_events = to_send.size(); - size_t num_profile_event_to_send = 0; - size_t num_status_event_to_send = 0; for (auto &[_task_attempt, task_event] : agg_task_events) { auto events_by_task = data->add_events_by_task(); - if (task_event.has_profile_events()) { - num_profile_event_to_send++; - } - if (task_event.has_state_updates()) { - num_status_event_to_send++; - } *events_by_task = std::move(task_event); } - // Send and reset the counters - stats_counter_.Decrement(TaskEventBufferCounter::kNumTaskEventsStored, to_send.size()); - size_t num_profile_task_events_dropped = stats_counter_.Get( + // Add the data loss info. 
+ auto num_profile_events_dropped_since_last_flush = stats_counter_.Get( TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush); + data->set_num_profile_events_dropped(num_profile_events_dropped_since_last_flush); + // Reset the counter stats_counter_.Decrement( TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush, - num_profile_task_events_dropped); - stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskProfileEventDropped, - num_profile_task_events_dropped); - - size_t num_status_task_events_dropped = stats_counter_.Get( - TaskEventBufferCounter::kNumTaskStatusEventDroppedSinceLastFlush); - stats_counter_.Decrement( - TaskEventBufferCounter::kNumTaskStatusEventDroppedSinceLastFlush, - num_status_task_events_dropped); - stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskStatusEventDropped, - num_status_task_events_dropped); + num_profile_events_dropped_since_last_flush); + data->set_job_id(job_id_.Binary()); + + for (auto &task_attempt : task_attempts_dropped) { + rpc::TaskAttempt rpc_task_attempt; + rpc_task_attempt.set_task_id(task_attempt.first.Binary()); + rpc_task_attempt.set_attempt_number(task_attempt.second); + *(data->add_dropped_task_attempts()) = rpc_task_attempt; + } - data->set_num_profile_task_events_dropped(num_profile_task_events_dropped); - data->set_num_status_task_events_dropped(num_status_task_events_dropped); + // Send and reset the counters + stats_counter_.Decrement(TaskEventBufferCounter::kNumTaskEventsStored, to_send.size()); gcs::TaskInfoAccessor *task_accessor; { @@ -344,6 +369,11 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { RAY_LOG(WARNING) << "Failed to push " << num_task_events << " task state events to GCS. Data will be lost. 
[status=" << status.ToString() << "]"; + stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskEventsDropped, + num_task_events); + } else { + stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskEventsReported, + num_task_events); } grpc_in_progress_ = false; }; @@ -360,10 +390,8 @@ void TaskEventBufferImpl::FlushEvents(bool forced) { grpc_in_progress_ = false; // Fail to send, currently dropping events. - stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskProfileEventDropped, - num_profile_event_to_send); - stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskStatusEventDropped, - num_status_event_to_send); + stats_counter_.Increment(TaskEventBufferCounter::kTotalNumTaskEventsDropped, + num_task_events); } } } @@ -387,11 +415,7 @@ const std::string TaskEventBufferImpl::DebugString() { << 1.0 * stats[TaskEventBufferCounter::kTotalTaskEventsBytesReported] / 1024 / 1024 << " MiB" << "\n\ttotal number of task events sent: " - << stats[TaskEventBufferCounter::kTotalTaskEventsReported] - << "\n\tnum status task events dropped: " - << stats[TaskEventBufferCounter::kTotalNumTaskProfileEventDropped] - << "\n\tnum profile task events dropped: " - << stats[TaskEventBufferCounter::kTotalNumTaskStatusEventDropped] << "\n"; + << stats[TaskEventBufferCounter::kTotalNumTaskEventsReported]; return ss.str(); } diff --git a/src/ray/core_worker/task_event_buffer.h b/src/ray/core_worker/task_event_buffer.h index 1279471f86ca..fddc873bf601 100644 --- a/src/ray/core_worker/task_event_buffer.h +++ b/src/ray/core_worker/task_event_buffer.h @@ -50,14 +50,16 @@ class TaskEvent { virtual ~TaskEvent() = default; - /// Convert itself a rpc::TaskEvents or drop itself due to data limit. + /// Convert itself a rpc::TaskEvents or drop it if there is data loss. /// /// NOTE: this method will modify internal states by moving fields to the /// rpc::TaskEvents. /// \param[out] rpc_task_events The rpc task event to be filled. 
- /// \return If it's dropped due to data limit. + /// \return True if data is dropped, false otherwise. virtual bool ToRpcTaskEventsOrDrop(rpc::TaskEvents *rpc_task_events) = 0; + virtual JobID GetJobId() const { return job_id_; } + /// If it is a profile event. virtual bool IsProfileEvent() const = 0; @@ -88,6 +90,9 @@ class TaskStatusEvent : public TaskEvent { TaskStateUpdate(const rpc::TaskLogInfo &task_log_info) : task_log_info_(task_log_info) {} + TaskStateUpdate(const std::string &actor_repr_name) + : actor_repr_name_(actor_repr_name) {} + private: friend class TaskStatusEvent; @@ -99,6 +104,8 @@ class TaskStatusEvent : public TaskEvent { const absl::optional error_info_ = absl::nullopt; /// Task log info. const absl::optional task_log_info_ = absl::nullopt; + /// Actor task repr name. + const std::string actor_repr_name_ = ""; }; explicit TaskStatusEvent( @@ -158,13 +165,17 @@ class TaskProfileEvent : public TaskEvent { /// @brief An enum class defining counters to be used in TaskEventBufferImpl. enum TaskEventBufferCounter { - kNumTaskProfileEventDroppedSinceLastFlush, - kNumTaskStatusEventDroppedSinceLastFlush, + /// Number of task events stored in the buffer. kNumTaskEventsStored, - /// Below stats are updated every flush. - kTotalNumTaskProfileEventDropped, - kTotalNumTaskStatusEventDropped, - kTotalTaskEventsReported, + /// Number of dropped task attempt stored in the buffer. + kNumTaskAttemptsDroppedStored, + /// Total number of task events dropped on the worker due to network issue. + kTotalNumTaskEventsDropped, + /// Number of profile events dropped since the last report. + kNumTaskProfileEventDroppedSinceLastFlush, + /// Total number of task events reported to GCS. + kTotalNumTaskEventsReported, + /// Total bytes of task events reported to GCS. 
kTotalTaskEventsBytesReported, }; @@ -173,13 +184,24 @@ enum TaskEventBufferCounter { /// /// Dropping of task events /// ======================== -/// Task events will be lost in the below cases for now: +/// Task events from task attempts will be lost in the below cases for now: /// 1. If any of the gRPC call failed, the task events will be dropped and warnings /// logged. This is probably fine since this usually indicated a much worse issue. /// /// 2. More than `RAY_task_events_max_buffer_size` tasks have been stored -/// in the buffer, any new task events will be dropped. In this case, the number of -/// dropped task events will also be included in the next flush to surface this. +/// in the buffer, oldest events in the buffer will be dropped. In this case, the task +/// attempts info will also be included in subsequent flush to GCS. +/// +/// For profiling events: +/// - If the number of profiling events for a task attempt exceeds the limit specified +/// by `RAY_task_events_max_num_profile_events_for_task`, any new profiling events will +/// be dropped. Dropping of profile events will not result in the entire task attempt +/// being dropped. +/// +/// For task status events: +/// - If any task status change event is dropped, the entire task attempt will be +/// dropped. The dropped task attempt info will be sent to GCS, and GCS will then drop +/// all new and existing events from the task attempt. /// /// No overloading of GCS /// ===================== @@ -244,7 +266,8 @@ class TaskEventBufferImpl : public TaskEventBuffer { /// Constructor /// /// \param gcs_client GCS client - TaskEventBufferImpl(std::unique_ptr gcs_client); + /// \param job_id Corresponding Job ID + TaskEventBufferImpl(std::unique_ptr gcs_client, const JobID &job_id); void AddTaskEvent(std::unique_ptr task_event) LOCKS_EXCLUDED(mutex_) override; @@ -266,22 +289,16 @@ class TaskEventBufferImpl : public TaskEventBuffer { } /// Test only functions. 
- size_t GetTotalNumStatusTaskEventsDropped() { - return stats_counter_.Get(TaskEventBufferCounter::kTotalNumTaskStatusEventDropped); + size_t GetNumTaskEventsDropped() { + return stats_counter_.Get(TaskEventBufferCounter::kTotalNumTaskEventsDropped); } - /// Test only functions. - size_t GetNumStatusTaskEventsDroppedSinceLastFlush() { - return stats_counter_.Get( - TaskEventBufferCounter::kNumTaskStatusEventDroppedSinceLastFlush); + /// Test only function. + size_t GetNumTaskEventsReported() { + return stats_counter_.Get(TaskEventBufferCounter::kTotalNumTaskEventsReported); } - /// Test only functions. - size_t GetTotalNumProfileTaskEventsDropped() { - return stats_counter_.Get(TaskEventBufferCounter::kTotalNumTaskProfileEventDropped); - } - - /// Test only functions. + /// Test only function. size_t GetNumProfileTaskEventsDroppedSinceLastFlush() { return stats_counter_.Get( TaskEventBufferCounter::kNumTaskProfileEventDroppedSinceLastFlush); @@ -293,9 +310,15 @@ class TaskEventBufferImpl : public TaskEventBuffer { return gcs_client_.get(); } + /// Test only functions. + const JobID &GetJobId() const { return job_id_; } + /// Mutex guarding task_events_data_. absl::Mutex mutex_; + /// Job id. + const JobID job_id_; + /// IO service event loop owned by TaskEventBuffer. instrumented_io_context io_service_; @@ -325,6 +348,10 @@ class TaskEventBufferImpl : public TaskEventBuffer { /// process them quick enough. std::atomic grpc_in_progress_ = false; + /// Task attempts dropped on this worker that are to be reported to GCS. Reported + /// data loss will be removed. 
+ absl::flat_hash_set task_attempts_dropped_ GUARDED_BY(mutex_); + FRIEND_TEST(TaskEventBufferTestManualStart, TestGcsClientFail); FRIEND_TEST(TaskEventBufferTestBatchSend, TestBatchedSend); FRIEND_TEST(TaskEventBufferTest, TestAddEvent); diff --git a/src/ray/core_worker/test/core_worker_test.cc b/src/ray/core_worker/test/core_worker_test.cc index cba46a14d733..31a97db7bd4f 100644 --- a/src/ray/core_worker/test/core_worker_test.cc +++ b/src/ray/core_worker/test/core_worker_test.cc @@ -202,10 +202,10 @@ int CoreWorkerTest::GetActorPid(const ActorID &actor_id, TaskOptions options{"", 1, resources}; RayFunction func{Language::PYTHON, FunctionDescriptorBuilder::BuildPython("GetWorkerPid", "", "", "")}; - - auto return_ids = ObjectRefsToIds(CoreWorkerProcess::GetCoreWorker() - .SubmitActorTask(actor_id, func, args, options) - .value()); + std::vector task_returns; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); std::vector> results; RAY_CHECK_OK(CoreWorkerProcess::GetCoreWorker().Get(return_ids, -1, &results)); @@ -298,8 +298,10 @@ void CoreWorkerTest::TestActorTask(std::unordered_map &reso Language::PYTHON, FunctionDescriptorBuilder::BuildPython("MergeInputArgsAsOutput", "", "", "")); - auto return_ids = - ObjectRefsToIds(driver.SubmitActorTask(actor_id, func, args, options).value()); + std::vector task_returns; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); ASSERT_EQ(return_ids.size(), 1); std::vector> results; @@ -344,8 +346,10 @@ void CoreWorkerTest::TestActorTask(std::unordered_map &reso RayFunction func( Language::PYTHON, FunctionDescriptorBuilder::BuildPython("MergeInputArgsAsOutput", "", "", "")); - auto return_ids = - ObjectRefsToIds(driver.SubmitActorTask(actor_id, func, args, options).value()); + std::vector task_returns; + 
auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); ASSERT_EQ(return_ids.size(), 1); @@ -409,8 +413,10 @@ void CoreWorkerTest::TestActorRestart( Language::PYTHON, FunctionDescriptorBuilder::BuildPython("MergeInputArgsAsOutput", "", "", "")); - auto return_ids = - ObjectRefsToIds(driver.SubmitActorTask(actor_id, func, args, options).value()); + std::vector task_returns; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); ASSERT_EQ(return_ids.size(), 1); // Verify if it's expected data. std::vector> results; @@ -453,8 +459,10 @@ void CoreWorkerTest::TestActorFailure( Language::PYTHON, FunctionDescriptorBuilder::BuildPython("MergeInputArgsAsOutput", "", "", "")); - auto return_ids = - ObjectRefsToIds(driver.SubmitActorTask(actor_id, func, args, options).value()); + std::vector task_returns; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); ASSERT_EQ(return_ids.size(), 1); all_results.emplace_back(std::make_pair(return_ids[0], buffer1)); @@ -611,8 +619,10 @@ TEST_F(SingleNodeTest, TestDirectActorTaskSubmissionPerf) { Language::PYTHON, FunctionDescriptorBuilder::BuildPython("MergeInputArgsAsOutput", "", "", "")); - auto return_ids = - ObjectRefsToIds(driver.SubmitActorTask(actor_id, func, args, options).value()); + std::vector task_returns; + auto status = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( + actor_id, func, args, options, task_returns); + auto return_ids = ObjectRefsToIds(task_returns); ASSERT_EQ(return_ids.size(), 1); object_ids.emplace_back(return_ids[0]); } diff --git a/src/ray/core_worker/test/reference_count_test.cc b/src/ray/core_worker/test/reference_count_test.cc index de5de4146411..51b5d51523ac 100644 --- 
a/src/ray/core_worker/test/reference_count_test.cc +++ b/src/ray/core_worker/test/reference_count_test.cc @@ -136,7 +136,8 @@ class MockDistributedSubscriber : public pubsub::SubscriberInterface { subscriber_id, /*get_time_ms=*/[]() { return 1.0; }, /*subscriber_timeout_ms=*/1000, - /*publish_batch_size=*/1000)), + /*publish_batch_size=*/1000, + UniqueID::FromRandom())), client_factory_(client_factory) {} ~MockDistributedSubscriber() = default; @@ -249,7 +250,7 @@ class MockDistributedPublisher : public pubsub::PublisherInterface { void PublishFailure(const rpc::ChannelType channel_type, const std::string &key_id_binary) {} - void Publish(const rpc::PubMessage &pub_message) { + void Publish(rpc::PubMessage pub_message) { if (pub_message.channel_type() == rpc::ChannelType::WORKER_OBJECT_LOCATIONS_CHANNEL) { // TODO(swang): Test object locations pubsub too. return; diff --git a/src/ray/core_worker/test/task_event_buffer_test.cc b/src/ray/core_worker/test/task_event_buffer_test.cc index 7621294d6efc..40a46f3cd1f3 100644 --- a/src/ray/core_worker/test/task_event_buffer_test.cc +++ b/src/ray/core_worker/test/task_event_buffer_test.cc @@ -44,7 +44,7 @@ class TaskEventBufferTest : public ::testing::Test { )"); task_event_buffer_ = std::make_unique( - std::make_unique()); + std::make_unique(), JobID::FromInt(1)); } virtual void SetUp() { RAY_CHECK_OK(task_event_buffer_->Start(/*auto_flush*/ false)); } @@ -66,13 +66,15 @@ class TaskEventBufferTest : public ::testing::Test { task_id, JobID::FromInt(0), attempt_num, rpc::TaskStatus::RUNNING, running_ts); } - std::unique_ptr GenProfileTaskEvent(TaskID task_id, int32_t attempt_num) { + std::unique_ptr GenProfileTaskEvent(TaskID task_id, + int32_t attempt_num, + JobID job_id = JobID::FromInt(0)) { return std::make_unique( - task_id, JobID::FromInt(0), attempt_num, "", "", "", "test_event", 1); + task_id, job_id, attempt_num, "", "", "", "test_event", 1); } - static void CompareTaskEventData(const rpc::TaskEventData &actual_data, 
- const rpc::TaskEventData &expect_data) { + static void CompareTaskEventData(rpc::TaskEventData &actual_data, + rpc::TaskEventData &expect_data) { // Sort and compare std::vector actual_events; std::vector expect_events; @@ -89,10 +91,22 @@ class TaskEventBufferTest : public ::testing::Test { EXPECT_EQ(actual_events[i], expect_events[i]); } - EXPECT_EQ(actual_data.num_profile_task_events_dropped(), - expect_data.num_profile_task_events_dropped()); - EXPECT_EQ(actual_data.num_status_task_events_dropped(), - expect_data.num_status_task_events_dropped()); + // sort and compare data loss + std::vector actual_attempts; + std::vector expect_attempts; + for (const auto &t : actual_data.dropped_task_attempts()) { + actual_attempts.push_back(t.DebugString()); + } + for (const auto &t : expect_data.dropped_task_attempts()) { + expect_attempts.push_back(t.DebugString()); + } + std::sort(actual_attempts.begin(), actual_attempts.end()); + std::sort(expect_attempts.begin(), expect_attempts.end()); + + EXPECT_EQ(actual_attempts.size(), expect_attempts.size()); + for (size_t i = 0; i < actual_attempts.size(); ++i) { + EXPECT_EQ(actual_attempts[i], expect_attempts[i]); + } } std::unique_ptr task_event_buffer_ = nullptr; @@ -174,8 +188,6 @@ TEST_F(TaskEventBufferTest, TestFlushEvents) { // Expect data flushed match rpc::TaskEventData expected_data; - expected_data.set_num_profile_task_events_dropped(0); - expected_data.set_num_status_task_events_dropped(0); for (const auto &task_event : task_events) { auto event = expected_data.add_events_by_task(); task_event->ToRpcTaskEventsOrDrop(event); @@ -232,9 +244,9 @@ TEST_F(TaskEventBufferTest, TestFailedFlush) { task_event_buffer_->FlushEvents(false); // Expect the number of dropped events incremented. 
- ASSERT_EQ(task_event_buffer_->GetTotalNumStatusTaskEventsDropped(), num_status_events); - ASSERT_EQ(task_event_buffer_->GetTotalNumProfileTaskEventsDropped(), - num_profile_events); + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsDropped(), + num_status_events + num_profile_events); + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsReported(), 0); // Adding some more events for (size_t i = 0; i < num_status_events + num_profile_events; ++i) { @@ -246,11 +258,12 @@ TEST_F(TaskEventBufferTest, TestFailedFlush) { } } - // Flush successfully will reset the num events dropped. + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), + num_status_events + num_profile_events); task_event_buffer_->FlushEvents(false); - ASSERT_EQ(task_event_buffer_->GetTotalNumStatusTaskEventsDropped(), num_status_events); - ASSERT_EQ(task_event_buffer_->GetTotalNumProfileTaskEventsDropped(), - num_profile_events); + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsDropped(), + num_status_events + num_profile_events); + ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), 0); } TEST_F(TaskEventBufferTest, TestBackPressure) { @@ -339,33 +352,18 @@ TEST_F(TaskEventBufferTestBatchSend, TestBatchedSend) { TEST_F(TaskEventBufferTest, TestBufferSizeLimit) { size_t num_limit = 100; // Synced with test setup - size_t num_profile = 50; - size_t num_status = 50; // Generate 2 batches of events each, where batch 1 will be evicted by batch 2. 
- std::vector> profile_events_1; std::vector> status_events_1; - std::vector> profile_events_2; std::vector> status_events_2; // Generate data - for (size_t i = 0; i < 50; ++i) { + for (size_t i = 0; i < num_limit; ++i) { status_events_1.push_back(GenStatusTaskEvent(RandomTaskId(), 0)); status_events_2.push_back(GenStatusTaskEvent(RandomTaskId(), 0)); - profile_events_1.push_back(GenProfileTaskEvent(RandomTaskId(), 0)); - profile_events_2.push_back(GenProfileTaskEvent(RandomTaskId(), 0)); } rpc::TaskEventData expected_data; - expected_data.set_num_profile_task_events_dropped(num_profile); - expected_data.set_num_status_task_events_dropped(num_status); - for (const auto &event_ptr : profile_events_2) { - auto expect_event = expected_data.add_events_by_task(); - // Copy the data - auto event = std::make_unique( - *static_cast(event_ptr.get())); - event->ToRpcTaskEventsOrDrop(expect_event); - } for (const auto &event_ptr : status_events_2) { auto expect_event = expected_data.add_events_by_task(); // Copy the data @@ -374,19 +372,26 @@ TEST_F(TaskEventBufferTest, TestBufferSizeLimit) { event->ToRpcTaskEventsOrDrop(expect_event); } - // Add the data - for (auto &event : profile_events_1) { - task_event_buffer_->AddTaskEvent(std::move(event)); - } + // Add the data profile_events_1 and status_events_1 will be evicted. 
for (auto &event : status_events_1) { - task_event_buffer_->AddTaskEvent(std::move(event)); + rpc::TaskAttempt rpc_attempt; + rpc_attempt.set_task_id(event->GetTaskAttempt().first.Binary()); + rpc_attempt.set_attempt_number(event->GetTaskAttempt().second); + *(expected_data.add_dropped_task_attempts()) = rpc_attempt; + + // Copy the data + auto event_copy = + std::make_unique(*static_cast(event.get())); + task_event_buffer_->AddTaskEvent(std::move(event_copy)); } - for (auto &event : profile_events_2) { + for (auto &event : status_events_2) { task_event_buffer_->AddTaskEvent(std::move(event)); } - for (auto &event : status_events_2) { + // Status events from the same task attempt that were dropped should be dropped + for (auto &event : status_events_1) { task_event_buffer_->AddTaskEvent(std::move(event)); } + // Expect only limit in buffer. ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), num_limit); @@ -398,30 +403,25 @@ TEST_F(TaskEventBufferTest, TestBufferSizeLimit) { EXPECT_CALL(*task_gcs_accessor, AsyncAddTaskEventData(_, _)) .WillOnce([&](std::unique_ptr actual_data, ray::gcs::StatusCallback callback) { - // Sort and compare CompareTaskEventData(*actual_data, expected_data); return Status::OK(); }); task_event_buffer_->FlushEvents(false); - // Expect data flushed. ASSERT_EQ(task_event_buffer_->GetNumTaskEventsStored(), 0); - ASSERT_EQ(task_event_buffer_->GetNumProfileTaskEventsDroppedSinceLastFlush(), 0); - ASSERT_EQ(task_event_buffer_->GetNumStatusTaskEventsDroppedSinceLastFlush(), 0); - ASSERT_EQ(task_event_buffer_->GetTotalNumProfileTaskEventsDropped(), num_profile); - ASSERT_EQ(task_event_buffer_->GetTotalNumStatusTaskEventsDropped(), num_status); } TEST_F(TaskEventBufferTestLimitProfileEvents, TestLimitProfileEventsPerTask) { - size_t num_profile_events_per_task = 10; + size_t num_profile_events_per_task = 10; // sync with class config. 
size_t num_total_profile_events = 1000; std::vector> profile_events; - auto task_id = RandomTaskId(); + auto task_id1 = RandomTaskId(); + const auto &job_id = task_event_buffer_->GetJobId(); - // Generate data for the same task attempts. + // Generate data for the same task attempts from job 1 for (size_t i = 0; i < num_total_profile_events; ++i) { - profile_events.push_back(GenProfileTaskEvent(task_id, 0)); + profile_events.push_back(GenProfileTaskEvent(task_id1, 0, job_id)); } // Add all @@ -429,11 +429,25 @@ TEST_F(TaskEventBufferTestLimitProfileEvents, TestLimitProfileEventsPerTask) { task_event_buffer_->AddTaskEvent(std::move(event)); } - // Assert dropped count + auto task_gcs_accessor = + static_cast(task_event_buffer_->GetGcsClient()) + ->mock_task_accessor; + + // With batch size = 10, there should be 10 flush calls + EXPECT_CALL(*task_gcs_accessor, AsyncAddTaskEventData) + .WillOnce([&](std::unique_ptr actual_data, + ray::gcs::StatusCallback callback) { + EXPECT_EQ(actual_data->num_profile_events_dropped(), + num_total_profile_events - num_profile_events_per_task); + EXPECT_EQ(actual_data->job_id(), job_id.Binary()); + callback(Status::OK()); + return Status::OK(); + }); + task_event_buffer_->FlushEvents(false); - ASSERT_EQ(task_event_buffer_->GetTotalNumProfileTaskEventsDropped(), - num_total_profile_events - num_profile_events_per_task); - ASSERT_EQ(task_event_buffer_->GetTotalNumStatusTaskEventsDropped(), 0); + + // Counter is reset correctly. 
+ EXPECT_EQ(task_event_buffer_->GetNumProfileTaskEventsDroppedSinceLastFlush(), 0); } } // namespace worker diff --git a/src/ray/core_worker/transport/direct_actor_task_submitter.cc b/src/ray/core_worker/transport/direct_actor_task_submitter.cc index cab04a6cebe5..0451e5c0ae1a 100644 --- a/src/ray/core_worker/transport/direct_actor_task_submitter.cc +++ b/src/ray/core_worker/transport/direct_actor_task_submitter.cc @@ -599,6 +599,19 @@ bool CoreWorkerDirectActorTaskSubmitter::PendingTasksFull(const ActorID &actor_i it->second.cur_pending_calls >= it->second.max_pending_calls; } +size_t CoreWorkerDirectActorTaskSubmitter::NumPendingTasks( + const ActorID &actor_id) const { + absl::MutexLock lock(&mu_); + auto it = client_queues_.find(actor_id); + RAY_CHECK(it != client_queues_.end()); + return it->second.cur_pending_calls; +} + +bool CoreWorkerDirectActorTaskSubmitter::CheckActorExists(const ActorID &actor_id) const { + absl::MutexLock lock(&mu_); + return client_queues_.find(actor_id) != client_queues_.end(); +} + std::string CoreWorkerDirectActorTaskSubmitter::DebugString( const ActorID &actor_id) const { absl::MutexLock lock(&mu_); diff --git a/src/ray/core_worker/transport/direct_actor_task_submitter.h b/src/ray/core_worker/transport/direct_actor_task_submitter.h index fa28fc485824..add2bd2fda91 100644 --- a/src/ray/core_worker/transport/direct_actor_task_submitter.h +++ b/src/ray/core_worker/transport/direct_actor_task_submitter.h @@ -90,6 +90,7 @@ class CoreWorkerDirectActorTaskSubmitter /// /// \param[in] actor_id The actor for whom to add a queue. /// \param[in] max_pending_calls The max pending calls for the actor to be added. + /// \param[in] execute_out_of_order Whether to execute tasks out of order. /// \param[in] fail_if_actor_unreachable Whether to fail newly submitted tasks /// immediately when the actor is unreachable. 
void AddActorQueueIfNotExists(const ActorID &actor_id, @@ -151,12 +152,31 @@ class CoreWorkerDirectActorTaskSubmitter /// \return Whether the corresponding client queue is full or not. bool PendingTasksFull(const ActorID &actor_id) const; + /// Get the number of pending tasks in the queue. + /// + /// \param[in] actor_id Actor id. + /// \return The number of pending tasks in the queue. + size_t NumPendingTasks(const ActorID &actor_id) const; + + /// Check whether the actor exists + /// + /// \param[in] actor_id Actor id. + /// + /// \return Return true if the actor exists. + bool CheckActorExists(const ActorID &actor_id) const; + /// Returns debug string for class. /// /// \param[in] actor_id The actor whose debug string to return. /// \return string. std::string DebugString(const ActorID &actor_id) const; + /// Whether the specified actor is alive. + /// + /// \param[in] actor_id The actor ID. + /// \return Whether this actor is alive. + bool IsActorAlive(const ActorID &actor_id) const; + private: /// A helper function to get task finisher without holding mu_ /// We should use this function when access @@ -280,12 +300,6 @@ class CoreWorkerDirectActorTaskSubmitter const absl::flat_hash_map> &inflight_task_callbacks) LOCKS_EXCLUDED(mu_); - /// Whether the specified actor is alive. - /// - /// \param[in] actor_id The actor ID. - /// \return Whether this actor is alive. - bool IsActorAlive(const ActorID &actor_id) const; - /// Pool for producing new core worker clients. 
rpc::CoreWorkerClientPool &core_worker_client_pool_; diff --git a/src/ray/core_worker/transport/direct_actor_transport.cc b/src/ray/core_worker/transport/direct_actor_transport.cc index 7cd62a6cc4dd..c355d5f42108 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.cc +++ b/src/ray/core_worker/transport/direct_actor_transport.cc @@ -65,7 +65,7 @@ void CoreWorkerDirectTaskReceiver::Init( } void CoreWorkerDirectTaskReceiver::HandleTask( - rpc::PushTaskRequest request, + const rpc::PushTaskRequest &request, rpc::PushTaskReply *reply, rpc::SendReplyCallback send_reply_callback) { RAY_CHECK(waiter_ != nullptr) << "Must call init() prior to use"; diff --git a/src/ray/core_worker/transport/direct_actor_transport.h b/src/ray/core_worker/transport/direct_actor_transport.h index d82e05637595..a81899f4127e 100644 --- a/src/ray/core_worker/transport/direct_actor_transport.h +++ b/src/ray/core_worker/transport/direct_actor_transport.h @@ -83,7 +83,7 @@ class CoreWorkerDirectTaskReceiver { /// \param[in] request The request message. /// \param[out] reply The reply message. /// \param[in] send_reply_callback The callback to be called when the request is done. 
- void HandleTask(rpc::PushTaskRequest request, + void HandleTask(const rpc::PushTaskRequest &request, rpc::PushTaskReply *reply, rpc::SendReplyCallback send_reply_callback); diff --git a/src/ray/gcs/gcs_client/gcs_client.cc b/src/ray/gcs/gcs_client/gcs_client.cc index 40fe55c23c18..ae342b05eec0 100644 --- a/src/ray/gcs/gcs_client/gcs_client.cc +++ b/src/ray/gcs/gcs_client/gcs_client.cc @@ -49,11 +49,14 @@ void GcsSubscriberClient::PubsubLongPolling( const rpc::ClientCallback &callback) { rpc::GcsSubscriberPollRequest req; req.set_subscriber_id(request.subscriber_id()); + req.set_max_processed_sequence_id(request.max_processed_sequence_id()); + req.set_publisher_id(request.publisher_id()); rpc_client_->GcsSubscriberPoll( req, [callback](const Status &status, const rpc::GcsSubscriberPollReply &poll_reply) { rpc::PubsubLongPollingReply reply; *reply.mutable_pub_messages() = poll_reply.pub_messages(); + *reply.mutable_publisher_id() = poll_reply.publisher_id(); callback(status, reply); }); } @@ -125,7 +128,8 @@ Status GcsClient::Connect(instrumented_io_context &io_service) { internal_kv_accessor_ = std::make_unique(this); task_accessor_ = std::make_unique(this); - RAY_LOG(DEBUG) << "GcsClient connected."; + RAY_LOG(DEBUG) << "GcsClient connected " << options_.gcs_address_ << ":" + << options_.gcs_port_; return Status::OK(); } @@ -142,10 +146,7 @@ std::pair GcsClient::GetGcsServerAddress() const { PythonGcsClient::PythonGcsClient(const GcsClientOptions &options) : options_(options) {} Status PythonGcsClient::Connect() { - grpc::ChannelArguments arguments; - arguments.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, 512 * 1024 * 1024); - arguments.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, 60 * 1000); - arguments.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 60 * 1000); + auto arguments = PythonGrpcChannelArguments(); channel_ = rpc::BuildChannel(options_.gcs_address_, options_.gcs_port_, arguments); kv_stub_ = rpc::InternalKVGcsService::NewStub(channel_); runtime_env_stub_ = 
rpc::RuntimeEnvGcsService::NewStub(channel_); @@ -395,5 +396,11 @@ Status PythonGcsClient::GetAllJobInfo(int64_t timeout_ms, return Status::RpcError(status.error_message(), status.error_code()); } +std::unordered_map PythonGetResourcesTotal( + const rpc::GcsNodeInfo &node_info) { + return std::unordered_map(node_info.resources_total().begin(), + node_info.resources_total().end()); +} + } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_client/gcs_client.h b/src/ray/gcs/gcs_client/gcs_client.h index 80e41341ad87..032e040c5035 100644 --- a/src/ray/gcs/gcs_client/gcs_client.h +++ b/src/ray/gcs/gcs_client/gcs_client.h @@ -231,6 +231,9 @@ class RAY_EXPORT PythonGcsClient { std::shared_ptr channel_; }; +std::unordered_map PythonGetResourcesTotal( + const rpc::GcsNodeInfo &node_info); + } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/gcs_client/test/gcs_client_test.cc b/src/ray/gcs/gcs_client/test/gcs_client_test.cc index 6039392ca032..d3baeeb964d0 100644 --- a/src/ray/gcs/gcs_client/test/gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/gcs_client_test.cc @@ -105,13 +105,14 @@ class GcsClientTest : public ::testing::TestWithParam { gcs_client_.reset(); server_io_service_->stop(); - rpc::DrainAndResetServerCallExecutor(); + rpc::DrainServerCallExecutor(); server_io_service_thread_->join(); gcs_server_->Stop(); gcs_server_.reset(); if (!no_redis_) { TestSetupUtil::FlushAllRedisServers(); } + rpc::ResetServerCallExecutor(); } void RestartGcsServer() { diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index 85f1f14b5240..4b3ce52e3b4b 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -341,7 +341,8 @@ void GcsActorScheduler::RetryLeasingWorkerFromNode( RAY_UNUSED(execute_after( io_context_, [this, node, actor] { DoRetryLeasingWorkerFromNode(actor, node); }, - RayConfig::instance().gcs_lease_worker_retry_interval_ms())); + 
std::chrono::milliseconds( + RayConfig::instance().gcs_lease_worker_retry_interval_ms()))); } void GcsActorScheduler::DoRetryLeasingWorkerFromNode( @@ -504,7 +505,8 @@ void GcsActorScheduler::RetryCreatingActorOnWorker( RAY_UNUSED(execute_after( io_context_, [this, actor, worker] { DoRetryCreatingActorOnWorker(actor, worker); }, - RayConfig::instance().gcs_create_actor_retry_interval_ms())); + std::chrono::milliseconds( + RayConfig::instance().gcs_create_actor_retry_interval_ms()))); } void GcsActorScheduler::DoRetryCreatingActorOnWorker( diff --git a/src/ray/gcs/gcs_server/gcs_function_manager.h b/src/ray/gcs/gcs_server/gcs_function_manager.h index 705958cf4a50..530b2dbb8ad9 100644 --- a/src/ray/gcs/gcs_server/gcs_function_manager.h +++ b/src/ray/gcs/gcs_server/gcs_function_manager.h @@ -14,6 +14,7 @@ #pragma once #include "absl/container/flat_hash_map.h" +#include "ray/common/constants.h" #include "ray/gcs/gcs_server/gcs_kv_manager.h" namespace ray { @@ -48,7 +49,10 @@ class GcsFunctionManager { kv_.Del("fun", "IsolatedExports:" + job_id_hex + ":", true, nullptr); kv_.Del("fun", "RemoteFunction:" + job_id_hex + ":", true, nullptr); kv_.Del("fun", "ActorClass:" + job_id_hex + ":", true, nullptr); - kv_.Del("fun", "FunctionsToRun:" + job_id_hex + ":", true, nullptr); + kv_.Del("fun", + std::string(kWorkerSetupHookKeyName) + ":" + job_id_hex + ":", + true, + nullptr); } // Handler for internal KV diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.h b/src/ray/gcs/gcs_server/gcs_node_manager.h index b1b9526599c8..d76e94fbd8ea 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/ray/gcs/gcs_server/gcs_node_manager.h @@ -17,6 +17,7 @@ #include #include +#include #include #include "absl/container/flat_hash_map.h" @@ -173,7 +174,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// A map of NodeId <-> ip:port of raylet using NodeIDAddrBiMap = boost::bimap>, - boost::bimaps::unordered_set_of>; + boost::bimaps::unordered_multiset_of>; NodeIDAddrBiMap 
node_map_; friend GcsMonitorServerTest; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index 2851fe41f494..fb19bea10756 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -756,11 +756,13 @@ void GcsPlacementGroupManager::OnNodeDead(const NodeID &node_id) { iter->second->GetMutableStats()->set_scheduling_state( rpc::PlacementGroupStats::QUEUED); AddToPendingQueue(iter->second, 0); + RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + iter->second->GetPlacementGroupID(), + iter->second->GetPlacementGroupTableData(), + [this](Status status) { SchedulePendingPlacementGroups(); })); } } } - - SchedulePendingPlacementGroups(); } void GcsPlacementGroupManager::OnNodeAdd(const NodeID &node_id) { @@ -826,7 +828,9 @@ void GcsPlacementGroupManager::Tick() { // added as a safety check. https://github.com/ray-project/ray/pull/18419 SchedulePendingPlacementGroups(); execute_after( - io_context_, [this] { Tick(); }, 1000 /* milliseconds */); + io_context_, + [this] { Tick(); }, + std::chrono::milliseconds(1000) /* milliseconds */); } void GcsPlacementGroupManager::UpdatePlacementGroupLoad() { @@ -966,7 +970,10 @@ bool GcsPlacementGroupManager::RescheduleIfStillHasUnplacedBundles( << placement_group->GetPlacementGroupID(); placement_group->UpdateState(rpc::PlacementGroupTableData::RESCHEDULING); AddToPendingQueue(placement_group, 0); - SchedulePendingPlacementGroups(); + RAY_CHECK_OK(gcs_table_storage_->PlacementGroupTable().Put( + placement_group->GetPlacementGroupID(), + placement_group->GetPlacementGroupTableData(), + [this](Status status) { SchedulePendingPlacementGroups(); })); return true; } } diff --git a/src/ray/gcs/gcs_server/gcs_resource_report_poller.cc b/src/ray/gcs/gcs_server/gcs_resource_report_poller.cc index 9fa2a3efe14d..a1a7fcbd4a48 100644 --- a/src/ray/gcs/gcs_server/gcs_resource_report_poller.cc +++ 
b/src/ray/gcs/gcs_server/gcs_resource_report_poller.cc @@ -39,7 +39,9 @@ GcsResourceReportPoller::~GcsResourceReportPoller() { Stop(); } void GcsResourceReportPoller::Initialize(const GcsInitData &gcs_init_data) { for (const auto &pair : gcs_init_data.Nodes()) { - HandleNodeAdded(pair.second); + if (pair.second.state() == rpc::GcsNodeInfo::ALIVE) { + HandleNodeAdded(pair.second); + } } } diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server/gcs_server.cc index 55b4bb61f301..1c5b3c40df8b 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server/gcs_server.cc @@ -99,7 +99,8 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, /*periodical_runner=*/&pubsub_periodical_runner_, /*get_time_ms=*/[]() { return absl::GetCurrentTimeNanos() / 1e6; }, /*subscriber_timeout_ms=*/RayConfig::instance().subscriber_timeout_ms(), - /*publish_batch_size_=*/RayConfig::instance().publish_batch_size()); + /*publish_batch_size_=*/RayConfig::instance().publish_batch_size(), + /*publisher_id=*/NodeID::FromRandom()); gcs_publisher_ = std::make_shared(std::move(inner_publisher)); } @@ -325,6 +326,7 @@ void GcsServer::InitGcsResourceManager(const GcsInitData &gcs_init_data) { void GcsServer::InitClusterResourceScheduler() { cluster_resource_scheduler_ = std::make_shared( + main_service_, scheduling::NodeID(kGCSNodeID.Binary()), NodeResources(), /*is_node_available_fn=*/ @@ -556,7 +558,7 @@ void GcsServer::InitRuntimeEnvManager() { main_service_, *runtime_env_manager_, /*delay_executor=*/ [this](std::function task, uint32_t delay_ms) { - return execute_after(main_service_, task, delay_ms); + return execute_after(main_service_, task, std::chrono::milliseconds(delay_ms)); }); runtime_env_service_ = std::make_unique(main_service_, *runtime_env_handler_); @@ -700,7 +702,8 @@ void GcsServer::RecordMetrics() const { execute_after( main_service_, [this] { RecordMetrics(); }, - (RayConfig::instance().metrics_report_interval_ms() / 2) /* milliseconds 
*/); + std::chrono::milliseconds(RayConfig::instance().metrics_report_interval_ms() / + 2) /* milliseconds */); } void GcsServer::DumpDebugStateToFile() const { diff --git a/src/ray/gcs/gcs_server/gcs_server_main.cc b/src/ray/gcs/gcs_server/gcs_server_main.cc index 682830597bc3..151a56efdc64 100644 --- a/src/ray/gcs/gcs_server/gcs_server_main.cc +++ b/src/ray/gcs/gcs_server/gcs_server_main.cc @@ -107,7 +107,7 @@ int main(int argc, char *argv[]) { int signal_number) { RAY_LOG(INFO) << "GCS server received SIGTERM, shutting down..."; main_service.stop(); - ray::rpc::DrainAndResetServerCallExecutor(); + ray::rpc::DrainServerCallExecutor(); gcs_server.Stop(); ray::stats::Shutdown(); }; diff --git a/src/ray/gcs/gcs_server/gcs_task_manager.cc b/src/ray/gcs/gcs_server/gcs_task_manager.cc index 6771e042bb24..12628c82a20e 100644 --- a/src/ray/gcs/gcs_server/gcs_task_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_task_manager.cc @@ -313,16 +313,17 @@ void GcsTaskManager::HandleGetTaskEvents(rpc::GetTaskEventsRequest request, rpc::SendReplyCallback send_reply_callback) { RAY_LOG(DEBUG) << "Getting task status:" << request.ShortDebugString(); - // Select candidate events by indexing. + // Select candidate events by indexing if possible. 
std::vector task_events; - if (request.has_task_ids()) { + const auto &filters = request.filters(); + if (filters.task_ids_size() > 0) { absl::flat_hash_set task_ids; - for (const auto &task_id_str : request.task_ids().vals()) { + for (const auto &task_id_str : filters.task_ids()) { task_ids.insert(TaskID::FromBinary(task_id_str)); } task_events = task_event_storage_->GetTaskEvents(task_ids); - } else if (request.has_job_id()) { - task_events = task_event_storage_->GetTaskEvents(JobID::FromBinary(request.job_id())); + } else if (filters.has_job_id()) { + task_events = task_event_storage_->GetTaskEvents(JobID::FromBinary(filters.job_id())); } else { task_events = task_event_storage_->GetTaskEvents(); } @@ -334,15 +335,34 @@ void GcsTaskManager::HandleGetTaskEvents(rpc::GetTaskEventsRequest request, int32_t num_profile_event_limit = 0; int32_t num_status_event_limit = 0; - for (auto itr = task_events.rbegin(); itr != task_events.rend(); ++itr) { - auto &task_event = *itr; + // A lambda filter fn, where it returns true for task events to be included in the + // result. Task ids and job ids are already filtered by the storage with indexing above. + auto filter_fn = [&filters](const rpc::TaskEvents &task_event) { if (!task_event.has_task_info()) { // Skip task events w/o task info. 
- continue; + return false; } - - if (request.exclude_driver() && + if (filters.exclude_driver() && task_event.task_info().type() == rpc::TaskType::DRIVER_TASK) { + return false; + } + + if (filters.has_actor_id() && task_event.task_info().has_actor_id() && + ActorID::FromBinary(task_event.task_info().actor_id()) != + ActorID::FromBinary(filters.actor_id())) { + return false; + } + + if (filters.has_name() && task_event.task_info().name() != filters.name()) { + return false; + } + + return true; + }; + + for (auto itr = task_events.rbegin(); itr != task_events.rend(); ++itr) { + auto &task_event = *itr; + if (!filter_fn(task_event)) { continue; } @@ -367,15 +387,25 @@ void GcsTaskManager::HandleGetTaskEvents(rpc::GetTaskEventsRequest request, return; } +void GcsTaskManager::RecordDataLossFromWorker(const rpc::TaskEventData &data) { + // TODO(rickyx): GCS side GC will be changed in another PR. This is a temporary + // routine for supporting legacy behaviour with worker side changes. + if (data.dropped_task_attempts_size() > 0) { + stats_counter_.Increment(kTotalNumStatusTaskEventsDropped, + data.dropped_task_attempts_size()); + } + + if (data.num_profile_events_dropped() > 0) { + stats_counter_.Increment(kTotalNumProfileTaskEventsDropped, + data.num_profile_events_dropped()); + } +} + void GcsTaskManager::HandleAddTaskEventData(rpc::AddTaskEventDataRequest request, rpc::AddTaskEventDataReply *reply, rpc::SendReplyCallback send_reply_callback) { auto data = std::move(request.data()); - // Update counters. 
- stats_counter_.Increment(kTotalNumProfileTaskEventsDropped, - data.num_profile_task_events_dropped()); - stats_counter_.Increment(kTotalNumStatusTaskEventsDropped, - data.num_status_task_events_dropped()); + RecordDataLossFromWorker(data); for (auto events_by_task : *data.mutable_events_by_task()) { stats_counter_.Increment(kTotalNumTaskEventsReported); diff --git a/src/ray/gcs/gcs_server/gcs_task_manager.h b/src/ray/gcs/gcs_server/gcs_task_manager.h index c1d1b204a2d1..41c20bcbecc6 100644 --- a/src/ray/gcs/gcs_server/gcs_task_manager.h +++ b/src/ray/gcs/gcs_server/gcs_task_manager.h @@ -277,6 +277,13 @@ class GcsTaskManager : public rpc::TaskInfoHandler { }; private: + /// Record data loss from worker. + /// + /// TODO(rickyx): This will be updated to record task attempt loss properly. + /// + /// \param data The task event data. + void RecordDataLossFromWorker(const rpc::TaskEventData &data); + /// Test only size_t GetTotalNumStatusTaskEventsDropped() { return stats_counter_.Get(kTotalNumStatusTaskEventsDropped); diff --git a/src/ray/gcs/gcs_server/grpc_based_resource_broadcaster.cc b/src/ray/gcs/gcs_server/grpc_based_resource_broadcaster.cc index 5dc8bf3cefcd..e62c1a0825c5 100644 --- a/src/ray/gcs/gcs_server/grpc_based_resource_broadcaster.cc +++ b/src/ray/gcs/gcs_server/grpc_based_resource_broadcaster.cc @@ -36,7 +36,9 @@ GrpcBasedResourceBroadcaster::~GrpcBasedResourceBroadcaster() {} void GrpcBasedResourceBroadcaster::Initialize(const GcsInitData &gcs_init_data) { for (const auto &pair : gcs_init_data.Nodes()) { - HandleNodeAdded(pair.second); + if (pair.second.state() == rpc::GcsNodeInfo::ALIVE) { + HandleNodeAdded(pair.second); + } } } diff --git a/src/ray/gcs/gcs_server/pubsub_handler.cc b/src/ray/gcs/gcs_server/pubsub_handler.cc index a089b8ca765a..cf34b4f1e8a6 100644 --- a/src/ray/gcs/gcs_server/pubsub_handler.cc +++ b/src/ray/gcs/gcs_server/pubsub_handler.cc @@ -40,6 +40,7 @@ void InternalPubSubHandler::HandleGcsPublish(rpc::GcsPublishRequest request, 
nullptr); return; } + RAY_LOG(DEBUG) << "received publish request: " << request.DebugString(); for (const auto &msg : request.pub_messages()) { gcs_publisher_->GetPublisher()->Publish(msg); } @@ -63,6 +64,8 @@ void InternalPubSubHandler::HandleGcsSubscriberPoll( } rpc::PubsubLongPollingRequest pubsub_req; pubsub_req.set_subscriber_id(request.subscriber_id()); + pubsub_req.set_publisher_id(request.publisher_id()); + pubsub_req.set_max_processed_sequence_id(request.max_processed_sequence_id()); auto pubsub_reply = std::make_shared(); auto pubsub_reply_ptr = pubsub_reply.get(); gcs_publisher_->GetPublisher()->ConnectToSubscriber( @@ -74,6 +77,7 @@ void InternalPubSubHandler::HandleGcsSubscriberPoll( std::function success_cb, std::function failure_cb) { reply->mutable_pub_messages()->Swap(pubsub_reply->mutable_pub_messages()); + reply->set_publisher_id(std::move(*pubsub_reply->mutable_publisher_id())); reply_cb(std::move(status), std::move(success_cb), std::move(failure_cb)); }); } diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc index 825ac8e6cbaa..c58311f86927 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc @@ -45,6 +45,7 @@ class GcsActorSchedulerMockTest : public Test { [this](const rpc::Address &) { return raylet_client; }); local_node_id = NodeID::FromRandom(); auto cluster_resource_scheduler = std::make_shared( + io_context, scheduling::NodeID(local_node_id.Binary()), NodeResources(), /*is_node_available_fn=*/ diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index f29467fceead..681d73bae010 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -44,6 +44,7 @@ class GcsActorSchedulerTest : public ::testing::Test { 
std::make_shared(store_client_); local_node_id_ = NodeID::FromRandom(); auto cluster_resource_scheduler = std::make_shared( + io_service_, scheduling::NodeID(local_node_id_.Binary()), NodeResources(), /*is_node_available_fn=*/ diff --git a/src/ray/gcs/gcs_server/test/gcs_monitor_server_test.cc b/src/ray/gcs/gcs_server/test/gcs_monitor_server_test.cc index d7d743cb602f..eef847dcc667 100644 --- a/src/ray/gcs/gcs_server/test/gcs_monitor_server_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_monitor_server_test.cc @@ -73,7 +73,7 @@ class GcsMonitorServerTest : public ::testing::Test { public: GcsMonitorServerTest() : mock_node_manager_(std::make_shared()), - cluster_resource_manager_(), + cluster_resource_manager_(io_context_), mock_resource_manager_( std::make_shared(cluster_resource_manager_)), mock_placement_group_manager_( diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc index 9a5f2417f045..a737124769d9 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc @@ -33,6 +33,8 @@ namespace gcs { class GcsPlacementGroupManagerMockTest : public Test { public: + GcsPlacementGroupManagerMockTest() : cluster_resource_manager_(io_context_) {} + void SetUp() override { store_client_ = std::make_shared(); gcs_table_storage_ = std::make_shared(store_client_); @@ -50,6 +52,7 @@ class GcsPlacementGroupManagerMockTest : public Test { counter_.reset(new CounterMap()); } + instrumented_io_context io_context_; std::unique_ptr gcs_placement_group_manager_; std::shared_ptr gcs_placement_group_scheduler_; std::shared_ptr gcs_table_storage_; @@ -57,7 +60,6 @@ class GcsPlacementGroupManagerMockTest : public Test { ClusterResourceManager cluster_resource_manager_; std::shared_ptr resource_manager_; std::shared_ptr> counter_; - instrumented_io_context io_context_; }; 
TEST_F(GcsPlacementGroupManagerMockTest, PendingQueuePriorityReschedule) { diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc index df9e74a632b6..e0cdced97ae6 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc @@ -81,7 +81,8 @@ class MockPlacementGroupScheduler : public gcs::GcsPlacementGroupSchedulerInterf class GcsPlacementGroupManagerTest : public ::testing::Test { public: GcsPlacementGroupManagerTest() - : mock_placement_group_scheduler_(new MockPlacementGroupScheduler()) { + : mock_placement_group_scheduler_(new MockPlacementGroupScheduler()), + cluster_resource_manager_(io_service_) { gcs_publisher_ = std::make_shared(std::make_unique()); gcs_table_storage_ = std::make_shared(io_service_); @@ -461,6 +462,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestReschedulingRetry) { placement_group->GetPlacementGroupID(); mock_placement_group_scheduler_->bundles_on_dead_node_.push_back(0); gcs_placement_group_manager_->OnNodeDead(NodeID::FromRandom()); + WaitUntilIoServiceDone(); const auto &bundles = mock_placement_group_scheduler_->placement_groups_[0]->GetBundles(); EXPECT_TRUE(NodeID::FromBinary(bundles[0]->GetMessage().node_id()).IsNil()); @@ -502,6 +504,7 @@ TEST_F(GcsPlacementGroupManagerTest, TestRescheduleWhenNodeDead) { placement_group->GetPlacementGroupID(); mock_placement_group_scheduler_->bundles_on_dead_node_.push_back(0); gcs_placement_group_manager_->OnNodeDead(NodeID::FromRandom()); + WaitUntilIoServiceDone(); ASSERT_EQ(mock_placement_group_scheduler_->placement_groups_[0]->GetPlacementGroupID(), placement_group->GetPlacementGroupID()); const auto &bundles = diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc index 1bdabd46eda0..fe639cd40d37 100644 --- 
a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc @@ -49,6 +49,7 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { std::make_unique()); auto local_node_id = NodeID::FromRandom(); cluster_resource_scheduler_ = std::make_shared( + io_service_, scheduling::NodeID(local_node_id.Binary()), NodeResources(), /*is_node_available_fn=*/ diff --git a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc index 39fd9e9a6c26..553a7270f351 100644 --- a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc @@ -27,7 +27,7 @@ using ::testing::_; class GcsResourceManagerTest : public ::testing::Test { public: - GcsResourceManagerTest() { + GcsResourceManagerTest() : cluster_resource_manager_(io_service_) { gcs_resource_manager_ = std::make_shared( io_service_, cluster_resource_manager_, NodeID::FromRandom()); } diff --git a/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc b/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc index cabad9872701..cf5078762e1f 100644 --- a/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_server_rpc_test.cc @@ -59,11 +59,12 @@ class GcsServerTest : public ::testing::Test { void TearDown() override { io_service_.stop(); - rpc::DrainAndResetServerCallExecutor(); + rpc::DrainServerCallExecutor(); gcs_server_->Stop(); thread_io_service_->join(); gcs_server_.reset(); ray::gcs::RedisCallbackManager::instance().Clear(); + rpc::ResetServerCallExecutor(); } bool AddJob(const rpc::AddJobRequest &request) { diff --git a/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc index 91070fe1cf35..d60ea97f100f 100644 --- a/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_task_manager_test.cc @@ -115,26 
+115,36 @@ class GcsTaskManagerTest : public ::testing::Test { rpc::GetTaskEventsReply SyncGetTaskEvents(absl::flat_hash_set task_ids, absl::optional job_id = absl::nullopt, int64_t limit = -1, - bool exclude_driver = true) { + bool exclude_driver = true, + const std::string &name = "", + const ActorID &actor_id = ActorID::Nil()) { rpc::GetTaskEventsRequest request; rpc::GetTaskEventsReply reply; std::promise promise; if (!task_ids.empty()) { for (const auto &task_id : task_ids) { - request.mutable_task_ids()->add_vals(task_id.Binary()); + request.mutable_filters()->add_task_ids(task_id.Binary()); } } + if (!name.empty()) { + request.mutable_filters()->set_name(name); + } + + if (!actor_id.IsNil()) { + request.mutable_filters()->set_actor_id(actor_id.Binary()); + } + if (job_id) { - request.set_job_id(job_id->Binary()); + request.mutable_filters()->set_job_id(job_id->Binary()); } if (limit >= 0) { request.set_limit(limit); } - request.set_exclude_driver(exclude_driver); + request.mutable_filters()->set_exclude_driver(exclude_driver); task_manager->GetIoContext().dispatch( [this, &promise, &request, &reply]() { task_manager->HandleGetTaskEvents( @@ -155,11 +165,15 @@ class GcsTaskManagerTest : public ::testing::Test { static rpc::TaskInfoEntry GenTaskInfo( JobID job_id, TaskID parent_task_id = TaskID::Nil(), - rpc::TaskType task_type = rpc::TaskType::NORMAL_TASK) { + rpc::TaskType task_type = rpc::TaskType::NORMAL_TASK, + const ActorID actor_id = ActorID::Nil(), + const std::string name = "") { rpc::TaskInfoEntry task_info; task_info.set_job_id(job_id.Binary()); task_info.set_parent_task_id(parent_task_id.Binary()); task_info.set_type(task_type); + task_info.set_actor_id(actor_id.Binary()); + task_info.set_name(name); return task_info; } @@ -490,6 +504,66 @@ TEST_F(GcsTaskManagerTest, TestGetTaskEventsByJob) { reply_job2.mutable_events_by_task()); } +TEST_F(GcsTaskManagerTest, TestGetTaskEventsFilters) { + // Generate task events + + // A task event with actor id + 
ActorID actor_id = ActorID::Of(JobID::FromInt(1), TaskID::Nil(), 1); + { + auto task_ids = GenTaskIDs(1); + auto task_info_actor_id = + GenTaskInfo(JobID::FromInt(1), TaskID::Nil(), rpc::ACTOR_TASK, actor_id); + auto events = GenTaskEvents(task_ids, + /* attempt_number */ + 0, + /* job_id */ 1, + absl::nullopt, + absl::nullopt, + task_info_actor_id); + auto data = Mocker::GenTaskEventsData(events); + SyncAddTaskEventData(data); + } + + // A task event with name. + { + auto task_ids = GenTaskIDs(1); + auto task_info_name = GenTaskInfo( + JobID::FromInt(1), TaskID::Nil(), rpc::NORMAL_TASK, ActorID::Nil(), "task_name"); + auto events = GenTaskEvents(task_ids, + /* attempt_number */ + 0, + /* job_id */ 1, + absl::nullopt, + absl::nullopt, + task_info_name); + auto data = Mocker::GenTaskEventsData(events); + SyncAddTaskEventData(data); + } + + auto reply_name = SyncGetTaskEvents({}, + /* job_id */ absl::nullopt, + /* limit */ -1, + /* exclude_driver */ false, + "task_name"); + EXPECT_EQ(reply_name.events_by_task_size(), 1); + + auto reply_actor_id = SyncGetTaskEvents({}, + /* job_id */ absl::nullopt, + /* limit */ -1, + /* exclude_driver */ false, + /* name */ "", + actor_id); + EXPECT_EQ(reply_name.events_by_task_size(), 1); + + auto reply_both_and = SyncGetTaskEvents({}, + /* job_id */ absl::nullopt, + /* limit */ -1, + /* exclude_driver */ false, + "task_name", + actor_id); + EXPECT_EQ(reply_both_and.events_by_task_size(), 0); +} + TEST_F(GcsTaskManagerTest, TestMarkTaskAttemptFailedIfNeeded) { auto tasks = GenTaskIDs(3); auto tasks_running = tasks[0]; diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.cc b/src/ray/gcs/pubsub/gcs_pub_sub.cc index c7ac4294dc8e..b03a9157da46 100644 --- a/src/ray/gcs/pubsub/gcs_pub_sub.cc +++ b/src/ray/gcs/pubsub/gcs_pub_sub.cc @@ -15,6 +15,7 @@ #include "ray/gcs/pubsub/gcs_pub_sub.h" #include "absl/strings/str_cat.h" +#include "ray/rpc/grpc_client.h" namespace ray { namespace gcs { @@ -26,7 +27,7 @@ Status GcsPublisher::PublishActor(const 
ActorID &id, msg.set_channel_type(rpc::ChannelType::GCS_ACTOR_CHANNEL); msg.set_key_id(id.Binary()); *msg.mutable_actor_message() = message; - publisher_->Publish(msg); + publisher_->Publish(std::move(msg)); if (done != nullptr) { done(Status::OK()); } @@ -40,7 +41,7 @@ Status GcsPublisher::PublishJob(const JobID &id, msg.set_channel_type(rpc::ChannelType::GCS_JOB_CHANNEL); msg.set_key_id(id.Binary()); *msg.mutable_job_message() = message; - publisher_->Publish(msg); + publisher_->Publish(std::move(msg)); if (done != nullptr) { done(Status::OK()); } @@ -54,7 +55,7 @@ Status GcsPublisher::PublishNodeInfo(const NodeID &id, msg.set_channel_type(rpc::ChannelType::GCS_NODE_INFO_CHANNEL); msg.set_key_id(id.Binary()); *msg.mutable_node_info_message() = message; - publisher_->Publish(msg); + publisher_->Publish(std::move(msg)); if (done != nullptr) { done(Status::OK()); } @@ -68,7 +69,7 @@ Status GcsPublisher::PublishWorkerFailure(const WorkerID &id, msg.set_channel_type(rpc::ChannelType::GCS_WORKER_DELTA_CHANNEL); msg.set_key_id(id.Binary()); *msg.mutable_worker_delta_message() = message; - publisher_->Publish(msg); + publisher_->Publish(std::move(msg)); if (done != nullptr) { done(Status::OK()); } @@ -82,7 +83,7 @@ Status GcsPublisher::PublishError(const std::string &id, msg.set_channel_type(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); msg.set_key_id(id); *msg.mutable_error_info_message() = message; - publisher_->Publish(msg); + publisher_->Publish(std::move(msg)); if (done != nullptr) { done(Status::OK()); } @@ -212,5 +213,91 @@ Status GcsSubscriber::SubscribeAllWorkerFailures( return Status::OK(); } +grpc::ChannelArguments PythonGrpcChannelArguments() { + grpc::ChannelArguments arguments; + arguments.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, 512 * 1024 * 1024); + arguments.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, 60 * 1000); + arguments.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 60 * 1000); + return arguments; +} + +PythonGcsPublisher::PythonGcsPublisher(const std::string &gcs_address) { 
+ std::vector address = absl::StrSplit(gcs_address, ':'); + RAY_LOG(DEBUG) << "Connect to gcs server via address: " << gcs_address; + RAY_CHECK(address.size() == 2); + gcs_address_ = address[0]; + gcs_port_ = std::stoi(address[1]); +} + +Status PythonGcsPublisher::Connect() { + auto arguments = PythonGrpcChannelArguments(); + channel_ = rpc::BuildChannel(gcs_address_, gcs_port_, arguments); + pubsub_stub_ = rpc::InternalPubSubGcsService::NewStub(channel_); + return Status::OK(); +} + +constexpr int MAX_GCS_PUBLISH_RETRIES = 60; + +Status PythonGcsPublisher::DoPublishWithRetries(const rpc::GcsPublishRequest &request, + int64_t num_retries, + int64_t timeout_ms) { + int count = num_retries == -1 ? MAX_GCS_PUBLISH_RETRIES : num_retries; + rpc::GcsPublishReply reply; + grpc::Status status; + while (count > 0) { + grpc::ClientContext context; + if (timeout_ms != -1) { + context.set_deadline(std::chrono::system_clock::now() + + std::chrono::milliseconds(timeout_ms)); + } + status = pubsub_stub_->GcsPublish(&context, request, &reply); + if (status.error_code() == grpc::StatusCode::OK) { + if (reply.status().code() != static_cast(StatusCode::OK)) { + return Status::Invalid(reply.status().message()); + } + return Status::OK(); + } else if (status.error_code() == grpc::StatusCode::UNAVAILABLE || + status.error_code() == grpc::StatusCode::UNKNOWN) { + // This is the case in which we will retry + count -= 1; + std::this_thread::sleep_for(std::chrono::seconds(1)); + continue; + } else { + return Status::Invalid(status.error_message()); + } + } + return Status::TimedOut("Failed to publish after retries: " + status.error_message()); +} + +Status PythonGcsPublisher::PublishError(const std::string &key_id, + const rpc::ErrorTableData &error_info, + int64_t num_retries) { + rpc::GcsPublishRequest request; + auto *message = request.add_pub_messages(); + message->set_channel_type(rpc::RAY_ERROR_INFO_CHANNEL); + message->set_key_id(key_id); + 
message->mutable_error_info_message()->MergeFrom(error_info); + return DoPublishWithRetries(request, num_retries, 1000); +} + +Status PythonGcsPublisher::PublishLogs(const std::string &key_id, + const rpc::LogBatch &log_batch) { + rpc::GcsPublishRequest request; + auto *message = request.add_pub_messages(); + message->set_channel_type(rpc::RAY_LOG_CHANNEL); + message->set_key_id(key_id); + message->mutable_log_batch_message()->MergeFrom(log_batch); + return DoPublishWithRetries(request, -1, -1); +} + +Status PythonGcsPublisher::PublishFunctionKey( + const rpc::PythonFunction &python_function) { + rpc::GcsPublishRequest request; + auto *message = request.add_pub_messages(); + message->set_channel_type(rpc::RAY_PYTHON_FUNCTION_CHANNEL); + message->mutable_python_function_message()->MergeFrom(python_function); + return DoPublishWithRetries(request, -1, -1); +} + } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/pubsub/gcs_pub_sub.h b/src/ray/gcs/pubsub/gcs_pub_sub.h index ffd79a6adfab..db621938dc98 100644 --- a/src/ray/gcs/pubsub/gcs_pub_sub.h +++ b/src/ray/gcs/pubsub/gcs_pub_sub.h @@ -25,6 +25,7 @@ #include "ray/pubsub/publisher.h" #include "ray/pubsub/subscriber.h" #include "src/ray/protobuf/gcs.pb.h" +#include "src/ray/protobuf/gcs_service.grpc.pb.h" #include "src/ray/protobuf/gcs_service.pb.h" namespace ray { @@ -132,5 +133,41 @@ class GcsSubscriber { const std::unique_ptr subscriber_; }; +// This client is only supposed to be used from Cython / Python +class RAY_EXPORT PythonGcsPublisher { + public: + explicit PythonGcsPublisher(const std::string &gcs_address); + + /// Connect to the publisher service of the GCS. + /// This function must be called before calling other functions. + /// + /// \return Status + Status Connect(); + + /// Publish error information to GCS. + Status PublishError(const std::string &key_id, + const rpc::ErrorTableData &data, + int64_t num_retries); + + /// Publish logs to GCS. 
+ Status PublishLogs(const std::string &key_id, const rpc::LogBatch &log_batch); + + /// Publish a function key to GCS. + Status PublishFunctionKey(const rpc::PythonFunction &python_function); + + private: + Status DoPublishWithRetries(const rpc::GcsPublishRequest &request, + int64_t num_retries, + int64_t timeout_ms); + std::unique_ptr pubsub_stub_; + std::shared_ptr channel_; + std::string gcs_address_; + int gcs_port_; +}; + +/// Construct the arguments for synchronous gRPC clients +/// (the ones wrapped in Python) +grpc::ChannelArguments PythonGrpcChannelArguments(); + } // namespace gcs } // namespace ray diff --git a/src/ray/gcs/test/gcs_test_util.h b/src/ray/gcs/test/gcs_test_util.h index a0746add894c..fdef576c32e3 100644 --- a/src/ray/gcs/test/gcs_test_util.h +++ b/src/ray/gcs/test/gcs_test_util.h @@ -266,9 +266,16 @@ struct Mocker { auto new_events = data.add_events_by_task(); new_events->CopyFrom(events); } - data.set_num_profile_task_events_dropped(num_profile_task_events_dropped); - data.set_num_status_task_events_dropped(num_status_task_events_dropped); + for (int i = 0; i < num_status_task_events_dropped; ++i) { + rpc::TaskAttempt rpc_task_attempt; + rpc_task_attempt.set_task_id(RandomTaskId().Binary()); + rpc_task_attempt.set_attempt_number(0); + *(data.add_dropped_task_attempts()) = rpc_task_attempt; + } + + data.set_num_profile_events_dropped(num_profile_task_events_dropped); + data.set_job_id(JobID::FromInt(0).Binary()); return data; } }; diff --git a/src/ray/internal/internal.cc b/src/ray/internal/internal.cc deleted file mode 100644 index 7821c2b1ff86..000000000000 --- a/src/ray/internal/internal.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2020 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ray/internal/internal.h" - -#include "ray/core_worker/core_worker.h" - -namespace ray { -namespace internal { - -using ray::core::CoreWorkerProcess; -using ray::core::TaskOptions; - -std::vector SendInternal(const ActorID &peer_actor_id, - std::shared_ptr buffer, - RayFunction &function, - int return_num) { - std::unordered_map resources; - std::string name = function.GetFunctionDescriptor()->DefaultTaskName(); - TaskOptions options{name, return_num, resources}; - - char meta_data[3] = {'R', 'A', 'W'}; - std::shared_ptr meta = - std::make_shared((uint8_t *)meta_data, 3, true); - - std::vector> args; - if (function.GetLanguage() == Language::PYTHON) { - auto dummy = "__RAY_DUMMY__"; - std::shared_ptr dummyBuffer = - std::make_shared((uint8_t *)dummy, 13, true); - args.emplace_back(new TaskArgByValue(std::make_shared( - std::move(dummyBuffer), meta, std::vector(), true))); - } - args.emplace_back(new TaskArgByValue(std::make_shared( - std::move(buffer), meta, std::vector(), true))); - - std::vector> results; - auto result = CoreWorkerProcess::GetCoreWorker().SubmitActorTask( - peer_actor_id, function, args, options); - if (!result.has_value()) { - RAY_CHECK(false) << "Back pressure should not be enabled."; - } - return result.value(); -} - -const ray::stats::TagKeyType TagRegister(const std::string tag_name) { - return ray::stats::TagKeyType::Register(tag_name); -} - -const ActorID &GetCurrentActorID() { - return CoreWorkerProcess::GetCoreWorker().GetWorkerContext().GetCurrentActorID(); -} - -bool IsInitialized() { return 
CoreWorkerProcess::IsInitialized(); } - -} // namespace internal -} // namespace ray diff --git a/src/ray/internal/internal.h b/src/ray/internal/internal.h deleted file mode 100644 index 0eb58062c03c..000000000000 --- a/src/ray/internal/internal.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2020 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "ray/common/buffer.h" -#include "ray/common/id.h" -#include "ray/core_worker/common.h" -#include "ray/stats/metric.h" - -// This header is used to warp some internal code so we can reduce suspicious -// symbols export. -namespace ray { -namespace internal { - -using ray::core::RayFunction; - -/// Send buffer internal -/// \param[in] buffer buffer to be sent. -/// \param[in] function the function descriptor of peer's function. -/// \param[in] return_num return value number of the call. -/// \param[out] return_ids return ids from SubmitActorTask. -std::vector SendInternal(const ActorID &peer_actor_id, - std::shared_ptr buffer, - RayFunction &function, - int return_num); - -const stats::TagKeyType TagRegister(const std::string tag_name); - -/// Get current actor id via internal. -const ActorID &GetCurrentActorID(); - -/// Get core worker initialization flag via internal. 
-bool IsInitialized(); -} // namespace internal -} // namespace ray diff --git a/src/ray/object_manager/plasma/store.cc b/src/ray/object_manager/plasma/store.cc index a4a8ffa2ea9f..a616a652f684 100644 --- a/src/ray/object_manager/plasma/store.cc +++ b/src/ray/object_manager/plasma/store.cc @@ -516,7 +516,7 @@ void PlasmaStore::ProcessCreateRequests() { create_timer_ = nullptr; ProcessCreateRequests(); }, - retry_after_ms); + std::chrono::milliseconds(retry_after_ms)); } } @@ -555,7 +555,7 @@ void PlasmaStore::PrintAndRecordDebugDump() const { stats_timer_ = execute_after( io_context_, [this]() { PrintAndRecordDebugDump(); }, - RayConfig::instance().event_stats_print_interval_ms()); + std::chrono::milliseconds(RayConfig::instance().event_stats_print_interval_ms())); } void PlasmaStore::ScheduleRecordMetrics() const { @@ -567,7 +567,7 @@ void PlasmaStore::ScheduleRecordMetrics() const { [this]() { ScheduleRecordMetrics(); }, // divide by 2 to make sure record happens before reporting // this also matches with NodeManager::RecordMetrics interval - RayConfig::instance().metrics_report_interval_ms() / 2); + std::chrono::milliseconds(RayConfig::instance().metrics_report_interval_ms() / 2)); } std::string PlasmaStore::GetDebugDump() const { diff --git a/src/ray/protobuf/BUILD b/src/ray/protobuf/BUILD index 22054c994261..0bd29efab106 100644 --- a/src/ray/protobuf/BUILD +++ b/src/ray/protobuf/BUILD @@ -46,6 +46,16 @@ proto_library( ], ) +proto_library( + name = "instance_manager_proto", + srcs = ["experimental/instance_manager.proto"], +) + +python_grpc_compile( + name = "instance_manager_py_proto", + deps = [":instance_manager_proto"], +) + proto_library( name = "runtime_env_common_proto", srcs = ["runtime_env_common.proto"], @@ -347,3 +357,21 @@ cc_proto_library( name = "usage_cc_proto", deps = [":usage_proto"], ) + +proto_library( + name = "autoscaler_proto", + srcs = ["experimental/autoscaler.proto"], + deps = [ + ":instance_manager_proto", + ], +) + 
+python_grpc_compile( + name = "autoscaler_py_proto", + deps = [":autoscaler_proto"], +) + +cc_proto_library( + name = "autoscaler_cc_proto", + deps = [":autoscaler_proto"], +) diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 751194bc8f0d..6ac9b1411135 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -763,6 +763,8 @@ message CoreWorkerStats { WorkerType worker_type = 23; // Length of the number of objects without truncation. int64 objects_total = 24; + // Number of objects owned by the worker. + int64 num_owned_objects = 25; } // Resource usage reported by the node reporter. diff --git a/src/ray/protobuf/experimental/autoscaler.proto b/src/ray/protobuf/experimental/autoscaler.proto new file mode 100644 index 000000000000..6cd8c0f7fbd0 --- /dev/null +++ b/src/ray/protobuf/experimental/autoscaler.proto @@ -0,0 +1,179 @@ +// Copyright 2023 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; +option cc_enable_arenas = true; + +package ray.autoscaler; + +import "src/ray/protobuf/experimental/instance_manager.proto"; + +// ============= Cluster Resources ==================== +// +// Following fields represents the Cluster Resources autoscaler interested +// in. + +// Represents an anti-affinity constraint. A bundle with this constraint +// can't be allocated to a node that has a label with the same name and +// value. 
This is used to implement placement group anti-affinity. +// +// For placement group, the label_name is "_PG" (reserved), +// and the label_value is the placement group id. +message AntiAffinityConstraint { + string label_name = 1; + string label_value = 2; + // If true, the label will be created on the node + // where the request with this constraint is scheduled. + bool create_label_on_schedule = 3; +} + +message PlacementConstraint { + AntiAffinityConstraint anti_affinity = 1; +} + +message ResourceRequest { + // resource requirements for the request. + map resources_bundle = 1; + // placement constraint for the request. multiple constraints + // form AND semantics. + repeated PlacementConstraint placement_constraints = 2; +} + +message ResourceRequestByCount { + ResourceRequest request = 1; + int64 count = 2; +} + +// All bundles in the same resource request require gang +// allocation semantics: they should be allocated all or nothing. +message GangResourceRequest { + // a map from bundles to the number of bundles requested. + repeated ResourceRequest requests = 1; +} + +// Cluster resource constraint represents a minimal cluster size requirement; +// it is issued through ray.autoscaler.sdk.request_resources. +message ClusterResourceConstraint { + // If not empty, the cluster should have the capacity (total resource) to + // fit the min_resources. + map min_resources = 1; + // If not empty, the cluster should have the capacity (total resource) to fit + // the min_bundles. + repeated ResourceRequest min_bundles = 2; + // Id of the job that issued this constraint. + string job_id = 3; +} + +message NodeState { + enum NodeStatus { + // Node is alive. + ALIVE = 0; + // Node is dead. + DEAD = 1; + // Node drain is pending. + DRAIN_PENDING = 2; + // Node drain failed. + DRAIN_FAILED = 3; + // Node is being drained. + DRAINING = 4; + // Node is already drained, and ready to be removed. + DRAINED = 5; + } + // The node id internal to Ray. 
+ string node_id = 11; + + // The instance id that the node is running on. + // This is passed in when the node is registered. + string instance_id = 12; + + // The available resources on the node. + // Reserved resource names: CPU, GPU, MEMORY, OBJECT_STORE_MEMORY + map available_resources = 13; + + // The corresponding total resources on the node. + map total_resources = 14; + + // Dynamic labels associated with the node. + // Reserved dynamic label names: _PG + map dynamic_labels = 15; + + // A monotonically increasing version of the node resource state. + int64 node_state_version = 16; + + // The status of the node. + NodeStatus status = 17; +} + +// ============= Autoscaling State Service API ======================= +// +// The autoscaler periodically calls +// two snapshot APIs, GetClusterResourceState +// and ReportAutoscalingState. +// The GetClusterResourceState will return a snapshot +// of the Ray state that the autoscaler is interested in, along with +// the cluster_resource_state_version (version). +// +// Separately, the autoscaler will constantly be making decisions +// based on the latest Ray state, and also change its +// state based on the information from the node provider. +// The autoscaler will periodically report its state to GCS +// through the ReportAutoscalingState API. + +message GetClusterResourceStateRequest { + // The last seen cluster resource state version. The default value is reserved for if a + // previous scheduling state has never been seen. + int64 last_seen_cluster_resource_state_version = 1; +} + +message GetClusterResourceStateReply { + // a monotonically increasing version of the cluster resources. + int64 cluster_resource_state_version = 1; + // last seen autoscaler state. + int64 last_seen_autoscaler_state_version = 2; + // Current cluster resources. + repeated NodeState node_states = 3; + // Resource requests pending scheduling. + repeated ResourceRequestByCount pending_resource_requests = 4; + // Gang resource requests pending scheduling. 
+ repeated GangResourceRequest pending_gang_resource_requests = 5; + // Cluster resource constraints. + // There could be multiple constraints issued by different + // jobs. Autoscaler to make sure all constraints are satisfied. + repeated ClusterResourceConstraint cluster_resource_constraints = 6; +} + +message ReportAutoscalingStateRequest { + int64 last_seen_cluster_resource_state_version = 1; + // A monotonically increasing version identifies + // the state of autoscaler. + // Note: for the same cluster resource state, the + // autoscaler state might be different, since + // the autoscaler's state could also be updated by + // node provider. + int64 autoscaler_state_version = 2; + repeated Instance instances = 3; + // infeasible resource requests. + repeated ResourceRequest infeasible_resource_requests = 4; + repeated ClusterResourceConstraint infeasible_gange_resource_requests = 5; + repeated ClusterResourceConstraint infeasible_cluster_resource_constraints = 6; +} + +message ReportAutoscalingStateReply {} + +service AutoscalerStateService { + rpc GetClusterResourceState(GetClusterResourceStateRequest) + returns (GetClusterResourceStateReply); + rpc ReportAutoscalingState(ReportAutoscalingStateRequest) + returns (ReportAutoscalingStateReply); +} \ No newline at end of file diff --git a/src/ray/protobuf/experimental/instance_manager.proto b/src/ray/protobuf/experimental/instance_manager.proto new file mode 100644 index 000000000000..7278899a3b63 --- /dev/null +++ b/src/ray/protobuf/experimental/instance_manager.proto @@ -0,0 +1,151 @@ +// Copyright 2017 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; +option cc_enable_arenas = true; + +package ray.autoscaler; + +enum RayNodeKind { + UNKNOWN = 0; + HEAD = 1; + WORKER = 2; +} + +// A node type that's available for the cluster. +message InstanceType { + // the name of the instance type. e.g. "m4.large" + string type_name = 1; + RayNodeKind node_kind = 2; + // available resources on the node. such as {"CPU": 4, "GPU": 1} + map resources = 3; +} + +message GetAvailableInstanceTypesRequest {} + +message GetAvailableInstanceTypesResponse { + repeated InstanceType instance_types = 1; + // number of instances that are available for starting. + // this can change if the cloud provider has a limit on + // number of instances that can be started. + int64 available_instances = 2; +} + +// Represents the state of a launched instance. +// An instance is considered launched as long as +// it has a unique instance_id associated with it. +// +// Note a launched instance may be DEAD. In this case, +// the state will be garbage collected after some timeout +// period (by default 30 minutes). +message Instance { + enum InstanceStatus { + // The unspecified state - most likely it is queued. + INSTANCE_STATUS_UNSPECIFIED = 0; + // Instance is starting. The first state update received from the + // instance. + STARTING = 1; + // The instance is running - one of two states of a healthy instance. + RUNNING = 2; + // The instance is idle - one of two states of a healthy instance. 
+ IDLE = 3; + // The instance is stopping - usually follows from the RUNNING, IDLE, + // PREEMPT_REQUEST or DRAIN_REQUEST state. + STOPPING = 4; + // The instance is stopped - follows from the STOPPING state. + STOPPED = 5; + // The instance is in a bad state - but it is still able to send updates. + FAILING = 6; + // The subscribe service moves instances to this state if they + // have been idle for too long. This allows the cluster manager to + // make a final decision on whether or not to commence a drain + // sequence for this instance. + DRAIN_CONFIRMATION_PENDING = 7; + // The instance should be drained, Ray should start draining process + // but could reject if failed to drain. + DRAIN_REQUEST = 8; + // The instance will be preempted by the instance manager, regardless + // of whether it is drainable or not. + PREEMPT_REQUEST = 9; + // An optional state that can be used to indicate that the instance + // is allocated from cloud provider, but ray hasn't been installed yet. + INSTANCE_ALLOCATED = 10; + // An optional state that can be used to indicate that the instance + // is currently installing Ray. + INSTALLING_RAY = 11; + // An optional state that can be used to indicate that the instance + // failed to allocate from cloud provider. + ALLOCATION_FAILED = 12; + // Node is deleted. + GARAGE_COLLECTED = 13; + } + // an unique id for the instance that's generated by the + // instance manager. This may be optional if + // the instance hasn't be started yet. + string instance_id = 11; + // the status of the instance. + InstanceStatus status = 12; + // the node type of the instance. + string node_type = 13; + // The corresponding total resources on the node. + map total_resources = 14; + // timestamp of the last state changed. + int64 timestamp_since_last_state_change = 15; + // the external id of the instance that's generated by + // the cloud provider like AWS, GCP, etc. + // Note this id can be reused by different instances. 
+ string cloud_instance_id = 16; + // internal ip address of the instance. + string internal_ip = 17; + // external ip address of the instance. + string external_ip = 18; + // the monotonically increasing version number of the instance. + int64 version = 19; +} + +message UpdateInstanceManagerStateRequest { + int64 expected_version = 1; + repeated InstanceType new_nodes_to_start = 2; + repeated string instance_ids_to_terminate = 3; +} + +message UpdateInstanceManagerStateReply { + bool success = 1; + string error_message = 2; + int64 version = 3; +} + +message InstanceManagerState { + // a monotonically increasing version number. + // the version number is incremented whenever + // the state is updated (either by successful adjusting request, + // or instance state change). + int64 version = 1; + repeated Instance instances = 2; +} + +message GetInstanceManagerStateRequest {} + +message GetInstanceManagerStateReply { + InstanceManagerState state = 1; +} + +service InstanceManagerService { + rpc GetInstanceManagerState(GetInstanceManagerStateRequest) + returns (GetInstanceManagerStateReply); + rpc UpdateInstanceManagerState(UpdateInstanceManagerStateRequest) + returns (UpdateInstanceManagerStateReply); + rpc GetAvailableInstanceTypes(GetAvailableInstanceTypesRequest) + returns (GetAvailableInstanceTypesResponse); +} diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index ff49640628e3..50fbe259286b 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -240,6 +240,8 @@ message TaskStateUpdate { optional RayErrorInfo error_info = 9; // Task logs info. optional TaskLogInfo task_log_info = 10; + // Actor task repr name. + optional string actor_repr_name = 11; } // Represents events and state changes from a single task run. @@ -258,15 +260,26 @@ message TaskEvents { bytes job_id = 6; } +message TaskAttempt { + // The task id of the task attempt. + bytes task_id = 1; + // The attempt number of the task attempt. 
+ int32 attempt_number = 2; +} + // Represents a compact list of task state events by different tasks, // where each task has a list of state change events. message TaskEventData { // A batch of task state change events. repeated TaskEvents events_by_task = 1; - // Number of dropped profile task events due to buffer size limit on workers. - int32 num_profile_task_events_dropped = 3; - // Number of dropped status task events due to buffer size limit on workers. - int32 num_status_task_events_dropped = 4; + // A list of task attempts that were dropped on the worker. + // We only drop task attempts if task state update is lost on the worker + // due to too many events being generated. + repeated TaskAttempt dropped_task_attempts = 2; + // Number of profile events dropped on the worker. + int32 num_profile_events_dropped = 3; + // Current job the worker is reporting data for. + bytes job_id = 4; } message ResourceTableData { diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 9c63d76c4130..7bc382bc0842 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -538,11 +538,20 @@ message GcsPublishReply { message GcsSubscriberPollRequest { /// The id of the subscriber. bytes subscriber_id = 1; + /// The max sequence_id that has been processed by the subscriber. The Publisher + /// will drop queued messages with smaller sequence_id for this subscriber. + int64 max_processed_sequence_id = 2; + /// The expected publisher_id. The publisher will ignore the + /// max_processed_sequence_id if the publisher_id doesn't match. + /// This usually happens when GCS fails over. + bytes publisher_id = 3; } message GcsSubscriberPollReply { /// The messages that are published. repeated PubMessage pub_messages = 1; + /// The publisher's id. + bytes publisher_id = 2; // Not populated. 
GcsStatus status = 100; } @@ -635,22 +644,27 @@ message AddTaskEventDataReply { } message GetTaskEventsRequest { - message TaskIDs { - repeated string vals = 1; - } - oneof select_by { + // Filter object where predicates are AND together. + message Filters { // Get task events from a job. - string job_id = 1; + optional bytes job_id = 1; // Get task events from a set of tasks. - TaskIDs task_ids = 2; + repeated bytes task_ids = 2; + // Get the task events with an actor id. + optional bytes actor_id = 3; + // Get the task events of task with names. + optional string name = 4; + // True if task events from driver (only profiling events) should be excluded. + optional bool exclude_driver = 5; } // Maximum number of TaskEvents to return. // If set, the exact `limit` TaskEvents returned do not have any ordering or selection // guarantee. optional int64 limit = 3; - // True if task events from driver (only profiling events) should be excluded. - bool exclude_driver = 4; + + // Filters to apply to the get query. + optional Filters filters = 4; } message GetTaskEventsReply { diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index d5861747faab..16194eea2af7 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -219,6 +219,8 @@ message ObjectStoreStats { // the node has more pull requests than available object store // memory. bool object_pulls_queued = 13; + // The number of primary copies of objects in the local node. + int64 num_object_store_primary_copies = 14; } message GetNodeStatsReply { diff --git a/src/ray/protobuf/pubsub.proto b/src/ray/protobuf/pubsub.proto index eff40b652fea..aba5f588d4b1 100644 --- a/src/ray/protobuf/pubsub.proto +++ b/src/ray/protobuf/pubsub.proto @@ -77,6 +77,8 @@ message PubMessage { // The message that indicates the given key id is not available anymore. FailureMessage failure_message = 6; } + /// A monotonically increasing sequence_id generated by the publisher. 
+ int64 sequence_id = 16; } message WorkerObjectEvictionMessage { @@ -202,11 +204,20 @@ message WorkerObjectLocationsSubMessage { message PubsubLongPollingRequest { /// The id of the subscriber. bytes subscriber_id = 1; + /// The max sequence_id that has been processed by the subscriber. The Publisher + /// will drop queued messages with smaller sequence_id for this subscriber. + int64 max_processed_sequence_id = 2; + /// The expected publisher_id. The publisher will ignore the + /// max_processed_sequence_id if the publisher_id doesn't match. + /// This usually happens when GCS fails over. + bytes publisher_id = 3; } message PubsubLongPollingReply { /// The messages that are published. repeated PubMessage pub_messages = 1; + /// The publisher_id. + bytes publisher_id = 2; } message PubsubCommandBatchRequest { diff --git a/src/ray/protobuf/reporter.proto b/src/ray/protobuf/reporter.proto index cc79e8f10875..b2425ae1eec5 100644 --- a/src/ray/protobuf/reporter.proto +++ b/src/ray/protobuf/reporter.proto @@ -100,6 +100,10 @@ message StreamLogRequest { // if keep_alive is true, this indicates how frequently to poll the // log file for new lines optional float interval = 4; + // Task id to start streaming from this file. + optional string task_id = 5; + // Attempt number of the task. + optional int64 attempt_number = 6; } message StreamLogReply { diff --git a/src/ray/protobuf/serve.proto b/src/ray/protobuf/serve.proto index 9f47aa518f46..7f6fdbf4a57c 100644 --- a/src/ray/protobuf/serve.proto +++ b/src/ray/protobuf/serve.proto @@ -150,7 +150,8 @@ message ActorNameList { message DeploymentVersion { string code_version = 1; - bytes user_config = 2; + DeploymentConfig deployment_config = 2; + string ray_actor_options = 3; } message ReplicaConfig { @@ -169,6 +170,7 @@ message DeploymentInfo { string actor_name = 5; string version = 6; int64 end_time_ms = 7; + string app_name = 8; } // Wrap DeploymentInfo and route. The "" route value needs to be converted to None/null. 
@@ -185,9 +187,9 @@ message DeploymentRouteList { enum DeploymentStatus { // Keep frontend code of ServeDeploymentStatus in dashboard/client/src/type/serve.ts // in sync with this enum - UPDATING = 0; - HEALTHY = 1; - UNHEALTHY = 2; + DEPLOYMENT_STATUS_UPDATING = 0; + DEPLOYMENT_STATUS_HEALTHY = 1; + DEPLOYMENT_STATUS_UNHEALTHY = 2; } message DeploymentStatusInfo { @@ -204,11 +206,11 @@ message DeploymentStatusInfoList { enum ApplicationStatus { // Keep frontend code of ServeApplicationStatus in dashboard/client/src/type/serve.ts // in sync with this enum - DEPLOYING = 0; - RUNNING = 1; - DEPLOY_FAILED = 2; - DELETING = 3; - NOT_STARTED = 5; + APPLICATION_STATUS_DEPLOYING = 0; + APPLICATION_STATUS_RUNNING = 1; + APPLICATION_STATUS_DEPLOY_FAILED = 2; + APPLICATION_STATUS_DELETING = 3; + APPLICATION_STATUS_NOT_STARTED = 5; } message ApplicationStatusInfo { diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index 3ad666484452..fbbe4bf0afab 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -114,14 +114,13 @@ enum TagKey { NUM_ACTOR_TASKS = 306; NUM_NORMAL_TASKS = 307; NUM_DRIVERS = 308; + // State api import usage. + EXPERIMENTAL_STATE_API_IMPORT = 309; // Data // Logical operators, stored in JSON format with operator name and count. // Example: {"MapBatches": 2, "Filter": 1} DATA_LOGICAL_OPS = 400; - // Block formats: simple, pandas, or arrow. - // Example: {"pandas": 2, "numpy": 1} - DATA_BLOCK_FORMATS = 401; // AIR // Name of AIR trainer, or "Custom" if user-defined. @@ -133,4 +132,19 @@ enum TagKey { // Name of Tune scheduler algorithm or "Custom" if user-defined. // Example: "FIFOScheduler" TUNE_SCHEDULER = 502; + // Ray AIR environment variable usage stored in JSON list format + // This lists which of the environment variables exposed by the AIR libraries + // are provided by the user. 
+ // Ex: ["RAY_AIR_LOCAL_CACHE_DIR", "TUNE_FALLBACK_TO_LATEST_CHECKPOINT"] + AIR_ENV_VARS = 503; + // Fully user-controlled experiment tracking integrations ("1" if used) + // NOTE: These tags + the callback metrics can be aggregated to extract + // total experiment tracking integration usage. + AIR_SETUP_WANDB_INTEGRATION_USED = 504; + AIR_SETUP_MLFLOW_INTEGRATION_USED = 505; + // Built-in callbacks, stored in JSON format with callback name -> count. + // Ex: {"WandbLoggerCallback": 1, "MLflowLoggerCallback": 1} + AIR_CALLBACKS = 506; + // Storage configuration for AIR experiment + AIR_STORAGE_CONFIGURATION = 507; } diff --git a/src/ray/pubsub/mock_pubsub.h b/src/ray/pubsub/mock_pubsub.h index 83dec35f72a3..5cb085a83444 100644 --- a/src/ray/pubsub/mock_pubsub.h +++ b/src/ray/pubsub/mock_pubsub.h @@ -67,7 +67,7 @@ class MockPublisher : public pubsub::PublisherInterface { const pubsub::SubscriberID &subscriber_id, const std::optional &key_id)); - MOCK_METHOD1(Publish, void(const rpc::PubMessage &pub_message)); + MOCK_METHOD1(Publish, void(rpc::PubMessage pub_message)); MOCK_METHOD3(UnregisterSubscription, bool(const rpc::ChannelType channel_type, diff --git a/src/ray/pubsub/publisher.cc b/src/ray/pubsub/publisher.cc index 40d6c4412815..fec34b5e32fd 100644 --- a/src/ray/pubsub/publisher.cc +++ b/src/ray/pubsub/publisher.cc @@ -22,24 +22,22 @@ namespace pubsub { namespace pub_internal { -bool BasicEntityState::Publish(const rpc::PubMessage &pub_message) { +bool BasicEntityState::Publish(std::shared_ptr msg) { if (subscribers_.empty()) { return false; } - const auto msg = std::make_shared(pub_message); for (auto &[id, subscriber] : subscribers_) { subscriber->QueueMessage(msg); } return true; } -bool CappedEntityState::Publish(const rpc::PubMessage &pub_message) { +bool CappedEntityState::Publish(std::shared_ptr msg) { if (subscribers_.empty()) { return false; } - const int64_t message_size = pub_message.ByteSizeLong(); - + const int64_t message_size = 
msg->ByteSizeLong(); while (!pending_messages_.empty()) { // NOTE: if atomic ref counting becomes too expensive, it should be possible // to implement inflight message tracking across subscribers with non-atomic @@ -77,7 +75,6 @@ bool CappedEntityState::Publish(const rpc::PubMessage &pub_message) { message_sizes_.pop(); } - const auto msg = std::make_shared(pub_message); pending_messages_.push(msg); total_size_ += message_size; message_sizes_.push(message_size); @@ -104,10 +101,10 @@ const absl::flat_hash_map &EntityState::Subscri SubscriptionIndex::SubscriptionIndex(rpc::ChannelType channel_type) : channel_type_(channel_type), subscribers_to_all_(CreateEntityState()) {} -bool SubscriptionIndex::Publish(const rpc::PubMessage &pub_message) { +bool SubscriptionIndex::Publish(std::shared_ptr pub_message) { const bool publish_to_all = subscribers_to_all_->Publish(pub_message); bool publish_to_entity = false; - auto it = entities_.find(pub_message.key_id()); + auto it = entities_.find(pub_message->key_id()); if (it != entities_.end()) { publish_to_entity = it->second->Publish(pub_message); } @@ -246,6 +243,22 @@ std::unique_ptr SubscriptionIndex::CreateEntityState() { void SubscriberState::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request, rpc::PubsubLongPollingReply *reply, rpc::SendReplyCallback send_reply_callback) { + auto max_processed_sequence_id = request.max_processed_sequence_id(); + if (request.publisher_id().empty() || + publisher_id_ != PublisherID::FromBinary(request.publisher_id())) { + // in case the publisher_id mismatches, we should ignore the + // max_processed_sequence_id. + max_processed_sequence_id = 0; + } + + // clean up messages that have already been processed. 
+ while (!mailbox_.empty() && + mailbox_.front()->sequence_id() <= max_processed_sequence_id) { + RAY_LOG(DEBUG) << "removing " << max_processed_sequence_id << " : " + << mailbox_.front()->sequence_id(); + mailbox_.pop_front(); + } + if (long_polling_connection_) { // Because of the new long polling request, flush the current polling request with an // empty reply. @@ -262,7 +275,8 @@ void SubscriberState::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &r void SubscriberState::QueueMessage(const std::shared_ptr &pub_message, bool try_publish) { - mailbox_.push(pub_message); + RAY_LOG(DEBUG) << "enqueue: " << pub_message->sequence_id(); + mailbox_.push_back(pub_message); if (try_publish) { PublishIfPossible(); } @@ -278,28 +292,35 @@ bool SubscriberState::PublishIfPossible(bool force_noop) { // No message should have been added to the reply. RAY_CHECK(long_polling_connection_->reply->pub_messages().empty()); + *long_polling_connection_->reply->mutable_publisher_id() = publisher_id_.Binary(); if (!force_noop) { - for (int i = 0; i < publish_batch_size_ && !mailbox_.empty(); ++i) { - const rpc::PubMessage &msg = *mailbox_.front(); + for (auto it = mailbox_.begin(); it != mailbox_.end(); it++) { + if (long_polling_connection_->reply->pub_messages().size() >= publish_batch_size_) { + break; + } + const rpc::PubMessage &msg = **it; // Avoid sending empty message to the subscriber. The message might have been // cleared because the subscribed entity's buffer was full. if (msg.inner_message_case() != rpc::PubMessage::INNER_MESSAGE_NOT_SET) { *long_polling_connection_->reply->add_pub_messages() = msg; } - mailbox_.pop(); } } + + RAY_LOG(DEBUG) << "sending reply back" + << long_polling_connection_->reply->DebugString(); long_polling_connection_->send_reply_callback(Status::OK(), nullptr, nullptr); // Clean up & update metadata. long_polling_connection_.reset(); + // Clean up & update metadata. 
last_connection_update_time_ms_ = get_time_ms_(); return true; } bool SubscriberState::CheckNoLeaks() const { // If all message in the mailbox has been replied, consider there is no leak. - return !long_polling_connection_ && mailbox_.empty(); + return mailbox_.empty(); } bool SubscriberState::ConnectionExists() const { @@ -319,7 +340,8 @@ void Publisher::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request RAY_CHECK(send_reply_callback != nullptr); const auto subscriber_id = SubscriberID::FromBinary(request.subscriber_id()); - RAY_LOG(DEBUG) << "Long polling connection initiated by " << subscriber_id.Hex(); + RAY_LOG(DEBUG) << "Long polling connection initiated by " << subscriber_id.Hex() + << ", publisher_id " << publisher_id_.Hex(); absl::MutexLock lock(&mutex_); auto it = subscribers_.find(subscriber_id); if (it == subscribers_.end()) { @@ -329,7 +351,8 @@ void Publisher::ConnectToSubscriber(const rpc::PubsubLongPollingRequest &request std::make_unique(subscriber_id, get_time_ms_, subscriber_timeout_ms_, - publish_batch_size_)) + publish_batch_size_, + publisher_id_)) .first; } auto &subscriber = it->second; @@ -350,7 +373,8 @@ bool Publisher::RegisterSubscription(const rpc::ChannelType channel_type, std::make_unique(subscriber_id, get_time_ms_, subscriber_timeout_ms_, - publish_batch_size_)) + publish_batch_size_, + publisher_id_)) .first; } pub_internal::SubscriberState *subscriber = it->second.get(); @@ -359,13 +383,15 @@ bool Publisher::RegisterSubscription(const rpc::ChannelType channel_type, return subscription_index_it->second.AddEntry(key_id.value_or(""), subscriber); } -void Publisher::Publish(const rpc::PubMessage &pub_message) { +void Publisher::Publish(rpc::PubMessage pub_message) { + RAY_CHECK_EQ(pub_message.sequence_id(), 0) << "sequence_id should not be set;"; const auto channel_type = pub_message.channel_type(); absl::MutexLock lock(&mutex_); auto &subscription_index = subscription_index_map_.at(channel_type); // TODO(sang): 
Currently messages are lost if publish happens // before there's any subscriber for the object. - subscription_index.Publish(pub_message); + pub_message.set_sequence_id(++next_sequence_id_); + subscription_index.Publish(std::make_shared(std::move(pub_message))); cum_pub_message_cnt_[channel_type]++; } diff --git a/src/ray/pubsub/publisher.h b/src/ray/pubsub/publisher.h index 20ccdc29ebdb..f14b0c4e3775 100644 --- a/src/ray/pubsub/publisher.h +++ b/src/ray/pubsub/publisher.h @@ -16,6 +16,7 @@ #include +#include #include #include #include @@ -35,6 +36,7 @@ namespace ray { namespace pubsub { using SubscriberID = UniqueID; +using PublisherID = UniqueID; namespace pub_internal { @@ -47,7 +49,7 @@ class EntityState { /// Publishes the message to subscribers of the entity. /// Returns true if there are subscribers, returns false otherwise. - virtual bool Publish(const rpc::PubMessage &pub_message) = 0; + virtual bool Publish(std::shared_ptr pub_message) = 0; /// Manages the set of subscribers of this entity. bool AddSubscriber(SubscriberState *subscriber); @@ -77,14 +79,14 @@ class EntityState { /// Publishes the message to all subscribers, without size cap on buffered messages. class BasicEntityState : public EntityState { public: - bool Publish(const rpc::PubMessage &pub_message) override; + bool Publish(std::shared_ptr pub_message) override; }; /// Publishes the message to all subscribers, and enforce a total size cap on buffered /// messages. class CappedEntityState : public EntityState { public: - bool Publish(const rpc::PubMessage &pub_message) override; + bool Publish(std::shared_ptr pub_message) override; private: // Tracks inflight messages. The messages have shared ownership by @@ -110,7 +112,7 @@ class SubscriptionIndex { /// Publishes the message to relevant subscribers. /// Returns true if there are subscribers listening on the entity key of the message, /// returns false otherwise. 
- bool Publish(const rpc::PubMessage &pub_message); + bool Publish(std::shared_ptr pub_message); /// Adds a new subscriber and the key it subscribes to. /// When `key_id` is empty, the subscriber subscribes to all keys. @@ -172,12 +174,14 @@ class SubscriberState { SubscriberState(SubscriberID subscriber_id, std::function get_time_ms, uint64_t connection_timeout_ms, - const int publish_batch_size) + const int publish_batch_size, + PublisherID publisher_id) : subscriber_id_(subscriber_id), get_time_ms_(std::move(get_time_ms)), connection_timeout_ms_(connection_timeout_ms), publish_batch_size_(publish_batch_size), - last_connection_update_time_ms_(get_time_ms_()) {} + last_connection_update_time_ms_(get_time_ms_()), + publisher_id_(publisher_id) {} ~SubscriberState() { // Force a push to close the long-polling. @@ -229,7 +233,7 @@ class SubscriberState { /// Inflight long polling reply callback, for replying to the subscriber. std::unique_ptr long_polling_connection_; /// Queued messages to publish. - std::queue> mailbox_; + std::deque> mailbox_; /// Callback to get the current time. const std::function get_time_ms_; /// The time in which the connection is considered as timed out. @@ -238,6 +242,7 @@ class SubscriberState { const int publish_batch_size_; /// The last time long polling was connected in milliseconds. double last_connection_update_time_ms_; + PublisherID publisher_id_; }; } // namespace pub_internal @@ -263,7 +268,7 @@ class PublisherInterface { /// /// \param pub_message The message to publish. /// Required to contain channel_type and key_id fields. - virtual void Publish(const rpc::PubMessage &pub_message) = 0; + virtual void Publish(rpc::PubMessage pub_message) = 0; /// Publish to the subscriber that the given key id is not available anymore. /// It will invoke the failure callback on the subscriber side. 
@@ -315,11 +320,13 @@ class Publisher : public PublisherInterface { PeriodicalRunner *const periodical_runner, std::function get_time_ms, const uint64_t subscriber_timeout_ms, - const int publish_batch_size) + const int publish_batch_size, + PublisherID publisher_id = NodeID::FromRandom()) : periodical_runner_(periodical_runner), get_time_ms_(std::move(get_time_ms)), subscriber_timeout_ms_(subscriber_timeout_ms), - publish_batch_size_(publish_batch_size) { + publish_batch_size_(publish_batch_size), + publisher_id_(publisher_id) { // Insert index map for each channel. for (auto type : channels) { subscription_index_map_.emplace(type, type); @@ -354,7 +361,7 @@ class Publisher : public PublisherInterface { /// /// \param pub_message The message to publish. /// Required to contain channel_type and key_id fields. - void Publish(const rpc::PubMessage &pub_message) override; + void Publish(rpc::PubMessage pub_message) override; /// Publish to the subscriber that the given key id is not available anymore. /// It will invoke the failure callback on the subscriber side. @@ -461,6 +468,23 @@ class Publisher : public PublisherInterface { int publish_batch_size_; absl::flat_hash_map cum_pub_message_cnt_ GUARDED_BY(mutex_); + + /// The monotonically increasing sequence_id for this publisher. + /// The publisher will add this sequence_id to every message to be published. + /// The sequence_id is used for handling failures: the publisher will not delete + /// a message from the sending queue until the subscriber has acknowledged + /// it has processed beyond the message's sequence_id. + /// + /// Note: + /// - a valid sequence_id starts from 1. + /// - the subscriber doesn't expect the sequences it receives to be contiguous. + /// this is due to the fact a subscriber can only subscribe to a subset + /// of a channel. + int64_t next_sequence_id_ GUARDED_BY(mutex_) = 0; + + /// A unique identifier that identifies the publisher. + /// TODO(scv119) add docs about the semantics. 
+ const PublisherID publisher_id_; }; } // namespace pubsub diff --git a/src/ray/pubsub/subscriber.cc b/src/ray/pubsub/subscriber.cc index f36fbabf52c9..b5546fe5d17d 100644 --- a/src/ray/pubsub/subscriber.cc +++ b/src/ray/pubsub/subscriber.cc @@ -17,6 +17,9 @@ namespace ray { namespace pubsub { +namespace { +const PublisherID kDefaultPublisherID{}; +} /////////////////////////////////////////////////////////////////////////////// /// SubscriberChannel @@ -349,7 +352,9 @@ void Subscriber::MakeLongPollingPubsubConnection(const rpc::Address &publisher_a auto subscriber_client = get_client_(publisher_address); rpc::PubsubLongPollingRequest long_polling_request; long_polling_request.set_subscriber_id(subscriber_id_.Binary()); - + auto &processed_state = processed_sequences_[publisher_id]; + long_polling_request.set_publisher_id(processed_state.first.Binary()); + long_polling_request.set_max_processed_sequence_id(processed_state.second); subscriber_client->PubsubLongPolling( long_polling_request, [this, publisher_address](Status status, const rpc::PubsubLongPollingReply &reply) { @@ -362,7 +367,7 @@ void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address const Status &status, const rpc::PubsubLongPollingReply &reply) { const auto publisher_id = PublisherID::FromBinary(publisher_address.worker_id()); - RAY_LOG(DEBUG) << "Long polling request has replied from " << publisher_id; + RAY_LOG(DEBUG) << "Long polling request has been replied from " << publisher_id; RAY_CHECK(publishers_connected_.count(publisher_id)); if (!status.ok()) { @@ -377,10 +382,38 @@ void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address // Empty the command queue because we cannot send commands anymore. 
commands_.erase(publisher_id); } else { + RAY_CHECK(!reply.publisher_id().empty()) << "publisher_id is empty."; + auto reply_publisher_id = PublisherID::FromBinary(reply.publisher_id()); + if (reply_publisher_id != processed_sequences_[publisher_id].first) { + if (processed_sequences_[publisher_id].first != kDefaultPublisherID) { + RAY_LOG(INFO) << "Received publisher_id " << reply_publisher_id.Hex() + << " is different from last seen publisher_id " + << processed_sequences_[publisher_id].first + << ", this can only happen when gcs failsover."; + } + // reset publisher_id and processed_sequence + // if the publisher_id changes. + processed_sequences_[publisher_id].first = reply_publisher_id; + processed_sequences_[publisher_id].second = 0; + } + for (int i = 0; i < reply.pub_messages_size(); i++) { const auto &msg = reply.pub_messages(i); const auto channel_type = msg.channel_type(); const auto &key_id = msg.key_id(); + RAY_CHECK_GT(msg.sequence_id(), 0) + << "message's sequence_id is invalid " << msg.sequence_id(); + + if (msg.sequence_id() <= processed_sequences_[publisher_id].second) { + RAY_LOG_EVERY_MS(WARNING, 10000) + << "Received message out of order, publisher_id: " + << processed_sequences_[publisher_id].first + << ", received message sequence_id " + << processed_sequences_[publisher_id].second + << ", received message sequence_id " << msg.sequence_id(); + continue; + } + processed_sequences_[publisher_id].second = msg.sequence_id(); // If the published message is a failure message, the publisher indicates // this key id is failed. Invoke the failure callback. 
At this time, we should not // unsubscribe the publisher because there are other entries that subscribe from the @@ -399,6 +432,7 @@ void Subscriber::HandleLongPollingResponse(const rpc::Address &publisher_address if (SubscriptionExists(publisher_id)) { MakeLongPollingPubsubConnection(publisher_address); } else { + processed_sequences_.erase(publisher_id); publishers_connected_.erase(publisher_id); } } @@ -478,7 +512,7 @@ bool Subscriber::CheckNoLeaks() const { } } return !leaks && publishers_connected_.empty() && command_batch_sent_.empty() && - commands_.empty(); + commands_.empty() && processed_sequences_.empty(); } std::string Subscriber::DebugString() const { diff --git a/src/ray/pubsub/subscriber.h b/src/ray/pubsub/subscriber.h index a76e9cff40cb..8c73716d8844 100644 --- a/src/ray/pubsub/subscriber.h +++ b/src/ray/pubsub/subscriber.h @@ -391,6 +391,7 @@ class Subscriber : public SubscriberInterface { /// FRIEND_TEST(IntegrationTest, SubscribersToOneIDAndAllIDs); + FRIEND_TEST(IntegrationTest, GcsFailsOver); FRIEND_TEST(SubscriberTest, TestBasicSubscription); FRIEND_TEST(SubscriberTest, TestSingleLongPollingWithMultipleSubscriptions); FRIEND_TEST(SubscriberTest, TestMultiLongPollingWithTheSameSubscription); @@ -491,6 +492,11 @@ class Subscriber : public SubscriberInterface { /// Mapping of channel type to channels. absl::flat_hash_map> channels_ GUARDED_BY(mutex_); + + /// Keeps track of the last processed sequence_id per publisher. + /// Note the publisher_id only changes if GCS fails over. + absl::flat_hash_map> processed_sequences_ + GUARDED_BY(mutex_); }; } // namespace pubsub diff --git a/src/ray/pubsub/test/integration_test.cc b/src/ray/pubsub/test/integration_test.cc index ffca2acd0e05..bb6574e3a2f0 100644 --- a/src/ray/pubsub/test/integration_test.cc +++ b/src/ray/pubsub/test/integration_test.cc @@ -50,8 +50,6 @@ class SubscriberServiceImpl final : public rpc::SubscriberService::CallbackServi std::function failure_cb) { // Long polling should always succeed. 
RAY_CHECK_OK(status); - RAY_CHECK(success_cb == nullptr); - RAY_CHECK(failure_cb == nullptr); reactor->Finish(grpc::Status::OK); }); return reactor; @@ -149,8 +147,10 @@ class IntegrationTest : public ::testing::Test { } ~IntegrationTest() { + RAY_LOG(INFO) << "Shutting down server."; // Stop callback runners. io_service_.Stop(); + RAY_LOG(INFO) << "Shutting down server1."; // Assume no new subscriber is connected after the unregisteration above. Otherwise // shutdown would hang below. server_->Shutdown(); @@ -179,6 +179,8 @@ class IntegrationTest : public ::testing::Test { server_ = builder.BuildAndStart(); } + void RestartServer() { SetupServer(); } + std::unique_ptr CreateSubscriber() { return std::make_unique( UniqueID::FromRandom(), @@ -300,6 +302,5 @@ TEST_F(IntegrationTest, SubscribersToOneIDAndAllIDs) { absl::SleepFor(absl::Seconds(1)); } } - } // namespace pubsub } // namespace ray diff --git a/src/ray/pubsub/test/publisher_test.cc b/src/ray/pubsub/test/publisher_test.cc index ccc41c10f7d3..604d0d352c45 100644 --- a/src/ray/pubsub/test/publisher_test.cc +++ b/src/ray/pubsub/test/publisher_test.cc @@ -23,6 +23,9 @@ namespace ray { namespace pubsub { +namespace { +const NodeID kDefaultPublisherId = NodeID::FromRandom(); +} using namespace pub_internal; @@ -44,19 +47,28 @@ class PublisherTest : public ::testing::Test { /*periodic_runner=*/periodic_runner_.get(), /*get_time_ms=*/[this]() { return current_time_; }, /*subscriber_timeout_ms=*/subscriber_timeout_ms_, - /*batch_size*/ 100); + /*batch_size*/ 100, + kDefaultPublisherId); current_time_ = 0; request_.set_subscriber_id(subscriber_id_.Binary()); + request_.set_publisher_id(kDefaultPublisherId.Binary()); } void TearDown() {} - const rpc::PubMessage GeneratePubMessage(const ObjectID &object_id) { + void ResetSequenceId() { sequence_id_ = 0; } + + int64_t GetNextSequenceId() { return ++sequence_id_; } + + const rpc::PubMessage GeneratePubMessage(const ObjectID &object_id, + int64_t sequence_id = 0) { 
rpc::PubMessage pub_message; auto *object_eviction_msg = pub_message.mutable_worker_object_eviction_message(); object_eviction_msg->set_object_id(object_id.Binary()); pub_message.set_key_id(object_id.Binary()); pub_message.set_channel_type(rpc::ChannelType::WORKER_OBJECT_EVICTION); + RAY_LOG(INFO) << "message sequence_id is" << sequence_id; + pub_message.set_sequence_id(sequence_id); return pub_message; } @@ -81,7 +93,8 @@ class PublisherTest : public ::testing::Test { NodeID::FromRandom(), /*get_time_ms=*/[]() { return 1.0; }, /*subscriber_timeout_ms=*/1000, - /*publish_batch_size=*/1000)); + /*publish_batch_size=*/1000, + kDefaultPublisherId)); return subscribers_.back().get(); } @@ -107,6 +120,7 @@ class PublisherTest : public ::testing::Test { const SubscriberID subscriber_id_ = SubscriberID::FromRandom(); rpc::PubsubLongPollingRequest request_; std::vector> subscribers_; + int64_t sequence_id_ = 0; }; TEST_F(PublisherTest, TestSubscriptionIndexSingeNodeSingleObject) { @@ -327,7 +341,11 @@ TEST_F(PublisherTest, TestSubscriber) { }; auto subscriber = std::make_shared( - subscriber_id_, [this]() { return current_time_; }, subscriber_timeout_ms_, 10); + subscriber_id_, + [this]() { return current_time_; }, + subscriber_timeout_ms_, + 10, + kDefaultPublisherId); // If there's no connection, it will return false. ASSERT_FALSE(subscriber->PublishIfPossible()); // Try connecting. @@ -342,8 +360,9 @@ TEST_F(PublisherTest, TestSubscriber) { absl::flat_hash_set published_objects; // Make sure publishing one object works as expected. 
auto oid = ObjectID::FromRandom(); - subscriber->QueueMessage(std::make_shared(GeneratePubMessage(oid)), - /*try_publish=*/false); + subscriber->QueueMessage( + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), + /*try_publish=*/false); published_objects.emplace(oid); ASSERT_TRUE(subscriber->PublishIfPossible()); ASSERT_TRUE(object_ids_published.contains(oid)); @@ -353,8 +372,9 @@ TEST_F(PublisherTest, TestSubscriber) { // Add 3 oids and see if it works properly. for (int i = 0; i < 3; i++) { oid = ObjectID::FromRandom(); - subscriber->QueueMessage(std::make_shared(GeneratePubMessage(oid)), - /*try_publish=*/false); + subscriber->QueueMessage( + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), + /*try_publish=*/false); published_objects.emplace(oid); } // Since there's no connection, objects won't be published. @@ -363,29 +383,51 @@ TEST_F(PublisherTest, TestSubscriber) { for (auto oid : published_objects) { ASSERT_TRUE(object_ids_published.contains(oid)); } + + // Queue is not cleaned up if max_processed_sequence_id hasn't + // been set properly. + request_.set_max_processed_sequence_id(1); + subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + ASSERT_FALSE(subscriber->CheckNoLeaks()); + + // If we set wrong publisher_id, the queue won't be cleaned up. + request_.set_publisher_id(NodeID::FromRandom().Binary()); + request_.set_max_processed_sequence_id(sequence_id_); + subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); + ASSERT_FALSE(subscriber->CheckNoLeaks()); + + // By sending back max_processed_sequence_id, the subscriber's sending queue + // is cleaned up. 
+ request_.set_max_processed_sequence_id(sequence_id_); + request_.set_publisher_id(kDefaultPublisherId.Binary()); + subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); ASSERT_TRUE(subscriber->CheckNoLeaks()); } TEST_F(PublisherTest, TestSubscriberBatchSize) { absl::flat_hash_set object_ids_published; - send_reply_callback = [this, &object_ids_published](Status status, - std::function success, - std::function failure) { - for (int i = 0; i < reply.pub_messages_size(); i++) { - const auto &msg = reply.pub_messages(i); - const auto oid = - ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); - object_ids_published.emplace(oid); - } - reply = rpc::PubsubLongPollingReply(); - }; + int64_t max_processed_seuquence_id = 0; + send_reply_callback = + [this, &object_ids_published, &max_processed_seuquence_id]( + Status status, std::function success, std::function failure) { + for (int i = 0; i < reply.pub_messages_size(); i++) { + const auto &msg = reply.pub_messages(i); + const auto oid = + ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); + object_ids_published.emplace(oid); + max_processed_seuquence_id = + std::max(msg.sequence_id(), max_processed_seuquence_id); + } + reply = rpc::PubsubLongPollingReply(); + }; auto max_publish_size = 5; auto subscriber = std::make_shared( subscriber_id_, [this]() { return current_time_; }, subscriber_timeout_ms_, - max_publish_size); + max_publish_size, + kDefaultPublisherId); subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); absl::flat_hash_set published_objects; @@ -393,8 +435,9 @@ TEST_F(PublisherTest, TestSubscriberBatchSize) { for (int i = 0; i < 10; i++) { auto oid = ObjectID::FromRandom(); oids.push_back(oid); - subscriber->QueueMessage(std::make_shared(GeneratePubMessage(oid)), - /*try_publish=*/false); + subscriber->QueueMessage( + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId())), + /*try_publish=*/false); 
published_objects.emplace(oid); } @@ -409,6 +452,8 @@ TEST_F(PublisherTest, TestSubscriberBatchSize) { } // Remaining messages are published upon polling. + ASSERT_EQ(max_processed_seuquence_id, max_publish_size); + request_.set_max_processed_sequence_id(max_processed_seuquence_id); subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); for (int i = 0; i < 10; i++) { ASSERT_TRUE(object_ids_published.contains(oids[i])); @@ -426,7 +471,11 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { std::function failure) { reply_cnt++; }; auto subscriber = std::make_shared( - subscriber_id_, [this]() { return current_time_; }, subscriber_timeout_ms_, 10); + subscriber_id_, + [this]() { return current_time_; }, + subscriber_timeout_ms_, + 10, + kDefaultPublisherId); subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); @@ -449,6 +498,7 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { ASSERT_EQ(reply_cnt, 1); // New connection is established. + reply = rpc::PubsubLongPollingReply(); subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); ASSERT_TRUE(subscriber->IsActive()); ASSERT_TRUE(subscriber->ConnectionExists()); @@ -460,7 +510,8 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { // A message is published, so the connection is refreshed. auto oid = ObjectID::FromRandom(); - subscriber->QueueMessage(std::make_shared(GeneratePubMessage(oid))); + subscriber->QueueMessage( + std::make_shared(GeneratePubMessage(oid, GetNextSequenceId()))); ASSERT_TRUE(subscriber->IsActive()); ASSERT_FALSE(subscriber->ConnectionExists()); ASSERT_EQ(reply_cnt, 2); @@ -471,6 +522,13 @@ TEST_F(PublisherTest, TestSubscriberActiveTimeout) { ASSERT_TRUE(subscriber->IsActive()); ASSERT_FALSE(subscriber->ConnectionExists()); + // There is one message to be GCed. + ASSERT_FALSE(subscriber->CheckNoLeaks()); + + // Notify that message 1 is safe to be GCed. 
+ request_.set_max_processed_sequence_id(1); + reply = rpc::PubsubLongPollingReply(); + subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); ASSERT_TRUE(subscriber->CheckNoLeaks()); } @@ -485,7 +543,11 @@ TEST_F(PublisherTest, TestSubscriberDisconnected) { std::function failure) { reply_cnt++; }; auto subscriber = std::make_shared( - subscriber_id_, [this]() { return current_time_; }, subscriber_timeout_ms_, 10); + subscriber_id_, + [this]() { return current_time_; }, + subscriber_timeout_ms_, + 10, + kDefaultPublisherId); // Suppose the new connection is removed. subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); @@ -543,7 +605,11 @@ TEST_F(PublisherTest, TestSubscriberTimeoutComplicated) { std::function failure) { reply_cnt++; }; auto subscriber = std::make_shared( - subscriber_id_, [this]() { return current_time_; }, subscriber_timeout_ms_, 10); + subscriber_id_, + [this]() { return current_time_; }, + subscriber_timeout_ms_, + 10, + kDefaultPublisherId); // Suppose the new connection is removed. subscriber->ConnectToSubscriber(request_, &reply, send_reply_callback); @@ -596,7 +662,7 @@ TEST_F(PublisherTest, TestBasicSingleSubscriber) { publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); publisher_->RegisterSubscription( rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); - publisher_->Publish(GeneratePubMessage(oid)); + publisher_->Publish(GeneratePubMessage(oid, 0)); ASSERT_EQ(batched_ids[0], oid); } @@ -742,14 +808,17 @@ TEST_F(PublisherTest, TestMultiSubscribers) { TEST_F(PublisherTest, TestBatch) { // Test if published objects are batched properly. 
std::vector batched_ids; - send_reply_callback = [this, &batched_ids](Status status, - std::function success, - std::function failure) { + int64_t max_processed_sequence_id = 0; + send_reply_callback = [this, &batched_ids, &max_processed_sequence_id]( + Status status, + std::function success, + std::function failure) { for (int i = 0; i < reply.pub_messages_size(); i++) { const auto &msg = reply.pub_messages(i); const auto oid = ObjectID::FromBinary(msg.worker_object_eviction_message().object_id()); batched_ids.push_back(oid); + max_processed_sequence_id = std::max(max_processed_sequence_id, msg.sequence_id()); } reply = rpc::PubsubLongPollingReply(); }; @@ -766,6 +835,7 @@ TEST_F(PublisherTest, TestBatch) { ASSERT_EQ(batched_ids.size(), 0); // Now connection is initiated, and all oids are published. + request_.set_max_processed_sequence_id(max_processed_sequence_id); publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); for (int i = 0; i < num_oids; i++) { const auto oid_test = oids[i]; @@ -783,7 +853,10 @@ TEST_F(PublisherTest, TestBatch) { rpc::ChannelType::WORKER_OBJECT_EVICTION, subscriber_id_, oid.Binary()); publisher_->Publish(GeneratePubMessage(oid)); } + request_.set_max_processed_sequence_id(max_processed_sequence_id); publisher_->ConnectToSubscriber(request_, &reply, send_reply_callback); + ASSERT_EQ(num_oids, oids.size()); + ASSERT_EQ(num_oids, batched_ids.size()); for (int i = 0; i < num_oids; i++) { const auto oid_test = oids[i]; const auto published_oid = batched_ids[i]; @@ -1032,18 +1105,21 @@ TEST_F(PublisherTest, TestMaxBufferSizePerEntity) { rpc::PubMessage pub_message; pub_message.set_key_id(job_id.Binary()); pub_message.set_channel_type(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); + pub_message.set_sequence_id(GetNextSequenceId()); pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'a')); // Buffer is available. 
- EXPECT_TRUE(subscription_index.Publish(pub_message)); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); // Buffer is still available. pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'b')); - EXPECT_TRUE(subscription_index.Publish(pub_message)); + pub_message.set_sequence_id(GetNextSequenceId()); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); // Buffer is full. pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'c')); - EXPECT_TRUE(subscription_index.Publish(pub_message)); + pub_message.set_sequence_id(GetNextSequenceId()); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); // Subscriber receives the last two messages. 1st message is dropped. auto reply = FlushSubscriber(subscriber); @@ -1055,7 +1131,8 @@ TEST_F(PublisherTest, TestMaxBufferSizePerEntity) { // A message larger than the buffer limit can still be published. pub_message.mutable_error_info_message()->set_error_message(std::string(14000, 'd')); - EXPECT_TRUE(subscription_index.Publish(pub_message)); + pub_message.set_sequence_id(GetNextSequenceId()); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); reply = FlushSubscriber(subscriber); ASSERT_EQ(reply.pub_messages().size(), 1); EXPECT_EQ(reply.pub_messages(0).error_info_message().error_message(), @@ -1074,19 +1151,22 @@ TEST_F(PublisherTest, TestMaxBufferSizeAllEntities) { pub_message.set_key_id("aaa"); pub_message.set_channel_type(rpc::ChannelType::RAY_ERROR_INFO_CHANNEL); pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'a')); + pub_message.set_sequence_id(GetNextSequenceId()); // Buffer is available. - EXPECT_TRUE(subscription_index.Publish(pub_message)); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); // Buffer is still available. 
pub_message.set_key_id("bbb"); pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'b')); - EXPECT_TRUE(subscription_index.Publish(pub_message)); + pub_message.set_sequence_id(GetNextSequenceId()); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); // Buffer is full. pub_message.set_key_id("ccc"); pub_message.mutable_error_info_message()->set_error_message(std::string(4000, 'c')); - EXPECT_TRUE(subscription_index.Publish(pub_message)); + pub_message.set_sequence_id(GetNextSequenceId()); + EXPECT_TRUE(subscription_index.Publish(std::make_shared(pub_message))); auto reply = FlushSubscriber(subscriber); ASSERT_EQ(reply.pub_messages().size(), 2); diff --git a/src/ray/pubsub/test/subscriber_test.cc b/src/ray/pubsub/test/subscriber_test.cc index 0dcddf18f55a..2ed946e06eca 100644 --- a/src/ray/pubsub/test/subscriber_test.cc +++ b/src/ray/pubsub/test/subscriber_test.cc @@ -27,6 +27,8 @@ class MockWorkerClient : public pubsub::SubscriberClientInterface { void PubsubLongPolling( const rpc::PubsubLongPollingRequest &request, const rpc::ClientCallback &callback) override { + max_processed_sequence_id_ = request.max_processed_sequence_id(); + publisher_id_ = request.publisher_id(); long_polling_callbacks.push_back(callback); } @@ -52,20 +54,34 @@ class MockWorkerClient : public pubsub::SubscriberClientInterface { return r; } + void ResetSequenceId(int64_t start_sequence_id) { + sequence_id_ = start_sequence_id - 1; + } + + int64_t GetNextSequenceId() { return ++sequence_id_; } + int64_t GetReportedMaxProcessedSequenceId() { return max_processed_sequence_id_; } + bool ReplyLongPolling(rpc::ChannelType channel_type, std::vector &object_ids, - Status status = Status::OK()) { + std::vector sequence_ids, + Status status = Status::OK(), + std::string publisher_id = "") { if (long_polling_callbacks.empty()) { return false; } auto callback = long_polling_callbacks.front(); auto reply = rpc::PubsubLongPollingReply(); - for (const auto 
&object_id : object_ids) { + for (size_t i = 0; i < object_ids.size(); i++) { + const auto &object_id = object_ids.at(i); auto *new_pub_message = reply.add_pub_messages(); new_pub_message->set_key_id(object_id.Binary()); new_pub_message->set_channel_type(channel_type); + int64_t sequence_id = + sequence_ids.empty() ? GetNextSequenceId() : sequence_ids.at(i); + new_pub_message->set_sequence_id(sequence_id); } + reply.set_publisher_id(publisher_id.empty() ? publisher_id_ : publisher_id); callback(status, reply); long_polling_callbacks.pop_front(); return true; @@ -79,12 +95,14 @@ class MockWorkerClient : public pubsub::SubscriberClientInterface { auto callback = long_polling_callbacks.front(); auto reply = rpc::PubsubLongPollingReply(); + reply.set_publisher_id(publisher_id_); for (const auto &object_id : object_ids) { auto new_pub_message = reply.add_pub_messages(); new_pub_message->set_key_id(object_id.Binary()); new_pub_message->set_channel_type(channel_type); new_pub_message->mutable_failure_message(); + new_pub_message->set_sequence_id(GetNextSequenceId()); } callback(Status::OK(), reply); long_polling_callbacks.pop_front(); @@ -98,6 +116,9 @@ class MockWorkerClient : public pubsub::SubscriberClientInterface { std::deque> long_polling_callbacks; std::deque> command_batch_callbacks; std::queue requests_; + int64_t sequence_id_ = 0; + int64_t max_processed_sequence_id_ = 0; + std::string publisher_id_ = pubsub::PublisherID::FromRandom().Binary(); }; namespace pubsub { @@ -149,9 +170,12 @@ class SubscriberTest : public ::testing::Test { } bool ReplyLongPolling(rpc::ChannelType channel_type, - std::vector &object_ids, - Status status = Status::OK()) { - auto success = owner_client->ReplyLongPolling(channel_type, object_ids, status); + std::vector object_ids, + std::vector sequence_ids = {}, + Status status = Status::OK(), + std::string publiser_id = "") { + auto success = owner_client->ReplyLongPolling( + channel_type, object_ids, sequence_ids, status, publiser_id); 
// Need to call this to invoke callback when the reply comes. // The io service basically executes the queued handler in a blocking manner, and // reset should be called in order to run the poll_one again. @@ -160,6 +184,10 @@ class SubscriberTest : public ::testing::Test { return success; } + void ResetSequenceId(int64_t start_sequence_id = 1) { + owner_client->ResetSequenceId(start_sequence_id); + } + bool FailureMessagePublished(rpc::ChannelType channel_type, std::vector &object_ids) { auto published = owner_client->FailureMessagePublished(channel_type, object_ids); @@ -177,14 +205,14 @@ class SubscriberTest : public ::testing::Test { std::function(const rpc::Address &)> client_pool; std::shared_ptr subscriber_; - std::unordered_set object_subscribed_; + std::unordered_map object_subscribed_; std::unordered_set object_failed_to_subscribe_; rpc::ChannelType channel; }; TEST_F(SubscriberTest, TestBasicSubscription) { auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -205,18 +233,111 @@ TEST_F(SubscriberTest, TestBasicSubscription) { std::vector objects_batched; objects_batched.push_back(object_id); ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); + // Make sure the long polling batch works as expected. + for (const auto &object_id : objects_batched) { + ASSERT_TRUE(object_subscribed_[object_id] == 1); + } + + // Publish the objects again, and subscriber should receive it. 
+ ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); + for (const auto &object_id : objects_batched) { + ASSERT_TRUE(object_subscribed_[object_id] == 2); + } + ASSERT_TRUE(subscriber_->Unsubscribe(channel, owner_addr, object_id.Binary())); ASSERT_TRUE(owner_client->ReplyCommandBatch()); ASSERT_FALSE(subscriber_->IsSubscribed(channel, owner_addr, object_id.Binary())); + // Here, once the long polling request is replied, the metadata is cleaned up. + ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); + ASSERT_TRUE(subscriber_->CheckNoLeaks()); +} + +TEST_F(SubscriberTest, TestIgnoreOutofOrderMessage) { + auto subscription_callback = [this](const rpc::PubMessage &msg) { + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; + }; + auto failure_callback = EMPTY_FAILURE_CALLBACK; + + const auto owner_addr = GenerateOwnerAddress(); + const auto object_id = ObjectID::FromRandom(); + const auto object_id1 = ObjectID::FromRandom(); + subscriber_->SubscribeChannel(std::make_unique(), + channel, + owner_addr, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); + ASSERT_TRUE(owner_client->ReplyCommandBatch()); + + std::vector objects_batched; + objects_batched.push_back(object_id); + objects_batched.push_back(object_id1); // Make sure the long polling batch works as expected. + ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); + ASSERT_EQ(2, owner_client->GetReportedMaxProcessedSequenceId()); + for (const auto &object_id : objects_batched) { - ASSERT_TRUE(object_subscribed_.count(object_id) > 0); + ASSERT_TRUE(object_subscribed_[object_id] == 1); } - // Here, once the long polling request is replied, the metadata is cleaned up. + // By resetting the sequence_id, the message now come out of order, + // and the subscriber should ignore out of order message. 
+ ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, {1, 2})); + ASSERT_EQ(2, owner_client->GetReportedMaxProcessedSequenceId()); + + // Make sure the long polling batch works as expected. + for (const auto &object_id : objects_batched) { + ASSERT_TRUE(object_subscribed_[object_id] == 1); + } + + // message arrives out of order (sequence_id 4 comes before 3), + // we will ignore message with sequence id 3. + ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, {4, 3})); + ASSERT_TRUE(object_subscribed_[object_id] == 2); + ASSERT_TRUE(object_subscribed_[object_id1] == 1); + ASSERT_EQ(4, owner_client->GetReportedMaxProcessedSequenceId()); +} + +TEST_F(SubscriberTest, TestPublisherFailsOver) { + auto subscription_callback = [this](const rpc::PubMessage &msg) { + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; + }; + auto failure_callback = EMPTY_FAILURE_CALLBACK; + + const auto owner_addr = GenerateOwnerAddress(); + const auto object_id = ObjectID::FromRandom(); + const auto object_id1 = ObjectID::FromRandom(); + subscriber_->SubscribeChannel(std::make_unique(), + channel, + owner_addr, + /*subscribe_done_callback=*/nullptr, + subscription_callback, + failure_callback); + ASSERT_TRUE(owner_client->ReplyCommandBatch()); + + std::vector objects_batched; + objects_batched.push_back(object_id); + objects_batched.push_back(object_id1); + // Make sure the long polling batch works as expected. ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); - ASSERT_TRUE(subscriber_->CheckNoLeaks()); + ASSERT_EQ(2, owner_client->GetReportedMaxProcessedSequenceId()); + + for (const auto &object_id : objects_batched) { + ASSERT_TRUE(object_subscribed_[object_id] == 1); + } + + // By resetting the sequence_id, the message now come out of order, + // and the subscriber should ignore out of order message. 
+ ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, {1, 2})); + ASSERT_EQ(2, owner_client->GetReportedMaxProcessedSequenceId()); + + auto new_publisher_id = NodeID::FromRandom().Binary(); + // if the publisher_id changes, we should reset both publisher_id and sequence_id. + ASSERT_TRUE(ReplyLongPolling( + channel, std::vector({object_id}), {1}, Status::OK(), new_publisher_id)); + ASSERT_EQ(1, owner_client->GetReportedMaxProcessedSequenceId()); + ASSERT_EQ(new_publisher_id, owner_client->publisher_id_); } TEST_F(SubscriberTest, TestSingleLongPollingWithMultipleSubscriptions) { @@ -225,7 +346,7 @@ TEST_F(SubscriberTest, TestSingleLongPollingWithMultipleSubscriptions) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -253,8 +374,8 @@ TEST_F(SubscriberTest, TestSingleLongPollingWithMultipleSubscriptions) { // Make sure the long polling batch works as expected. 
for (const auto &object_id : objects_batched) { - // RAY_LOG(ERROR) << "haha " << object_subscribed_.count(object_id); - ASSERT_TRUE(object_subscribed_.count(object_id) > 0); + // RAY_LOG(ERROR) << "haha " << object_subscribed_[object_id]; + ASSERT_TRUE(object_subscribed_[object_id] > 0); } } @@ -264,7 +385,7 @@ TEST_F(SubscriberTest, TestMultiLongPollingWithTheSameSubscription) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -285,7 +406,7 @@ TEST_F(SubscriberTest, TestMultiLongPollingWithTheSameSubscription) { std::vector objects_batched; objects_batched.push_back(object_id); ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); - ASSERT_TRUE(object_subscribed_.count(object_id) > 0); + ASSERT_TRUE(object_subscribed_[object_id] > 0); objects_batched.clear(); object_subscribed_.clear(); @@ -293,7 +414,7 @@ TEST_F(SubscriberTest, TestMultiLongPollingWithTheSameSubscription) { ASSERT_EQ(owner_client->GetNumberOfInFlightLongPollingRequests(), 1); objects_batched.push_back(object_id); ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); - ASSERT_TRUE(object_subscribed_.count(object_id) > 0); + ASSERT_TRUE(object_subscribed_[object_id] > 0); } TEST_F(SubscriberTest, TestCallbackNotInvokedForNonSubscribedObject) { @@ -302,7 +423,7 @@ TEST_F(SubscriberTest, TestCallbackNotInvokedForNonSubscribedObject) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -322,7 +443,7 @@ TEST_F(SubscriberTest, TestCallbackNotInvokedForNonSubscribedObject) { std::vector objects_batched; objects_batched.push_back(object_id_not_subscribed); ASSERT_TRUE(ReplyLongPolling(channel, 
objects_batched)); - ASSERT_EQ(object_subscribed_.count(object_id), 0); + ASSERT_EQ(object_subscribed_[object_id], 0); } TEST_F(SubscriberTest, TestSubscribeChannelEntities) { @@ -331,7 +452,7 @@ TEST_F(SubscriberTest, TestSubscribeChannelEntities) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -352,7 +473,7 @@ TEST_F(SubscriberTest, TestSubscribeChannelEntities) { } ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); for (int i = 0; i < 5; ++i) { - ASSERT_EQ(object_subscribed_.count(objects_batched[i]), 1); + ASSERT_EQ(object_subscribed_[objects_batched[i]], 1); } objects_batched.clear(); object_subscribed_.clear(); @@ -366,7 +487,7 @@ TEST_F(SubscriberTest, TestSubscribeChannelEntities) { } ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); for (int i = 0; i < 10; ++i) { - ASSERT_EQ(object_subscribed_.count(objects_batched[i]), 1); + ASSERT_EQ(object_subscribed_[objects_batched[i]], 1); } // Unsubscribe from the channel. @@ -379,7 +500,7 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscription) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -400,7 +521,7 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscription) { ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); // Make sure the batched object won't invoke the callback since it is already // unsubscribed before long polling is replied. - ASSERT_EQ(object_subscribed_.count(object_id), 0); + ASSERT_EQ(object_subscribed_[object_id], 0); // Make sure the long polling is not invoked since there's no more subscribed object to // this owner. 
ASSERT_EQ(owner_client->GetNumberOfInFlightLongPollingRequests(), 0); @@ -413,7 +534,7 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscribeFromAll) { /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; @@ -434,7 +555,7 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscribeFromAll) { ASSERT_TRUE(ReplyLongPolling(channel, objects_batched)); // Make sure the returned object won't invoke the callback since the channel is already // unsubscribed before long polling is replied. - ASSERT_EQ(object_subscribed_.count(object_id), 0); + ASSERT_EQ(object_subscribed_[object_id], 0); // After the previous reply, no new long polling is invoked since the channel has been // unsubscribed. ASSERT_EQ(owner_client->GetNumberOfInFlightLongPollingRequests(), 0); @@ -443,7 +564,7 @@ TEST_F(SubscriberTest, TestIgnoreBatchAfterUnsubscribeFromAll) { TEST_F(SubscriberTest, TestLongPollingFailure) { auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; const auto owner_addr = GenerateOwnerAddress(); @@ -462,9 +583,9 @@ TEST_F(SubscriberTest, TestLongPollingFailure) { // Long polling failed. std::vector objects_batched; - ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, Status::NotFound(""))); + ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, {}, Status::NotFound(""))); // Callback is not invoked. - ASSERT_EQ(object_subscribed_.count(object_id), 0); + ASSERT_EQ(object_subscribed_[object_id], 0); // Failure callback is invoked. 
ASSERT_EQ(object_failed_to_subscribe_.count(object_id), 1); // Since the long polling is failed due to the publisher failure, we shouldn't have any @@ -480,7 +601,7 @@ TEST_F(SubscriberTest, TestUnsubscribeInSubscriptionCallback) { const auto object_id = ObjectID::FromBinary(msg.key_id()); subscriber_->Unsubscribe(channel, owner_addr, object_id.Binary()); ASSERT_TRUE(owner_client->ReplyCommandBatch()); - object_subscribed_.emplace(object_id); + object_subscribed_[object_id]++; }; auto failure_callback = [](const std::string &key_id, const Status &) { // This shouldn't be invoked in this test. @@ -761,7 +882,7 @@ TEST_F(SubscriberTest, TestCommandsCleanedUponPublishFailure) { std::vector objects_batched; // The publisher failed. In this case, the queue should be cleaned up. - ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, Status::Invalid(""))); + ASSERT_TRUE(ReplyLongPolling(channel, objects_batched, {}, Status::Invalid(""))); // The reply from the first batch. ASSERT_TRUE(owner_client->ReplyCommandBatch()); // We shouldn't have the second batch request because the publisher is already dead and @@ -778,7 +899,7 @@ TEST_F(SubscriberTest, TestFailureMessagePublished) { /// is properly called in this scenario. /// auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; const auto owner_addr = GenerateOwnerAddress(); @@ -809,7 +930,7 @@ TEST_F(SubscriberTest, TestFailureMessagePublished) { objects_batched.push_back(object_id); ASSERT_TRUE(FailureMessagePublished(channel, objects_batched)); // Callback is not invoked. - ASSERT_EQ(object_subscribed_.count(object_id), 0); + ASSERT_EQ(object_subscribed_[object_id], 0); // Failure callback is invoked. ASSERT_EQ(object_failed_to_subscribe_.count(object_id), 1); // Since object2 is still subscribed, we should have the long polling requests. 
@@ -819,14 +940,14 @@ TEST_F(SubscriberTest, TestFailureMessagePublished) { objects_batched.clear(); objects_batched.push_back(object_id2); ASSERT_TRUE(FailureMessagePublished(channel, objects_batched)); - ASSERT_EQ(object_subscribed_.count(object_id2), 0); + ASSERT_EQ(object_subscribed_[object_id2], 0); ASSERT_EQ(object_failed_to_subscribe_.count(object_id2), 1); ASSERT_EQ(owner_client->GetNumberOfInFlightLongPollingRequests(), 0); } TEST_F(SubscriberTest, TestIsSubscribed) { auto subscription_callback = [this](const rpc::PubMessage &msg) { - object_subscribed_.emplace(ObjectID::FromBinary(msg.key_id())); + object_subscribed_[ObjectID::FromBinary(msg.key_id())]++; }; auto failure_callback = EMPTY_FAILURE_CALLBACK; const auto owner_addr = GenerateOwnerAddress(); diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index e745adff3083..6747e5c93564 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -593,7 +593,7 @@ void LocalObjectManager::DeleteSpilledObjects(std::vector urls_to_d }); } -void LocalObjectManager::FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) const { +void LocalObjectManager::FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const { auto stats = reply->mutable_store_stats(); stats->set_spill_time_total_s(spill_time_total_s_); stats->set_spilled_bytes_total(spilled_bytes_total_); @@ -602,6 +602,7 @@ void LocalObjectManager::FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) stats->set_restored_bytes_total(restored_bytes_total_); stats->set_restored_objects_total(restored_objects_total_); stats->set_object_store_bytes_primary_copy(pinned_objects_size_); + stats->set_num_object_store_primary_copies(local_objects_.size()); } void LocalObjectManager::RecordMetrics() const { diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index 72cb4db5d400..116776bd1d16 100644 --- a/src/ray/raylet/local_object_manager.h +++ 
b/src/ray/raylet/local_object_manager.h @@ -147,10 +147,10 @@ class LocalObjectManager { /// \return True if spilling is still in progress. False otherwise. bool IsSpillingInProgress(); - /// Populate object spilling stats. + /// Populate object store stats. /// - /// \param Output parameter. - void FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) const; + /// \param reply Output parameter. + void FillObjectStoreStats(rpc::GetNodeStatsReply *reply) const; /// Record object spilling stats to metrics. void RecordMetrics() const; diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index ca0b4e015cd7..7865f35f4eb4 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -240,7 +240,8 @@ NodeManager::NodeManager(instrumented_io_context &io_service, }, /*delay_executor*/ [this](std::function fn, int64_t delay_ms) { - RAY_UNUSED(execute_after(io_service_, fn, delay_ms)); + RAY_UNUSED(execute_after( + io_service_, fn, std::chrono::milliseconds(delay_ms))); }), node_manager_server_("NodeManager", config.node_manager_port, @@ -293,6 +294,7 @@ NodeManager::NodeManager(instrumented_io_context &io_service, CreateMemoryUsageRefreshCallback())) { RAY_LOG(INFO) << "Initializing NodeManager with ID " << self_node_id_; cluster_resource_scheduler_ = std::make_shared( + io_service, scheduling::NodeID(self_node_id_.Binary()), config.resource_config.ToResourceMap(), /*is_node_available_fn*/ @@ -373,23 +375,6 @@ NodeManager::NodeManager(instrumented_io_context &io_service, node_manager_server_.RegisterService(node_manager_service_); node_manager_server_.RegisterService(agent_manager_service_); if (RayConfig::instance().use_ray_syncer()) { - periodical_runner_.RunFnPeriodically( - [this]() { - auto now = absl::Now(); - auto threshold = - now - absl::Milliseconds( - RayConfig::instance().ray_syncer_message_refresh_interval_ms()); - auto &resource_manager = - cluster_resource_scheduler_->GetClusterResourceManager(); - for (auto 
&[node_id, resource] : resource_message_udpated_) { - auto modified_ts = resource_manager.GetNodeResourceModifiedTs( - scheduling::NodeID(node_id.Binary())); - if (modified_ts && *modified_ts < threshold) { - UpdateResourceUsage(node_id, resource); - } - } - }, - RayConfig::instance().ray_syncer_message_refresh_interval_ms()); node_manager_server_.RegisterService(ray_syncer_service_); } node_manager_server_.Run(); @@ -414,7 +399,7 @@ NodeManager::NodeManager(instrumented_io_context &io_service, std::move(options), /*delay_executor=*/ [this](std::function task, uint32_t delay_ms) { - return execute_after(io_service_, task, delay_ms); + return execute_after(io_service_, task, std::chrono::milliseconds(delay_ms)); }, /*runtime_env_agent_factory=*/ [this](const std::string &ip_address, int port) { @@ -441,10 +426,44 @@ ray::Status NodeManager::RegisterGcs() { // If the node resource message is received first and then the node message is received, // ForwardTask will throw exception, because it can't get node info. - auto on_done = [](Status status) { RAY_CHECK_OK(status); }; + auto on_node_change_subscribe_done = [this](Status status) { + RAY_CHECK_OK(status); + + if (RayConfig::instance().use_ray_syncer()) { + // Register resource manager and scheduler + ray_syncer_.Register( + /* message_type */ syncer::MessageType::RESOURCE_VIEW, + /* reporter */ &cluster_resource_scheduler_->GetLocalResourceManager(), + /* receiver */ this, + /* pull_from_reporter_interval_ms */ + RayConfig::instance().raylet_report_resources_period_milliseconds()); + + // Register a commands channel. + // It's only used for GC right now. 
+ ray_syncer_.Register( + /* message_type */ syncer::MessageType::COMMANDS, + /* reporter */ this, + /* receiver */ this, + /* pull_from_reporter_interval_ms */ 0); + + auto gcs_channel = gcs_client_->GetGcsRpcClient().GetChannel(); + ray_syncer_.Connect(kGCSNodeID.Binary(), gcs_channel); + periodical_runner_.RunFnPeriodically( + [this] { + auto triggered_by_global_gc = TryLocalGC(); + // If plasma store is under high pressure, we should try to schedule a global + // gc. + if (triggered_by_global_gc) { + ray_syncer_.OnDemandBroadcasting(syncer::MessageType::COMMANDS); + } + }, + RayConfig::instance().raylet_check_gc_period_milliseconds(), + "NodeManager.CheckGC"); + } + }; // Register a callback to monitor new nodes and a callback to monitor removed nodes. - RAY_RETURN_NOT_OK( - gcs_client_->Nodes().AsyncSubscribeToNodeChange(on_node_change, on_done)); + RAY_RETURN_NOT_OK(gcs_client_->Nodes().AsyncSubscribeToNodeChange( + on_node_change, on_node_change_subscribe_done)); // Subscribe to all unexpected failure notifications from the local and // remote raylets. Note that this does not include workers that failed due to @@ -509,38 +528,6 @@ ray::Status NodeManager::RegisterGcs() { event_stats_print_interval_ms, "NodeManager.deadline_timer.print_event_loop_stats"); } - - if (RayConfig::instance().use_ray_syncer()) { - // Register resource manager and scheduler - ray_syncer_.Register( - /* message_type */ syncer::MessageType::RESOURCE_VIEW, - /* reporter */ &cluster_resource_scheduler_->GetLocalResourceManager(), - /* receiver */ this, - /* pull_from_reporter_interval_ms */ - RayConfig::instance().raylet_report_resources_period_milliseconds()); - - // Register a commands channel. - // It's only used for GC right now. 
- ray_syncer_.Register( - /* message_type */ syncer::MessageType::COMMANDS, - /* reporter */ this, - /* receiver */ this, - /* pull_from_reporter_interval_ms */ 0); - - auto gcs_channel = gcs_client_->GetGcsRpcClient().GetChannel(); - ray_syncer_.Connect(kGCSNodeID.Binary(), gcs_channel); - periodical_runner_.RunFnPeriodically( - [this] { - auto triggered_by_global_gc = TryLocalGC(); - // If plasma store is under high pressure, we should try to schedule a global - // gc. - if (triggered_by_global_gc) { - ray_syncer_.OnDemandBroadcasting(syncer::MessageType::COMMANDS); - } - }, - RayConfig::instance().raylet_check_gc_period_milliseconds(), - "NodeManager.CheckGC"); - } // Raylet periodically check whether it's alive in GCS. // For failure cases, GCS might think this raylet dead, but this // raylet still think it's alive. This could happen when the cluster setup is wrong, @@ -564,8 +551,8 @@ ray::Status NodeManager::RegisterGcs() { << "GCS is not backed by a DB and restarted or there is data loss " << "in the DB."; } - *checking_ptr = false; } + *checking_ptr = false; }, /* timeout_ms = */ 30000)); }, @@ -1007,6 +994,7 @@ void NodeManager::NodeAdded(const GcsNodeInfo &node_info) { [this, node_id]( Status status, const boost::optional &data) { + // TODO: Always use the message from ray syncer. if (data) { ResourceRequest resources; for (auto &resource_entry : *data) { @@ -1017,6 +1005,15 @@ void NodeManager::NodeAdded(const GcsNodeInfo &node_info) { cluster_task_manager_->ScheduleAndDispatchTasks(); } } + // Update the resource view if a new message has been sent. 
+ if (RayConfig::instance().use_ray_syncer()) { + if (auto sync_msg = ray_syncer_.GetSyncMessage( + node_id.Binary(), syncer::MessageType::RESOURCE_VIEW)) { + if (sync_msg) { + ConsumeSyncMessage(sync_msg); + } + } + } })); } @@ -1048,10 +1045,6 @@ void NodeManager::NodeRemoved(const NodeID &node_id) { // Below, when we remove node_id from all of these data structures, we could // check that it is actually removed, or log a warning otherwise, but that may // not be necessary. - - // Remove the messages received - resource_message_udpated_.erase(node_id); - // Remove the node from the resource map. if (!cluster_resource_scheduler_->GetClusterResourceManager().RemoveNode( scheduling::NodeID(node_id.Binary()))) { @@ -2028,7 +2021,7 @@ void NodeManager::HandleShutdownRaylet(rpc::ShutdownRayletRequest request, return; } auto shutdown_after_reply = []() { - rpc::DrainAndResetServerCallExecutor(); + rpc::DrainServerCallExecutor(); // Note that the callback is posted to the io service after the shutdown GRPC request // is replied. Otherwise, the RPC might not be replied to GCS before it shutsdown // itself. Implementation note: When raylet is shutdown by ray stop, the CLI sends a @@ -2510,7 +2503,7 @@ void NodeManager::HandleGetNodeStats(rpc::GetNodeStatsRequest node_stats_request rpc::GetNodeStatsReply *reply, rpc::SendReplyCallback send_reply_callback) { // Report object spilling stats. - local_object_manager_.FillObjectSpillingStats(reply); + local_object_manager_.FillObjectStoreStats(reply); // Report object store stats. object_manager_.FillObjectStoreStats(reply); // As a result of the HandleGetNodeStats, we are collecting information from all @@ -2790,7 +2783,6 @@ void NodeManager::ConsumeSyncMessage( } // Message view shouldn't carry this field. 
RAY_CHECK(!data.should_global_gc()); - resource_message_udpated_[node_id] = std::move(data); } else if (message->message_type() == syncer::MessageType::COMMANDS) { rpc::ResourcesData data; data.ParseFromString(message->sync_message()); diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 2982da44aff3..ec6980d6eba5 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -827,9 +827,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// Ray syncer for synchronization syncer::RaySyncer ray_syncer_; - /// Resource message updated - absl::flat_hash_map resource_message_udpated_; - /// RaySyncerService for gRPC syncer::RaySyncerService ray_syncer_service_; diff --git a/src/ray/raylet/placement_group_resource_manager_test.cc b/src/ray/raylet/placement_group_resource_manager_test.cc index b458555ff2bd..fe8be3c660d0 100644 --- a/src/ray/raylet/placement_group_resource_manager_test.cc +++ b/src/ray/raylet/placement_group_resource_manager_test.cc @@ -46,7 +46,7 @@ class NewPlacementGroupResourceManagerTest : public ::testing::Test { void InitLocalAvailableResource( absl::flat_hash_map &unit_resource) { cluster_resource_scheduler_ = std::make_shared( - scheduling::NodeID("local"), unit_resource, is_node_available_fn_); + io_context, scheduling::NodeID("local"), unit_resource, is_node_available_fn_); new_placement_group_resource_manager_ = std::make_unique( cluster_resource_scheduler_); @@ -73,6 +73,7 @@ class NewPlacementGroupResourceManagerTest : public ::testing::Test { std::make_shared(std::move(bundle_spec))); return bundle_specs; } + instrumented_io_context io_context; }; TEST_F(NewPlacementGroupResourceManagerTest, @@ -186,8 +187,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewCommitBundleResource) { {"CPU", 1.0}, {"bundle_group_1_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 1000}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), 
remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); ASSERT_TRUE( @@ -216,7 +220,7 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewReturnBundleResource) { new_placement_group_resource_manager_->ReturnBundle(bundle_spec); /// 5. check remaining resources is correct. auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), unit_resource, is_node_available_fn_); + io_context, scheduling::NodeID("remaining"), unit_resource, is_node_available_fn_); auto remaining_resource_instance = remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources( scheduling::NodeID("remaining")); @@ -252,8 +256,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu {"bundle_group_1_" + group_id.Hex(), 1000}, {"bundle_group_2_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 2000}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); ASSERT_TRUE( @@ -272,8 +279,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu {"CPU", 2.0}, {"bundle_group_1_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 2000}}; - remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); ASSERT_TRUE( 
remaining_resource_scheduler->GetLocalResourceManager().AllocateLocalTaskResources( {{"CPU_group_" + group_id.Hex(), 1.0}, @@ -288,8 +298,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewMultipleBundlesCommitAndRetu new_placement_group_resource_manager_->ReturnBundle(first_bundle_spec); /// 8. check remaining resources is correct after all bundle returned. remaining_resources = {{"CPU", 2.0}}; - remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); remaining_resource_instance = remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources( scheduling::NodeID("remaining")); @@ -312,8 +325,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithMultiPrepare) } /// 4. check remaining resources is correct. absl::flat_hash_map remaining_resources = {{"CPU", 3.0}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); ASSERT_TRUE( @@ -349,8 +365,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder) {"CPU", 3.0}, {"bundle_group_1_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 1000}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); ASSERT_TRUE( @@ -378,8 +397,11 @@ 
TEST_F(NewPlacementGroupResourceManagerTest, TestNewIdempotencyWithRandomOrder) new_placement_group_resource_manager_->CommitBundles( ConvertSingleSpecToVectorPtrs(bundle_spec)); // 8. check remaining resources is correct. - remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), available_resource, is_node_available_fn_); + remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + available_resource, + is_node_available_fn_); remaining_resource_instance = remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources( scheduling::NodeID("remaining")); @@ -402,8 +424,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) { ASSERT_FALSE(new_placement_group_resource_manager_->PrepareBundles(bundle_specs)); // 4. check remaining resources is correct. absl::flat_hash_map remaining_resources = {{"CPU", 3.0}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); auto remaining_resource_instance = remaining_resource_scheduler->GetClusterResourceManager().GetNodeResources( scheduling::NodeID("remaining")); @@ -428,8 +453,11 @@ TEST_F(NewPlacementGroupResourceManagerTest, TestPreparedResourceBatched) { {"bundle_group_3_" + group_id.Hex(), 1000}, {"bundle_group_4_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 4000}}; - remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); absl::flat_hash_map allocating_resource; @@ -474,8 +502,11 @@ 
TEST_F(NewPlacementGroupResourceManagerTest, TestCommiteResourceBatched) { {"bundle_group_3_" + group_id.Hex(), 1000}, {"bundle_group_4_" + group_id.Hex(), 1000}, {"bundle_group_" + group_id.Hex(), 4000}}; - auto remaining_resource_scheduler = std::make_shared( - scheduling::NodeID("remaining"), remaining_resources, is_node_available_fn_); + auto remaining_resource_scheduler = + std::make_shared(io_context, + scheduling::NodeID("remaining"), + remaining_resources, + is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); absl::flat_hash_map allocating_resource; diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.cc b/src/ray/raylet/scheduling/cluster_resource_manager.cc index 95677a20d994..30b7b63e546e 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.cc +++ b/src/ray/raylet/scheduling/cluster_resource_manager.cc @@ -22,7 +22,23 @@ namespace ray { -ClusterResourceManager::ClusterResourceManager() : nodes_{} {} +ClusterResourceManager::ClusterResourceManager(instrumented_io_context &io_service) + : timer_(io_service) { + if (RayConfig::instance().use_ray_syncer()) { + timer_.RunFnPeriodically( + [this]() { + auto syncer_delay = absl::Milliseconds( + RayConfig::instance().ray_syncer_message_refresh_interval_ms()); + for (auto &[node_id, resource] : received_node_resources_) { + auto modified_ts = GetNodeResourceModifiedTs(node_id); + if (modified_ts && *modified_ts + syncer_delay < absl::Now()) { + AddOrUpdateNode(node_id, resource); + } + } + }, + RayConfig::instance().ray_syncer_message_refresh_interval_ms()); + } +} std::optional ClusterResourceManager::GetNodeResourceModifiedTs( scheduling::NodeID node_id) const { @@ -76,18 +92,13 @@ bool ClusterResourceManager::UpdateNode(scheduling::NodeID node_id, } AddOrUpdateNode(node_id, local_view); + received_node_resources_[node_id] = std::move(local_view); return true; } bool ClusterResourceManager::RemoveNode(scheduling::NodeID node_id) { - auto it = 
nodes_.find(node_id); - if (it == nodes_.end()) { - // Node not found. - return false; - } else { - nodes_.erase(it); - return true; - } + received_node_resources_.erase(node_id); + return nodes_.erase(node_id) != 0; } bool ClusterResourceManager::GetNodeResources(scheduling::NodeID node_id, diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.h b/src/ray/raylet/scheduling/cluster_resource_manager.h index b64b5d93fdc7..9a02459ec23e 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.h +++ b/src/ray/raylet/scheduling/cluster_resource_manager.h @@ -46,7 +46,7 @@ class GcsActorSchedulerTest; /// This class is not thread safe. class ClusterResourceManager { public: - explicit ClusterResourceManager(); + explicit ClusterResourceManager(instrumented_io_context &io_service); /// Get the resource view of the cluster. const absl::flat_hash_map &GetResourceView() const; @@ -57,13 +57,6 @@ class ClusterResourceManager { /// \param resource_data The node resource data. bool UpdateNode(scheduling::NodeID node_id, const rpc::ResourcesData &resource_data); - /// Return the timestamp when the resource of the node got updated by scheduler. - /// - /// \param node_id ID of the node to query - /// \return The timestamp when the node resource got updated. If it's null, it means - /// there is no such node or the resource of the node never got updated. - std::optional GetNodeResourceModifiedTs(scheduling::NodeID node_id) const; - /// Remove node from the cluster data structure. This happens /// when a node fails or it is removed from the cluster. /// @@ -139,6 +132,13 @@ class ClusterResourceManager { friend class ClusterResourceScheduler; friend class gcs::GcsActorSchedulerTest; + /// Return the timestamp when the resource of the node got updated by scheduler. + /// + /// \param node_id ID of the node to query + /// \return The timestamp when the node resource got updated. 
If it's null, it means + /// there is no such node or the resource of the node never got updated. + std::optional GetNodeResourceModifiedTs(scheduling::NodeID node_id) const; + /// Add a new node or overwrite the resources of an existing node. /// /// \param node_id: Node ID. @@ -158,8 +158,14 @@ class ClusterResourceManager { /// The key of the map is the node ID. absl::flat_hash_map nodes_; + /// Resource message updated + absl::flat_hash_map received_node_resources_; + BundleLocationIndex bundle_location_index_; + /// Timer to revert local changes to the resources periodically. + ray::PeriodicalRunner timer_; + friend class ClusterResourceSchedulerTest; friend struct ClusterResourceManagerTest; friend class raylet::ClusterTaskManagerTest; diff --git a/src/ray/raylet/scheduling/cluster_resource_manager_test.cc b/src/ray/raylet/scheduling/cluster_resource_manager_test.cc index 73d4c8e994cc..f7a0a87b71f4 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_resource_manager_test.cc @@ -35,7 +35,8 @@ NodeResources CreateNodeResources(double available_cpu, struct ClusterResourceManagerTest : public ::testing::Test { void SetUp() { ::testing::Test::SetUp(); - manager = std::make_unique(); + static instrumented_io_context io_context; + manager = std::make_unique(io_context); manager->AddOrUpdateNode(node0, CreateNodeResources(/*available_cpu*/ 1, /*total_cpu*/ 1)); manager->AddOrUpdateNode(node1, diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc index 2ad785a3cf9b..85505a82da2d 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.cc @@ -24,6 +24,7 @@ namespace ray { using namespace ::ray::raylet_scheduling_policy; ClusterResourceScheduler::ClusterResourceScheduler( + instrumented_io_context &io_service, scheduling::NodeID local_node_id, const NodeResources 
&local_node_resources, std::function is_node_available_fn, @@ -31,12 +32,14 @@ ClusterResourceScheduler::ClusterResourceScheduler( : local_node_id_(local_node_id), is_node_available_fn_(is_node_available_fn), is_local_node_with_raylet_(is_local_node_with_raylet) { - Init(local_node_resources, + Init(io_service, + local_node_resources, /*get_used_object_store_memory=*/nullptr, /*get_pull_manager_at_capacity=*/nullptr); } ClusterResourceScheduler::ClusterResourceScheduler( + instrumented_io_context &io_service, scheduling::NodeID local_node_id, const absl::flat_hash_map &local_node_resources, std::function is_node_available_fn, @@ -45,14 +48,18 @@ ClusterResourceScheduler::ClusterResourceScheduler( : local_node_id_(local_node_id), is_node_available_fn_(is_node_available_fn) { NodeResources node_resources = ResourceMapToNodeResources(local_node_resources, local_node_resources); - Init(node_resources, get_used_object_store_memory, get_pull_manager_at_capacity); + Init(io_service, + node_resources, + get_used_object_store_memory, + get_pull_manager_at_capacity); } void ClusterResourceScheduler::Init( + instrumented_io_context &io_service, const NodeResources &local_node_resources, std::function get_used_object_store_memory, std::function get_pull_manager_at_capacity) { - cluster_resource_manager_ = std::make_unique(); + cluster_resource_manager_ = std::make_unique(io_service); local_resource_manager_ = std::make_unique( local_node_id_, local_node_resources, diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler.h b/src/ray/raylet/scheduling/cluster_resource_scheduler.h index 5e48369b9172..2722989cb083 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler.h +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler.h @@ -50,12 +50,14 @@ class ClusterResourceScheduler { /// with the local node. /// \param is_node_available_fn: Function to determine whether a node is available. 
/// \param is_local_node_with_raylet: Whether there is a raylet on the local node. - ClusterResourceScheduler(scheduling::NodeID local_node_id, + ClusterResourceScheduler(instrumented_io_context &io_service, + scheduling::NodeID local_node_id, const NodeResources &local_node_resources, std::function is_node_available_fn, bool is_local_node_with_raylet = true); ClusterResourceScheduler( + instrumented_io_context &io_service, scheduling::NodeID local_node_id, const absl::flat_hash_map &local_node_resources, std::function is_node_available_fn, @@ -127,7 +129,8 @@ class ClusterResourceScheduler { bool IsLocalNodeWithRaylet() { return is_local_node_with_raylet_; } private: - void Init(const NodeResources &local_node_resources, + void Init(instrumented_io_context &io_service, + const NodeResources &local_node_resources, std::function get_used_object_store_memory, std::function get_pull_manager_at_capacity); diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc index 6256b941c282..ff70d836a19f 100644 --- a/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler_2_test.cc @@ -28,6 +28,7 @@ class GcsResourceSchedulerTest : public ::testing::Test { public: void SetUp() override { cluster_resource_scheduler_ = std::make_shared( + io_context_, scheduling::NodeID(NodeID::FromRandom().Binary()), NodeResources(), /*is_node_available_fn=*/ @@ -177,7 +178,7 @@ class GcsResourceSchedulerTest : public ::testing::Test { ASSERT_TRUE(result.status.IsSuccess()); ASSERT_EQ(result.selected_nodes.size(), resources_list.size()); } - + instrumented_io_context io_context_; std::shared_ptr cluster_resource_scheduler_; }; diff --git a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc b/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc index f1e2c49dd54f..b77fcbf1ae12 100644 --- 
a/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc +++ b/src/ray/raylet/scheduling/cluster_resource_scheduler_test.cc @@ -251,8 +251,11 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingIdInsertOrDieTest) { TEST_F(ClusterResourceSchedulerTest, SchedulingInitClusterTest) { int num_nodes = 10; + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { return true; }); + io_context, scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { + return true; + }); AssertPredefinedNodeResources(); initCluster(resource_scheduler, num_nodes); @@ -263,9 +266,11 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingInitClusterTest) { TEST_F(ClusterResourceSchedulerTest, SchedulingDeleteClusterNodeTest) { int num_nodes = 4; int64_t remove_id = 2; - + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { return true; }); + io_context, scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { + return true; + }); initCluster(resource_scheduler, num_nodes); resource_scheduler.GetClusterResourceManager().RemoveNode( @@ -277,8 +282,11 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingDeleteClusterNodeTest) { TEST_F(ClusterResourceSchedulerTest, SchedulingModifyClusterNodeTest) { int num_nodes = 4; int64_t update_id = 2; + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { return true; }); + io_context, scheduling::NodeID(num_nodes + 1), NodeResources(), [](auto) { + return true; + }); initCluster(resource_scheduler, num_nodes); @@ -291,8 +299,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingModifyClusterNodeTest) { TEST_F(ClusterResourceSchedulerTest, NodeAffinitySchedulingStrategyTest) { absl::flat_hash_map resource_total({{"CPU", 10}}); auto local_node_id = 
scheduling::NodeID(NodeID::FromRandom().Binary()); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - local_node_id, resource_total, is_node_available_fn_); + io_context, local_node_id, resource_total, is_node_available_fn_); AssertPredefinedNodeResources(); auto remote_node_id = scheduling::NodeID(NodeID::FromRandom().Binary()); resource_scheduler.GetClusterResourceManager().AddOrUpdateNode( @@ -358,8 +367,9 @@ TEST_F(ClusterResourceSchedulerTest, NodeAffinitySchedulingStrategyTest) { TEST_F(ClusterResourceSchedulerTest, SpreadSchedulingStrategyTest) { absl::flat_hash_map resource_total({{"CPU", 10}}); auto local_node_id = scheduling::NodeID(NodeID::FromRandom().Binary()); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - local_node_id, resource_total, is_node_available_fn_); + io_context, local_node_id, resource_total, is_node_available_fn_); AssertPredefinedNodeResources(); auto remote_node_id = scheduling::NodeID(NodeID::FromRandom().Binary()); resource_scheduler.GetClusterResourceManager().AddOrUpdateNode( @@ -396,8 +406,9 @@ TEST_F(ClusterResourceSchedulerTest, SpreadSchedulingStrategyTest) { TEST_F(ClusterResourceSchedulerTest, SchedulingWithPreferredNodeTest) { absl::flat_hash_map resource_total({{"CPU", 10}}); auto local_node_id = scheduling::NodeID(NodeID::FromRandom().Binary()); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - local_node_id, resource_total, is_node_available_fn_); + io_context, local_node_id, resource_total, is_node_available_fn_); AssertPredefinedNodeResources(); auto remote_node_id = scheduling::NodeID(NodeID::FromRandom().Binary()); resource_scheduler.GetClusterResourceManager().AddOrUpdateNode( @@ -439,8 +450,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateAvailableResourcesTest) { {ResourceID::GPU(), 3}, {ResourceID("custom1"), 5}, {ResourceID("custom2"), 5}}); + instrumented_io_context io_context; 
ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(1), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(1), node_resources, is_node_available_fn_); AssertPredefinedNodeResources(); { @@ -486,7 +498,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateTotalResourcesTest) { absl::flat_hash_map initial_resources = { {ray::kCPU_ResourceLabel, 1}, {"custom1", 1}}; std::string name = NodeID::FromRandom().Binary(); - ClusterResourceScheduler resource_scheduler(scheduling::NodeID(name), + instrumented_io_context io_context; + ClusterResourceScheduler resource_scheduler(io_context, + scheduling::NodeID(name), initial_resources, is_node_available_fn_, nullptr, @@ -511,8 +525,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingUpdateTotalResourcesTest) { } TEST_F(ClusterResourceSchedulerTest, SchedulingAddOrUpdateNodeTest) { + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), NodeResources(), [](auto) { return true; }); + io_context, scheduling::NodeID(0), NodeResources(), [](auto) { return true; }); NodeResources nr, nr_out; int64_t node_id = 1; @@ -558,8 +573,9 @@ TEST_F(ClusterResourceSchedulerTest, SchedulingResourceRequestTest) { // Create cluster resources containing local node. 
NodeResources node_resources = CreateNodeResources( {{ResourceID::CPU(), 5}, {ResourceID::Memory(), 5}, {ResourceID("custom1"), 10}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); auto node_id = NodeID::FromRandom(); rpc::SchedulingStrategy scheduling_strategy; scheduling_strategy.mutable_default_scheduling_strategy(); @@ -675,8 +691,9 @@ TEST_F(ClusterResourceSchedulerTest, GetLocalAvailableResourcesWithCpuUnitTest) {ResourceID::Memory(), 4}, {ResourceID::GPU(), 5}, {ResourceID("custom1"), 8}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); TaskResourceInstances available_cluster_resources = resource_scheduler.GetLocalResourceManager() @@ -702,8 +719,9 @@ TEST_F(ClusterResourceSchedulerTest, GetLocalAvailableResourcesTest) { {ResourceID::Memory(), 4}, {ResourceID::GPU(), 5}, {ResourceID("custom1"), 8}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); TaskResourceInstances available_cluster_resources = resource_scheduler.GetLocalResourceManager() @@ -736,8 +754,9 @@ TEST_F(ClusterResourceSchedulerTest, GetCPUInstancesDoubleTest) { TEST_F(ClusterResourceSchedulerTest, AvailableResourceInstancesOpsTest) { NodeResources node_resources = CreateNodeResources({{ResourceID::CPU(), 3}}); + instrumented_io_context io_context; ClusterResourceScheduler cluster( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); std::vector total = {6., 6., 6.}; std::vector 
available = {3., 2., 5.}; @@ -767,8 +786,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) { { NodeResources node_resources = CreateNodeResources( {{ResourceID::CPU(), 3}, {ResourceID::Memory(), 4}, {ResourceID::GPU(), 5}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest( {{ResourceID::CPU(), 3}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 1.5}}); @@ -796,8 +816,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) { { NodeResources node_resources = CreateNodeResources( {{ResourceID::CPU(), 3}, {ResourceID::Memory(), 4}, {ResourceID::GPU(), 5}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest( {{ResourceID::CPU(), 4}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 1.5}}); @@ -823,8 +844,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) { {ResourceID::GPU(), 5}, {ResourceID("custom1"), 4}, {ResourceID("custom2"), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest({{ResourceID::CPU(), 3}, @@ -857,8 +879,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest) { {ResourceID::GPU(), 5}, {ResourceID("custom1"), 4}, {ResourceID("custom2"), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, 
scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest({{ResourceID::CPU(), 3}, @@ -889,8 +912,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesAllocationFailureTest) {ResourceID("custom1"), 4}, {ResourceID("custom2"), 4}, {ResourceID("custom3"), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest({{ResourceID("custom1"), 3}, {ResourceID("custom3"), 3}, @@ -918,8 +942,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest2) { {ResourceID::GPU(), 5}, {ResourceID("custom1"), 4}, {ResourceID("custom2"), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest({{ResourceID::CPU(), 2}, @@ -950,7 +975,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesTest2) { } TEST_F(ClusterResourceSchedulerTest, DeadNodeTest) { - ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"), + instrumented_io_context io_context; + ClusterResourceScheduler resource_scheduler(io_context, + scheduling::NodeID("local"), absl::flat_hash_map{}, is_node_available_fn_); absl::flat_hash_map resource; @@ -988,12 +1015,13 @@ TEST_F(ClusterResourceSchedulerTest, DeadNodeTest) { TEST_F(ClusterResourceSchedulerTest, TaskGPUResourceInstancesTest) { { + instrumented_io_context io_context; NodeResources node_resources = CreateNodeResources({{ResourceID::CPU(), 1}, {ResourceID::Memory(), 1}, {ResourceID::GPU(), 4}, {ResourceID("custom1"), 8}}); ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, 
is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); std::vector allocate_gpu_instances{0.5, 0.5, 0.5, 0.5}; resource_scheduler.GetLocalResourceManager().SubtractResourceInstances( @@ -1059,8 +1087,9 @@ TEST_F(ClusterResourceSchedulerTest, {ResourceID::Memory(), 1}, {ResourceID::GPU(), 4}, {ResourceID("custom1"), 8}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); { std::vector allocate_gpu_instances{0.5, 0.5, 2, 0.5}; @@ -1111,8 +1140,9 @@ TEST_F(ClusterResourceSchedulerTest, TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithHardRequestTest) { NodeResources node_resources = CreateNodeResources( {{ResourceID::CPU(), 4}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest( {{ResourceID::CPU(), 2}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 1.5}}); @@ -1134,8 +1164,9 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithHardRequestTest) { TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithoutCpuUnitTest) { NodeResources node_resources = CreateNodeResources( {{ResourceID::CPU(), 4}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 4}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(0), node_resources, is_node_available_fn_); + io_context, scheduling::NodeID(0), node_resources, is_node_available_fn_); ResourceRequest resource_request = CreateResourceRequest( {{ResourceID::CPU(), 2}, {ResourceID::Memory(), 2}, {ResourceID::GPU(), 1.5}}); @@ -1156,7 +1187,9 @@ 
TEST_F(ClusterResourceSchedulerTest, TaskResourceInstanceWithoutCpuUnitTest) { TEST_F(ClusterResourceSchedulerTest, TestAlwaysSpillInfeasibleTask) { absl::flat_hash_map resource_spec({{"CPU", 1}}); - ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"), + instrumented_io_context io_context; + ClusterResourceScheduler resource_scheduler(io_context, + scheduling::NodeID("local"), absl::flat_hash_map{}, is_node_available_fn_); for (int i = 0; i < 100; i++) { @@ -1218,8 +1251,9 @@ TEST_F(ClusterResourceSchedulerTest, ResourceUsageReportTest) { absl::flat_hash_map initial_resources( {{"CPU", 1}, {"GPU", 2}, {"memory", 3}, {"1", 1}, {"2", 2}, {"3", 3}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("0"), initial_resources, is_node_available_fn_); + io_context, scheduling::NodeID("0"), initial_resources, is_node_available_fn_); NodeResources other_node_resources = CreateNodeResources({{ResourceID::CPU(), 1}, {ResourceID::Memory(), 1}, {ResourceID::GPU(), 1}, @@ -1301,8 +1335,9 @@ TEST_F(ClusterResourceSchedulerTest, ObjectStoreMemoryUsageTest) { {"object_store_memory", 1000 * 1024 * 1024}}); int64_t used_object_store_memory = 250 * 1024 * 1024; int64_t *ptr = &used_object_store_memory; + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("0"), initial_resources, is_node_available_fn_, [&] { + io_context, scheduling::NodeID("0"), initial_resources, is_node_available_fn_, [&] { return *ptr; }); NodeResources other_node_resources = CreateNodeResources({{ResourceID::CPU(), 1}, @@ -1391,8 +1426,9 @@ TEST_F(ClusterResourceSchedulerTest, ObjectStoreMemoryUsageTest) { TEST_F(ClusterResourceSchedulerTest, DirtyLocalViewTest) { absl::flat_hash_map initial_resources({{"CPU", 1}}); + instrumented_io_context io_service; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("local"), initial_resources, is_node_available_fn_); + io_service, 
scheduling::NodeID("local"), initial_resources, is_node_available_fn_); auto remote = scheduling::NodeID(NodeID::FromRandom().Binary()); resource_scheduler.GetClusterResourceManager().AddOrUpdateNode( remote, {{"CPU", 2.}}, {{"CPU", 2.}}); @@ -1456,8 +1492,9 @@ TEST_F(ClusterResourceSchedulerTest, DirtyLocalViewTest) { } TEST_F(ClusterResourceSchedulerTest, DynamicResourceTest) { + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("local"), {{"CPU", 2}}, is_node_available_fn_); + io_context, scheduling::NodeID("local"), {{"CPU", 2}}, is_node_available_fn_); absl::flat_hash_map resource_request = {{"CPU", 1}, {"custom123", 2}}; @@ -1526,8 +1563,9 @@ TEST_F(ClusterResourceSchedulerTest, DynamicResourceTest) { } TEST_F(ClusterResourceSchedulerTest, AvailableResourceEmptyTest) { + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("local"), {{"custom123", 5}}, is_node_available_fn_); + io_context, scheduling::NodeID("local"), {{"custom123", 5}}, is_node_available_fn_); std::shared_ptr resource_instances = std::make_shared(); absl::flat_hash_map resource_request = {{"custom123", 5}}; @@ -1541,8 +1579,9 @@ TEST_F(ClusterResourceSchedulerTest, AvailableResourceEmptyTest) { TEST_F(ClusterResourceSchedulerTest, TestForceSpillback) { absl::flat_hash_map resource_spec({{"CPU", 1}}); + instrumented_io_context io_context; ClusterResourceScheduler resource_scheduler( - scheduling::NodeID("local"), resource_spec, is_node_available_fn_); + io_context, scheduling::NodeID("local"), resource_spec, is_node_available_fn_); std::vector node_ids; for (int i = 0; i < 100; i++) { node_ids.emplace_back(NodeID::FromRandom().Binary()); @@ -1603,8 +1642,11 @@ TEST_F(ClusterResourceSchedulerTest, TestForceSpillback) { TEST_F(ClusterResourceSchedulerTest, CustomResourceInstanceTest) { SetUnitInstanceResourceIds({ResourceID("FPGA")}); - ClusterResourceScheduler resource_scheduler( - 
scheduling::NodeID("local"), {{"CPU", 4}, {"FPGA", 2}}, is_node_available_fn_); + instrumented_io_context io_context; + ClusterResourceScheduler resource_scheduler(io_context, + scheduling::NodeID("local"), + {{"CPU", 4}, {"FPGA", 2}}, + is_node_available_fn_); auto fpga_resource_id = ResourceID("FPGA"); @@ -1631,7 +1673,9 @@ TEST_F(ClusterResourceSchedulerTest, CustomResourceInstanceTest) { TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesSerializedStringTest) { SetUnitInstanceResourceIds({ResourceID("GPU")}); - ClusterResourceScheduler resource_scheduler(scheduling::NodeID("local"), + instrumented_io_context io_context; + ClusterResourceScheduler resource_scheduler(io_context, + scheduling::NodeID("local"), {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, is_node_available_fn_); std::shared_ptr cluster_resources = @@ -1651,6 +1695,7 @@ TEST_F(ClusterResourceSchedulerTest, TaskResourceInstancesSerializedStringTest) cluster_instance_resources->Set(ResourceID::Memory(), {4.}); cluster_instance_resources->Set(ResourceID::GPU(), {1., 1.}); ClusterResourceScheduler resource_scheduler_cpu_instance( + io_context, scheduling::NodeID("local"), {{"CPU", 4}, {"memory", 4}, {"GPU", 2}}, is_node_available_fn_); @@ -1671,8 +1716,11 @@ TEST_F(ClusterResourceSchedulerTest, AffinityWithBundleScheduleTest) { CreateResourceRequest(AddPlacementGroupConstraint( {{"CPU", 1}, {"memory", 100}}, bundle_1.first, bundle_1.second)); NodeResources node_resources = NodeResources(bundle_resource_request); - ClusterResourceScheduler resource_scheduler( - scheduling::NodeID(node_1.Binary()), node_resources, is_node_available_fn_); + instrumented_io_context io_service; + ClusterResourceScheduler resource_scheduler(io_service, + scheduling::NodeID(node_1.Binary()), + node_resources, + is_node_available_fn_); ResourceRequest bundle_resource_request_2 = CreateResourceRequest(AddPlacementGroupConstraint( {{"CPU", 1}, {"memory", 100}}, bundle_2.first, bundle_2.second)); diff --git 
a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index f95c51c2aac3..de2bd227996c 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -131,8 +131,9 @@ std::shared_ptr CreateSingleNodeScheduler( local_node_resources[ray::kCPU_ResourceLabel] = num_cpus; local_node_resources[ray::kGPU_ResourceLabel] = num_gpus; local_node_resources[ray::kMemory_ResourceLabel] = 128; - + static instrumented_io_context io_context; auto scheduler = std::make_shared( + io_context, scheduling::NodeID(id), local_node_resources, /*is_node_available_fn*/ [&gcs_client](scheduling::NodeID node_id) { diff --git a/src/ray/raylet/scheduling/fixed_point.h b/src/ray/raylet/scheduling/fixed_point.h index 8e9cbfae206d..ecd59150f1d2 100644 --- a/src/ray/raylet/scheduling/fixed_point.h +++ b/src/ray/raylet/scheduling/fixed_point.h @@ -19,7 +19,7 @@ #include #include -#define RESOURCE_UNIT_SCALING 10000 +#include "ray/common/constants.h" /// Fixed point data type. 
class FixedPoint { @@ -28,9 +28,9 @@ class FixedPoint { public: FixedPoint() : FixedPoint(0.0) {} - FixedPoint(double d) { i_ = (int64_t)(d * RESOURCE_UNIT_SCALING); } // NOLINT + FixedPoint(double d) { i_ = (int64_t)(d * kResourceUnitScaling); } // NOLINT - FixedPoint(int i) { i_ = (i * RESOURCE_UNIT_SCALING); } // NOLINT + FixedPoint(int i) { i_ = (i * kResourceUnitScaling); } // NOLINT FixedPoint(int64_t i) : FixedPoint((double)i) {} // NOLINT @@ -72,23 +72,23 @@ class FixedPoint { FixedPoint operator+(double const d) const { FixedPoint res; - res.i_ = i_ + static_cast(d * RESOURCE_UNIT_SCALING); + res.i_ = i_ + static_cast(d * kResourceUnitScaling); return res; } FixedPoint operator-(double const d) const { FixedPoint res; - res.i_ = i_ - static_cast(d * RESOURCE_UNIT_SCALING); + res.i_ = i_ - static_cast(d * kResourceUnitScaling); return res; } FixedPoint operator=(double const d) { - i_ = static_cast(d * RESOURCE_UNIT_SCALING); + i_ = static_cast(d * kResourceUnitScaling); return *this; } FixedPoint operator+=(double const d) { - i_ += static_cast(d * RESOURCE_UNIT_SCALING); + i_ += static_cast(d * kResourceUnitScaling); return *this; } @@ -104,7 +104,7 @@ class FixedPoint { bool operator==(FixedPoint const &ru1) const { return (i_ == ru1.i_); }; bool operator!=(FixedPoint const &ru1) const { return (i_ != ru1.i_); }; - [[nodiscard]] double Double() const { return round(i_) / RESOURCE_UNIT_SCALING; }; + [[nodiscard]] double Double() const { return round(i_) / kResourceUnitScaling; }; friend std::ostream &operator<<(std::ostream &out, FixedPoint const &ru1); }; diff --git a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc b/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc index 7f8772b503ae..786fc52aac61 100644 --- a/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc +++ b/src/ray/raylet/scheduling/policy/hybrid_scheduling_policy_test.cc @@ -68,13 +68,6 @@ class HybridSchedulingPolicyTest : public 
::testing::Test { schedule_top_k_absolute, scheduler_top_k_fraction); } - - ClusterResourceManager MockClusterResourceManager( - const absl::flat_hash_map &nodes) { - ClusterResourceManager cluster_resource_manager; - cluster_resource_manager.nodes_ = nodes; - return cluster_resource_manager; - } }; TEST_F(HybridSchedulingPolicyTest, GetBestNode) { diff --git a/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc b/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc index 86d32b1d547d..56a56a3317ae 100644 --- a/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc +++ b/src/ray/raylet/scheduling/policy/scheduling_policy_test.cc @@ -66,10 +66,11 @@ class SchedulingPolicyTest : public ::testing::Test { scheduler_top_k_fraction); } - ClusterResourceManager MockClusterResourceManager( + std::unique_ptr MockClusterResourceManager( const absl::flat_hash_map &nodes) { - ClusterResourceManager cluster_resource_manager; - cluster_resource_manager.nodes_ = nodes; + static instrumented_io_context io_context; + auto cluster_resource_manager = std::make_unique(io_context); + cluster_resource_manager->nodes_ = nodes; return cluster_resource_manager; } }; @@ -86,7 +87,7 @@ TEST_F(SchedulingPolicyTest, NodeAffinityPolicyTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); raylet_scheduling_policy::CompositeSchedulingPolicy scheduling_policy( - scheduling::NodeID("local"), cluster_resource_manager, [](auto) { return true; }); + scheduling::NodeID("local"), *cluster_resource_manager, [](auto) { return true; }); auto to_schedule = scheduling_policy.Schedule( req, SchedulingOptions::NodeAffinity(false, false, "local", false)); @@ -140,7 +141,7 @@ TEST_F(SchedulingPolicyTest, SpreadPolicyTest) { nodes.emplace(remote_node_3, CreateNodeResources(20, 20, 0, 0, 0, 0)); auto cluster_resource_manager = MockClusterResourceManager(nodes); raylet_scheduling_policy::CompositeSchedulingPolicy scheduling_policy( - local_node, cluster_resource_manager, 
[](auto) { return true; }); + local_node, *cluster_resource_manager, [](auto) { return true; }); auto to_schedule = scheduling_policy.Schedule(req, SchedulingOptions::Spread(false, false)); @@ -178,7 +179,7 @@ TEST_F(SchedulingPolicyTest, RandomPolicyTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); raylet_scheduling_policy::CompositeSchedulingPolicy scheduling_policy( - local_node, cluster_resource_manager, [](auto) { return true; }); + local_node, *cluster_resource_manager, [](auto) { return true; }); std::map decisions; size_t num_node_0_picks = 0; @@ -268,7 +269,7 @@ TEST_F(SchedulingPolicyTest, AvailableTruncationTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, false)); ASSERT_EQ(to_schedule, local_node); } @@ -283,7 +284,7 @@ TEST_F(SchedulingPolicyTest, AvailableTieBreakTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.50, false, false)); ASSERT_EQ(to_schedule, remote_node); } @@ -298,7 +299,7 @@ TEST_F(SchedulingPolicyTest, AvailableOverFeasibleTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.50, false, false)); ASSERT_EQ(to_schedule, remote_node); } @@ -311,7 +312,7 @@ TEST_F(SchedulingPolicyTest, InfeasibleTest) { auto cluster_resource_manager 
= MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.50, false, false)); ASSERT_TRUE(to_schedule.IsNil()); } @@ -325,7 +326,7 @@ TEST_F(SchedulingPolicyTest, BarelyFeasibleTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.50, false, false)); ASSERT_EQ(to_schedule, local_node); } @@ -339,7 +340,7 @@ TEST_F(SchedulingPolicyTest, TruncationAcrossFeasibleNodesTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, false)); ASSERT_EQ(to_schedule, local_node); } @@ -353,7 +354,7 @@ TEST_F(SchedulingPolicyTest, ForceSpillbackIfAvailableTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, true, true)); ASSERT_EQ(to_schedule, remote_node); } @@ -370,7 +371,7 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) { const ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false); const auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, 
*cluster_resource_manager, [](auto) { return true; }) .Schedule(ResourceMapToResourceRequest({{"CPU", 1}}, false), HybridOptions(0.51, false, true, true)); ASSERT_EQ(to_schedule, remote_node); @@ -380,7 +381,7 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) { const ResourceRequest req = ResourceMapToResourceRequest({{"GPU", 1}}, false); const auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, true, true)); ASSERT_EQ(to_schedule, local_node); } @@ -389,7 +390,7 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) { const ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false); const auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, true, true)); ASSERT_EQ(to_schedule, remote_node); } @@ -399,7 +400,7 @@ TEST_F(SchedulingPolicyTest, AvoidSchedulingCPURequestsOnGPUNodes) { ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false); const auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, true, true)); ASSERT_EQ(to_schedule, local_node); } @@ -415,7 +416,7 @@ TEST_F(SchedulingPolicyTest, SchedulenCPURequestsOnGPUNodeAsALastResort) { auto cluster_resource_manager = MockClusterResourceManager(nodes); const auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, 
HybridOptions(0.51, false, true, true)); ASSERT_EQ(to_schedule, remote_node); } @@ -429,7 +430,7 @@ TEST_F(SchedulingPolicyTest, ForceSpillbackTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, true, false)); ASSERT_EQ(to_schedule, remote_node); } @@ -444,7 +445,7 @@ TEST_F(SchedulingPolicyTest, ForceSpillbackOnlyFeasibleLocallyTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, true, false)); ASSERT_TRUE(to_schedule.IsNil()); } @@ -462,7 +463,7 @@ TEST_F(SchedulingPolicyTest, NonGpuNodePreferredSchedulingTest) { ResourceRequest req = ResourceMapToResourceRequest({{"CPU", 1}}, false); auto to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, @@ -472,7 +473,7 @@ TEST_F(SchedulingPolicyTest, NonGpuNodePreferredSchedulingTest) { req = ResourceMapToResourceRequest({{"CPU", 3}}, false); to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, @@ -482,7 +483,7 @@ TEST_F(SchedulingPolicyTest, NonGpuNodePreferredSchedulingTest) { req = ResourceMapToResourceRequest({{"CPU", 1}, {"GPU", 1}}, false); to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, 
cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, @@ -492,7 +493,7 @@ TEST_F(SchedulingPolicyTest, NonGpuNodePreferredSchedulingTest) { req = ResourceMapToResourceRequest({{"CPU", 2}}, false); to_schedule = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.51, false, @@ -522,22 +523,22 @@ TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionTest) { auto cluster_resource_manager = MockClusterResourceManager(nodes); // req is unscheduleable because the max cpu fraction reaches 0.5. auto unscheduable = raylet_scheduling_policy::BundlePackSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, pack_op); ASSERT_TRUE(unscheduable.status.IsFailed()); unscheduable = raylet_scheduling_policy::BundleSpreadSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, spread_op); ASSERT_TRUE(unscheduable.status.IsFailed()); unscheduable = raylet_scheduling_policy::BundleStrictPackSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_pack_op); ASSERT_TRUE(unscheduable.status.IsInfeasible()); unscheduable = raylet_scheduling_policy::BundleStrictSpreadSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, strict_spread_op); ASSERT_TRUE(unscheduable.status.IsInfeasible()); } @@ -559,7 +560,7 @@ TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionOneCpuReservationGuarant auto cluster_resource_manager = 
MockClusterResourceManager(nodes); // req is unscheduleable because the max cpu fraction reaches 0.5. auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, pack_op); ASSERT_TRUE(to_schedule.status.IsSuccess()); } @@ -582,7 +583,7 @@ TEST_F(SchedulingPolicyTest, auto cluster_resource_manager = MockClusterResourceManager(nodes); // req is unscheduleable because the max cpu fraction reaches 0.5. auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, pack_op); ASSERT_TRUE(to_schedule.status.IsSuccess()); @@ -590,7 +591,7 @@ TEST_F(SchedulingPolicyTest, auto to_schedule_task = raylet_scheduling_policy::CompositeSchedulingPolicy( - local_node, cluster_resource_manager, [](auto) { return true; }) + local_node, *cluster_resource_manager, [](auto) { return true; }) .Schedule(req, HybridOptions(0.50, false, false)); ASSERT_TRUE(!to_schedule_task.IsNil()); } @@ -615,7 +616,7 @@ TEST_F(SchedulingPolicyTest, BundleSchedulingMaxFractionWorkingWhenNormalResourc auto cluster_resource_manager = MockClusterResourceManager(nodes); // req is unscheduleable because the max cpu fraction reaches 0.5. 
auto to_schedule = raylet_scheduling_policy::BundlePackSchedulingPolicy( - cluster_resource_manager, [](auto) { return true; }) + *cluster_resource_manager, [](auto) { return true; }) .Schedule(req_list, pack_op); ASSERT_TRUE(to_schedule.status.IsSuccess()); } diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 2c1ef0ec049d..b9dc82862edc 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -216,7 +216,6 @@ void WorkerPool::AddWorkerProcess( const std::vector &dynamic_options) { state.worker_processes.emplace(worker_startup_token_counter_, WorkerProcessInfo{/*is_pending_registration=*/true, - {}, worker_type, proc, start, @@ -773,7 +772,6 @@ void WorkerPool::OnWorkerStarted(const std::shared_ptr &worker) auto it = state.worker_processes.find(worker_startup_token); if (it != state.worker_processes.end()) { it->second.is_pending_registration = false; - it->second.alive_started_workers.insert(worker); // We may have slots to start more workers now. TryStartIOWorkers(worker->GetLanguage()); } @@ -1063,126 +1061,76 @@ void WorkerPool::TryKillingIdleWorkers() { RAY_LOG(DEBUG) << "idle worker is already dead. Not going to kill worker " << idle_worker->WorkerId(); // This worker has already been killed. - // This is possible because a Java worker process may hold multiple workers. + // It will be removed from idle_of_all_languages_ later. + // This happens when ExitReply is received but the worker is not removed from + // idle_of_all_languages_ yet. continue; } - auto worker_startup_token = idle_worker->GetStartupToken(); - auto &worker_state = GetStateForLanguage(idle_worker->GetLanguage()); - auto it = worker_state.worker_processes.find(worker_startup_token); - if (it != worker_state.worker_processes.end() && it->second.is_pending_registration) { - // A Java worker process may hold multiple workers. - // Some workers of this process are pending registration. Skip killing this worker. 
+ // Skip killing the worker process if there's any inflight `Exit` RPC requests to + // this worker process. + if (pending_exit_idle_workers_.count(idle_worker->WorkerId())) { continue; } - // TODO(clarng): get rid of multiple workers per process code here, as that is - // not longer supported. - auto process = idle_worker->GetProcess(); - // Make sure all workers in this worker process are idle. - // This block of code is needed by Java workers. - auto workers_in_the_same_process = GetWorkersByProcess(process); - bool can_be_killed = true; - for (const auto &worker : workers_in_the_same_process) { - if (worker_state.idle.count(worker) == 0 || - now - idle_of_all_languages_map_[worker] < - RayConfig::instance().idle_worker_killing_time_threshold_ms()) { - // Another worker in this process isn't idle, or hasn't been idle for a while, so - // this process can't be killed. - can_be_killed = false; - break; - } - - // Skip killing the worker process if there's any inflight `Exit` RPC requests to - // this worker process. - if (pending_exit_idle_workers_.count(worker->WorkerId())) { - can_be_killed = false; - break; - } - } - if (!can_be_killed) { - continue; + RAY_LOG(DEBUG) << "The worker pool has " << running_size + << " registered workers which exceeds the soft limit of " + << num_workers_soft_limit_ << ", and worker " + << idle_worker->WorkerId() << " with pid " + << idle_worker->GetProcess().GetId() + << " has been idle for a a while. Kill it."; + // To avoid object lost issue caused by forcibly killing, send an RPC request to the + // worker to allow it to do cleanup before exiting. We kill it anyway if the driver + // is already exited. + RAY_LOG(DEBUG) << "Sending exit message to worker " << idle_worker->WorkerId(); + // Register the worker to pending exit so that we can correctly calculate the + // running_size. + // This also means that there's an inflight `Exit` RPC request to the worker. 
+ pending_exit_idle_workers_.emplace(idle_worker->WorkerId(), idle_worker); + auto rpc_client = idle_worker->rpc_client(); + RAY_CHECK(rpc_client); + RAY_CHECK(running_size > 0); + running_size--; + rpc::ExitRequest request; + if (finished_jobs_.contains(job_id) && + RayConfig::instance().kill_idle_workers_of_terminated_job()) { + RAY_LOG(INFO) << "Force exiting worker whose job has exited " + << idle_worker->WorkerId(); + request.set_force_exit(true); } + rpc_client->Exit( + request, [this, idle_worker](const ray::Status &status, const rpc::ExitReply &r) { + RAY_CHECK(pending_exit_idle_workers_.erase(idle_worker->WorkerId())); + if (!status.ok()) { + RAY_LOG(ERROR) << "Failed to send exit request: " << status.ToString(); + } - RAY_CHECK(running_size >= workers_in_the_same_process.size()); - if (running_size - workers_in_the_same_process.size() < - static_cast(num_workers_soft_limit_)) { - // A Java worker process may contain multiple workers. Killing more workers than we - // expect may slow the job. - if (!finished_jobs_.count(job_id)) { - // Ignore the soft limit for jobs that have already finished, as we - // should always clean up these workers. - return; - } - } - - for (const auto &worker : workers_in_the_same_process) { - RAY_LOG(DEBUG) << "The worker pool has " << running_size - << " registered workers which exceeds the soft limit of " - << num_workers_soft_limit_ << ", and worker " << worker->WorkerId() - << " with pid " << process.GetId() - << " has been idle for a a while. Kill it."; - // To avoid object lost issue caused by forcibly killing, send an RPC request to the - // worker to allow it to do cleanup before exiting. We kill it anyway if the driver - // is already exited. - if (!worker->IsDead()) { - RAY_LOG(DEBUG) << "Sending exit message to worker " << worker->WorkerId(); - // Register the worker to pending exit so that we can correctly calculate the - // running_size. 
- // This also means that there's an inflight `Exit` RPC request to the worker. - pending_exit_idle_workers_.emplace(worker->WorkerId(), worker); - auto rpc_client = worker->rpc_client(); - RAY_CHECK(rpc_client); - RAY_CHECK(running_size > 0); - running_size--; - rpc::ExitRequest request; - if (finished_jobs_.contains(job_id) && - RayConfig::instance().kill_idle_workers_of_terminated_job()) { - RAY_LOG(INFO) << "Force exiting worker whose job has exited " - << worker->WorkerId(); - request.set_force_exit(true); - } - rpc_client->Exit( - request, [this, worker](const ray::Status &status, const rpc::ExitReply &r) { - RAY_CHECK(pending_exit_idle_workers_.erase(worker->WorkerId())); - if (!status.ok()) { - RAY_LOG(ERROR) << "Failed to send exit request: " << status.ToString(); - } - - // In case of failed to send request, we remove it from pool as well - // TODO (iycheng): We should handle the grpc failure in better way. - if (!status.ok() || r.success()) { - RAY_LOG(DEBUG) << "Removed worker " << worker->WorkerId(); - auto &worker_state = GetStateForLanguage(worker->GetLanguage()); - // If we could kill the worker properly, we remove them from the idle - // pool. - RemoveWorker(worker_state.idle, worker); - // We always mark the worker as dead. - // If the worker is not idle at this moment, we'd want to mark it as dead - // so it won't be reused later. - if (!worker->IsDead()) { - worker->MarkDead(); - } - } else { - RAY_LOG(DEBUG) << "Failed to remove worker " << worker->WorkerId(); - // We re-insert the idle worker to the back of the queue if it fails to - // kill the worker (e.g., when the worker owns the object). Without this, - // if the first N workers own objects, it can't kill idle workers that are - // >= N+1. 
- const auto &idle_pair = idle_of_all_languages_.front(); - idle_of_all_languages_.push_back(idle_pair); - idle_of_all_languages_.pop_front(); - RAY_CHECK(idle_of_all_languages_.size() == - idle_of_all_languages_map_.size()); - } - }); - } else { - RAY_LOG(DEBUG) << "Removing dead worker " << worker->WorkerId(); - - // Even it's a dead worker, we still need to remove them from the pool. - RemoveWorker(worker_state.idle, worker); - } - } + // In case of failed to send request, we remove it from pool as well + // TODO (iycheng): We should handle the grpc failure in better way. + if (!status.ok() || r.success()) { + RAY_LOG(DEBUG) << "Removed worker " << idle_worker->WorkerId(); + auto &worker_state = GetStateForLanguage(idle_worker->GetLanguage()); + // If we could kill the worker properly, we remove them from the idle + // pool. + RemoveWorker(worker_state.idle, idle_worker); + // We always mark the worker as dead. + // If the worker is not idle at this moment, we'd want to mark it as dead + // so it won't be reused later. + if (!idle_worker->IsDead()) { + idle_worker->MarkDead(); + } + } else { + RAY_LOG(DEBUG) << "Failed to remove worker " << idle_worker->WorkerId(); + // We re-insert the idle worker to the back of the queue if it fails to + // kill the worker (e.g., when the worker owns the object). Without this, + // if the first N workers own objects, it can't kill idle workers that are + // >= N+1. 
+ const auto &idle_pair = idle_of_all_languages_.front(); + idle_of_all_languages_.push_back(idle_pair); + idle_of_all_languages_.pop_front(); + RAY_CHECK(idle_of_all_languages_.size() == idle_of_all_languages_map_.size()); + } + }); } std::list, int64_t>> @@ -1417,7 +1365,7 @@ void WorkerPool::DisconnectWorker(const std::shared_ptr &worker auto &state = GetStateForLanguage(worker->GetLanguage()); auto it = state.worker_processes.find(worker->GetStartupToken()); if (it != state.worker_processes.end()) { - if (!RemoveWorker(it->second.alive_started_workers, worker)) { + if (it->second.is_pending_registration) { // Worker is either starting or started, // if it's not started, we should remove it from starting. it->second.is_pending_registration = false; @@ -1425,11 +1373,9 @@ void WorkerPool::DisconnectWorker(const std::shared_ptr &worker TryPendingPopWorkerRequests(worker->GetLanguage()); } } - if (it->second.alive_started_workers.size() == 0 && - !it->second.is_pending_registration) { - DeleteRuntimeEnvIfPossible(it->second.runtime_env_info.serialized_runtime_env()); - RemoveWorkerProcess(state, worker->GetStartupToken()); - } + + DeleteRuntimeEnvIfPossible(it->second.runtime_env_info.serialized_runtime_env()); + RemoveWorkerProcess(state, worker->GetStartupToken()); } RAY_CHECK(RemoveWorker(state.registered_workers, worker)); @@ -1605,20 +1551,6 @@ void WorkerPool::TryStartIOWorkers(const Language &language, } } -std::unordered_set> WorkerPool::GetWorkersByProcess( - const Process &process) { - std::unordered_set> workers_of_process; - for (auto &entry : states_by_lang_) { - auto &worker_state = entry.second; - for (const auto &worker : worker_state.registered_workers) { - if (worker->GetProcess().GetId() == process.GetId()) { - workers_of_process.insert(worker); - } - } - } - return workers_of_process; -} - std::string WorkerPool::DebugString() const { std::stringstream result; result << "WorkerPool:"; diff --git a/src/ray/raylet/worker_pool.h 
b/src/ray/raylet/worker_pool.h index b0c03c3d4997..b3efd2839de9 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -471,8 +471,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { struct WorkerProcessInfo { /// Whether this worker is pending registration or is started. bool is_pending_registration = true; - /// The started workers which is alive. - std::unordered_set> alive_started_workers; /// The type of the worker. rpc::WorkerType worker_type; /// The worker process instance. @@ -585,13 +583,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { /// \param language The language of the PopWorker requests. void TryPendingPopWorkerRequests(const Language &language); - /// Get all workers of the given process. - /// - /// \param process The process of workers. - /// \return The workers of the given process. - std::unordered_set> GetWorkersByProcess( - const Process &process); - /// Get either restore or spill worker state from state based on worker_type. /// /// \param worker_type IO Worker Type. 
diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index c8bc23a3736b..f626247d20e4 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -488,7 +488,7 @@ class WorkerPoolTest : public ::testing::Test { std::move(options), /*delay_executor=*/ [this](std::function task, uint32_t delay_ms) { - return execute_after(io_service_, task, delay_ms); + return execute_after(io_service_, task, std::chrono::milliseconds(delay_ms)); }, /*runtime_env_agent_factory=*/ [](const std::string &ip_address, int port) { diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc index a2cc2dd45924..cf7f89cc9ab5 100644 --- a/src/ray/raylet_client/raylet_client.cc +++ b/src/ray/raylet_client/raylet_client.cc @@ -519,7 +519,6 @@ void raylet::RayletClient::GlobalGC( } void raylet::RayletClient::UpdateResourceUsage( - std::string &serialized_resource_usage_batch, const rpc::ClientCallback &callback) { rpc::UpdateResourceUsageRequest request; diff --git a/src/ray/rpc/node_manager/node_manager_client_pool.cc b/src/ray/rpc/node_manager/node_manager_client_pool.cc index 17d8afddb9b9..097ad168d631 100644 --- a/src/ray/rpc/node_manager/node_manager_client_pool.cc +++ b/src/ray/rpc/node_manager/node_manager_client_pool.cc @@ -30,7 +30,8 @@ shared_ptr NodeManagerClientPool::GetOrConnectByAddr auto connection = client_factory_(address); client_map_[raylet_id] = connection; - RAY_LOG(DEBUG) << "Connected to " << address.ip_address() << ":" << address.port(); + RAY_LOG(DEBUG) << "Connected to raylet " << raylet_id << " at " << address.ip_address() + << ":" << address.port(); RAY_CHECK(connection != nullptr); return connection; } diff --git a/src/ray/rpc/server_call.cc b/src/ray/rpc/server_call.cc index b28317598e05..2f432999b29e 100644 --- a/src/ray/rpc/server_call.cc +++ b/src/ray/rpc/server_call.cc @@ -30,8 +30,9 @@ std::unique_ptr &_GetServerCallExecutor() { boost::asio::thread_pool 
&GetServerCallExecutor() { return *_GetServerCallExecutor(); } -void DrainAndResetServerCallExecutor() { - GetServerCallExecutor().join(); +void DrainServerCallExecutor() { GetServerCallExecutor().join(); } + +void ResetServerCallExecutor() { _GetServerCallExecutor() = std::make_unique( ::RayConfig::instance().num_server_call_thread()); } diff --git a/src/ray/rpc/server_call.h b/src/ray/rpc/server_call.h index 31d078ff78f0..8242c6b69fe8 100644 --- a/src/ray/rpc/server_call.h +++ b/src/ray/rpc/server_call.h @@ -32,9 +32,14 @@ namespace rpc { /// This pool is shared across gRPC servers. boost::asio::thread_pool &GetServerCallExecutor(); -/// For testing -/// Drain the executor and reset it. -void DrainAndResetServerCallExecutor(); +/// Drain the executor. +void DrainServerCallExecutor(); + +/// Reset the server call executor. +/// Testing only. After you drain the executor +/// you need to regenerate the executor +/// because they are global. +void ResetServerCallExecutor(); /// Represents the callback function to be called when a `ServiceHandler` finishes /// handling a request.