From a461707a81ffdfac3be2365a6061692034ed2946 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 22 Dec 2025 00:44:32 +0100 Subject: [PATCH 01/19] Add GitHub Actions workflow for type checking (mypy, pyright, ty). Add script for including docstrings into stubfiles before building wheels. diff --git c/.github/workflows/python.yml i/.github/workflows/python.yml index e5d367958d..4ca0f9b6dc 100644 --- c/.github/workflows/python.yml +++ i/.github/workflows/python.yml @@ -239,6 +239,11 @@ jobs: - name: Test shell: bash run: ci/scripts/python_test.sh $(pwd) $(pwd)/build + - name: Test annotations + shell: bash + env: + PYARROW_TEST_ANNOTATIONS: "ON" + run: ci/scripts/python_test_type_annotations.sh $(pwd)/python windows: name: AMD64 Windows 2022 Python 3.13 @@ -296,3 +301,7 @@ jobs: shell: cmd run: | call "ci\scripts\python_test.bat" %cd% + - name: Test annotations + shell: cmd + run: | + call "ci\scripts\python_test_type_annotations.bat" %cd%\python diff --git c/ci/scripts/python_test_type_annotations.bat i/ci/scripts/python_test_type_annotations.bat new file mode 100644 index 0000000000..3446e329a8 --- /dev/null +++ i/ci/scripts/python_test_type_annotations.bat @@ -0,0 +1,38 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set PYARROW_DIR=%1 + +echo Annotation testing on Windows ... + +@REM Install library stubs +%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 + +@REM Install other dependencies for type checking +%PYTHON_CMD% -m pip install fsspec || exit /B 1 + +@REM Install type checkers +%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 + +@REM Run type checkers +pushd %PYARROW_DIR% + +mypy +pyright +ty check diff --git c/ci/scripts/python_test_type_annotations.sh i/ci/scripts/python_test_type_annotations.sh new file mode 100755 index 0000000000..82610ce663 --- /dev/null +++ i/ci/scripts/python_test_type_annotations.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex +pyarrow_dir=${1} + +if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then + # Install library stubs + pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil + + # Install type checkers + pip install mypy pyright ty + + # Install other dependencies for type checking + pip install fsspec + + # Run type checkers + pushd ${pyarrow_dir} + mypy + pyright + ty check; +else + echo "Skipping type annotation tests"; +fi diff --git c/ci/scripts/python_wheel_macos_build.sh i/ci/scripts/python_wheel_macos_build.sh index bd61154430..b64eee623f 100755 --- c/ci/scripts/python_wheel_macos_build.sh +++ i/ci/scripts/python_wheel_macos_build.sh @@ -177,6 +177,11 @@ export CMAKE_PREFIX_PATH=${build_dir}/install export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python +# We first populate stub docstrings and then build the wheel +python setup.py build_ext --inplace +python -m pip install griffe libcst +python ../dev/update_stub_docstrings.py pyarrow-stubs + python setup.py bdist_wheel popd diff --git c/ci/scripts/python_wheel_validate_contents.py i/ci/scripts/python_wheel_validate_contents.py index 84fcaba42e..ee4a31aedb 100644 --- c/ci/scripts/python_wheel_validate_contents.py +++ i/ci/scripts/python_wheel_validate_contents.py @@ -35,6 +35,11 @@ def validate_wheel(path): assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" print(f"The wheel: {wheels[0]} seems valid.") + candidates = [info for info in f.filelist if info.filename.endswith('compute.pyi')] + assert candidates, "compute.pyi not found in wheel" + content = f.read(candidates[0]).decode('utf-8', errors='replace') + assert '"""' in content, "compute.pyi missing docstrings (no triple quotes found)" + def main(): parser = argparse.ArgumentParser() diff --git c/ci/scripts/python_wheel_windows_build.bat i/ci/scripts/python_wheel_windows_build.bat index b4b7fed99f..3da7f60f18 100644 --- c/ci/scripts/python_wheel_windows_build.bat +++ i/ci/scripts/python_wheel_windows_build.bat @@ -135,6 +135,11 @@ pushd C:\arrow\python @REM Build wheel %PYTHON_CMD% setup.py bdist_wheel || exit /B 1 +@REM We first populate stub docstrings and then build the wheel +%PYTHON_CMD% setup.py build_ext --inplace +%PYTHON_CMD% -m pip install griffe libcst +%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs + @REM Repair the wheel with delvewheel @REM @REM Since we bundled the Arrow C++ libraries ourselves, we only need to diff --git c/ci/scripts/python_wheel_xlinux_build.sh i/ci/scripts/python_wheel_xlinux_build.sh index a3fbeb3c0b..977ef64e00 100755 --- c/ci/scripts/python_wheel_xlinux_build.sh +++ i/ci/scripts/python_wheel_xlinux_build.sh @@ -167,6 +167,11 @@ export ARROW_HOME=/tmp/arrow-dist export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python +# We first populate stub docstrings and then build the wheel +python setup.py build_ext --inplace +python -m pip install griffe libcst +python ../dev/update_stub_docstrings.py pyarrow-stubs + python setup.py bdist_wheel echo "=== Strip symbols from wheel ===" diff --git c/compose.yaml i/compose.yaml index 2bd38a381e..ae0a1d4243 100644 --- c/compose.yaml +++ i/compose.yaml @@ -919,12 +919,14 @@ services: environment: <<: [*common, *ccache, *sccache] PYTEST_ARGS: # inherit + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: &python-conda-command [" /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-emscripten: # Usage: @@ -1001,6 +1003,7 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" + PYARROW_TEST_ANNOTATIONS: "ON" SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -1008,7 +1011,8 @@ services: /bin/bash -c " /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow" + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python" debian-python: # Usage: @@ -1500,6 +1504,7 @@ services: python: ${PYTHON} shm_size: *shm-size environment: + PYARROW_TEST_ANNOTATIONS: "ON" <<: [*common, *ccache, *sccache] PARQUET_REQUIRE_ENCRYPTION: # inherit HYPOTHESIS_PROFILE: # inherit @@ -1510,7 +1515,8 @@ services: /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && mamba uninstall -y numpy && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-docs: # Usage: @@ -1530,13 +1536,15 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-dask: # Possible $DASK parameters: diff --git c/docs/source/developers/python/development.rst i/docs/source/developers/python/development.rst index d03b2439b1..c23891e94d 100644 --- c/docs/source/developers/python/development.rst +++ i/docs/source/developers/python/development.rst @@ -42,7 +42,7 @@ Unit Testing ============ We are using `pytest `_ to develop our unit -test suite. After `building the project `_ you can run its unit tests +test suite. After `building the project `_ you can run its unit tests like so: .. code-block:: @@ -101,6 +101,74 @@ The test groups currently include: * ``s3``: Tests for Amazon S3 * ``tensorflow``: Tests that involve TensorFlow +Type Checking +============= + +PyArrow provides type stubs (``*.pyi`` files) for static type checking. These +stubs are located in the ``pyarrow-stubs/`` directory and are automatically +included in the distributed wheel packages. + +Running Type Checkers +--------------------- + +We support multiple type checkers. Their configurations are in +``pyproject.toml``. + +**mypy** + +To run mypy on the PyArrow codebase: + +.. code-block:: + + $ cd arrow/python + $ mypy + +The mypy configuration is in the ``[tool.mypy]`` section of ``pyproject.toml``. + +**pyright** + +To run pyright: + +.. code-block:: + + $ cd arrow/python + $ pyright + +The pyright configuration is in the ``[tool.pyright]`` section of ``pyproject.toml``. + +**ty** + +To run ty (note: currently only partially configured): + +.. code-block:: + + $ cd arrow/python + $ ty check + +Maintaining Type Stubs +----------------------- + +Type stubs for PyArrow are maintained in the ``pyarrow-stubs/`` +directory. These stubs mirror the structure of the main ``pyarrow/`` package. + +When adding or modifying public APIs: + +1. **Update the corresponding ``.pyi`` stub file** in ``pyarrow-stubs/`` + to reflect the new or changed function/class signatures. + +2. **Include type annotations** where possible. For Cython modules or + dynamically generated APIs such as compute kernels add the corresponding + stub in ``pyarrow-stubs/``. + +3. **Run type checkers** to ensure the stubs are correct and complete. + +The stub files are automatically copied into the built wheel during the build +process and will be included when users install PyArrow, enabling type checking +in downstream projects and for users' IDEs. + +Note: ``py.typed`` marker file in the ``pyarrow/`` directory indicates to type +checkers that PyArrow supports type checking according to :pep:`561`. + Doctest ======= diff --git c/python/MANIFEST.in i/python/MANIFEST.in index ed7012e4b7..2840ba7412 100644 --- c/python/MANIFEST.in +++ i/python/MANIFEST.in @@ -4,6 +4,7 @@ include ../NOTICE.txt global-include CMakeLists.txt graft pyarrow +graft pyarrow-stubs graft cmake_modules global-exclude *.so diff --git c/python/pyarrow-stubs/pyarrow/__init__.pyi i/python/pyarrow-stubs/pyarrow/__init__.pyi new file mode 100644 index 0000000000..2a68a51309 --- /dev/null +++ i/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for PyArrow. + +This is a placeholder stub file. +Complete type annotations will be added in subsequent PRs. +""" + +from typing import Any + +def __getattr__(name: str) -> Any: ... diff --git c/python/pyarrow/py.typed i/python/pyarrow/py.typed new file mode 100644 index 0000000000..13a83393a9 --- /dev/null +++ i/python/pyarrow/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git c/python/pyproject.toml i/python/pyproject.toml index 899144d418..9f62f02896 100644 --- c/python/pyproject.toml +++ i/python/pyproject.toml @@ -84,11 +84,11 @@ zip-safe=false include-package-data=true [tool.setuptools.packages.find] -include = ["pyarrow"] +include = ["pyarrow", "pyarrow.*"] namespaces = false [tool.setuptools.package-data] -pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"] +pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd", "py.typed"] [tool.setuptools_scm] root = '..' @@ -96,3 +96,39 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' + +# TODO: Enable type checking once stubs are merged +[tool.mypy] +files = ["pyarrow-stubs"] +mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" +exclude = [ + "^pyarrow/", + "^benchmarks/", + "^examples/", + "^scripts/", +] + +# TODO: Enable type checking once stubs are merged +[tool.pyright] +pythonPlatform = "All" +pythonVersion = "3.10" +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", + "build", +] +stubPath = "pyarrow-stubs" +typeCheckingMode = "basic" + +# TODO: Enable type checking once stubs are merged +[tool.ty.src] +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", +] diff --git c/python/setup.py i/python/setup.py index a27bd3baef..a25d2d76b3 100755 --- c/python/setup.py +++ i/python/setup.py @@ -121,8 +121,35 @@ class build_ext(_build_ext): def run(self): self._run_cmake() + self._copy_stubs() _build_ext.run(self) + def _copy_stubs(self): + """Copy .pyi stub files from pyarrow-stubs to the build directory.""" + build_cmd = self.get_finalized_command('build') + build_lib = os.path.abspath(build_cmd.build_lib) + + stubs_src = pjoin(setup_dir, 'pyarrow-stubs', 'pyarrow') + stubs_dest = pjoin(build_lib, 'pyarrow') + + if os.path.exists(stubs_src): + print(f"-- Copying stub files from {stubs_src} to {stubs_dest}") + for root, dirs, files in os.walk(stubs_src): + # Calculate relative path from stubs_src + rel_dir = os.path.relpath(root, stubs_src) + dest_dir = pjoin(stubs_dest, rel_dir) if rel_dir != '.' else stubs_dest + + # Create destination directory if needed + if not os.path.exists(dest_dir): + os.makedirs(dest_dir) + + # Copy .pyi files + for file in files: + if file.endswith('.pyi'): + src_file = pjoin(root, file) + dest_file = pjoin(dest_dir, file) + shutil.copy2(src_file, dest_file) + # adapted from cmake_build_ext in dynd-python # github.com/libdynd/dynd-python --- .github/workflows/python.yml | 9 +++ ci/scripts/python_test_type_annotations.bat | 38 ++++++++++ ci/scripts/python_test_type_annotations.sh | 40 +++++++++++ ci/scripts/python_wheel_macos_build.sh | 5 ++ ci/scripts/python_wheel_validate_contents.py | 5 ++ ci/scripts/python_wheel_windows_build.bat | 5 ++ ci/scripts/python_wheel_xlinux_build.sh | 5 ++ compose.yaml | 16 +++-- docs/source/developers/python/development.rst | 70 ++++++++++++++++++- python/MANIFEST.in | 1 + python/pyarrow-stubs/pyarrow/__init__.pyi | 26 +++++++ python/pyarrow/py.typed | 16 +++++ python/pyproject.toml | 40 ++++++++++- python/setup.py | 27 +++++++ 14 files changed, 296 insertions(+), 7 deletions(-) create mode 100644 ci/scripts/python_test_type_annotations.bat create mode 100755 ci/scripts/python_test_type_annotations.sh create mode 100644 python/pyarrow-stubs/pyarrow/__init__.pyi create mode 100644 python/pyarrow/py.typed diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index e5d367958dd..4ca0f9b6dc6 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -239,6 +239,11 @@ jobs: - name: Test shell: bash run: ci/scripts/python_test.sh $(pwd) $(pwd)/build + - name: Test annotations + shell: bash + env: + PYARROW_TEST_ANNOTATIONS: "ON" + run: ci/scripts/python_test_type_annotations.sh $(pwd)/python windows: name: AMD64 Windows 2022 Python 3.13 @@ -296,3 +301,7 @@ jobs: shell: cmd run: | call "ci\scripts\python_test.bat" %cd% + - name: Test annotations + shell: cmd + run: | + call "ci\scripts\python_test_type_annotations.bat" %cd%\python diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat new file mode 100644 index 00000000000..3446e329a89 --- /dev/null +++ b/ci/scripts/python_test_type_annotations.bat @@ -0,0 +1,38 @@ +@rem Licensed to the Apache Software Foundation (ASF) under one +@rem or more contributor license agreements. See the NOTICE file +@rem distributed with this work for additional information +@rem regarding copyright ownership. The ASF licenses this file +@rem to you under the Apache License, Version 2.0 (the +@rem "License"); you may not use this file except in compliance +@rem with the License. You may obtain a copy of the License at +@rem +@rem http://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, +@rem software distributed under the License is distributed on an +@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@rem KIND, either express or implied. See the License for the +@rem specific language governing permissions and limitations +@rem under the License. + +@echo on + +set PYARROW_DIR=%1 + +echo Annotation testing on Windows ... + +@REM Install library stubs +%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 + +@REM Install other dependencies for type checking +%PYTHON_CMD% -m pip install fsspec || exit /B 1 + +@REM Install type checkers +%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 + +@REM Run type checkers +pushd %PYARROW_DIR% + +mypy +pyright +ty check diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh new file mode 100755 index 00000000000..82610ce6630 --- /dev/null +++ b/ci/scripts/python_test_type_annotations.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex +pyarrow_dir=${1} + +if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then + # Install library stubs + pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil + + # Install type checkers + pip install mypy pyright ty + + # Install other dependencies for type checking + pip install fsspec + + # Run type checkers + pushd ${pyarrow_dir} + mypy + pyright + ty check; +else + echo "Skipping type annotation tests"; +fi diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index bd61154430e..b64eee623f3 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -177,6 +177,11 @@ export CMAKE_PREFIX_PATH=${build_dir}/install export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python +# We first populate stub docstrings and then build the wheel +python setup.py build_ext --inplace +python -m pip install griffe libcst +python ../dev/update_stub_docstrings.py pyarrow-stubs + python setup.py bdist_wheel popd diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 84fcaba42e6..ee4a31aedb8 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -35,6 +35,11 @@ def validate_wheel(path): assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" print(f"The wheel: {wheels[0]} seems valid.") + candidates = [info for info in f.filelist if info.filename.endswith('compute.pyi')] + assert candidates, "compute.pyi not found in wheel" + content = f.read(candidates[0]).decode('utf-8', errors='replace') + assert '"""' in content, "compute.pyi missing docstrings (no triple quotes found)" + def main(): parser = argparse.ArgumentParser() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index b4b7fed99fd..3da7f60f182 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -135,6 +135,11 @@ pushd C:\arrow\python @REM Build wheel %PYTHON_CMD% setup.py bdist_wheel || exit /B 1 +@REM We first populate stub docstrings and then build the wheel +%PYTHON_CMD% setup.py build_ext --inplace +%PYTHON_CMD% -m pip install griffe libcst +%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs + @REM Repair the wheel with delvewheel @REM @REM Since we bundled the Arrow C++ libraries ourselves, we only need to diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index a3fbeb3c0b3..977ef64e008 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -167,6 +167,11 @@ export ARROW_HOME=/tmp/arrow-dist export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python +# We first populate stub docstrings and then build the wheel +python setup.py build_ext --inplace +python -m pip install griffe libcst +python ../dev/update_stub_docstrings.py pyarrow-stubs + python setup.py bdist_wheel echo "=== Strip symbols from wheel ===" diff --git a/compose.yaml b/compose.yaml index 2bd38a381e8..ae0a1d42439 100644 --- a/compose.yaml +++ b/compose.yaml @@ -919,12 +919,14 @@ services: environment: <<: [*common, *ccache, *sccache] PYTEST_ARGS: # inherit + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: &python-conda-command [" /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-emscripten: # Usage: @@ -1001,6 +1003,7 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" + PYARROW_TEST_ANNOTATIONS: "ON" SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -1008,7 +1011,8 @@ services: /bin/bash -c " /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow" + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python" debian-python: # Usage: @@ -1500,6 +1504,7 @@ services: python: ${PYTHON} shm_size: *shm-size environment: + PYARROW_TEST_ANNOTATIONS: "ON" <<: [*common, *ccache, *sccache] PARQUET_REQUIRE_ENCRYPTION: # inherit HYPOTHESIS_PROFILE: # inherit @@ -1510,7 +1515,8 @@ services: /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && mamba uninstall -y numpy && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-docs: # Usage: @@ -1530,13 +1536,15 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-dask: # Possible $DASK parameters: diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst index d03b2439b10..c23891e94d0 100644 --- a/docs/source/developers/python/development.rst +++ b/docs/source/developers/python/development.rst @@ -42,7 +42,7 @@ Unit Testing ============ We are using `pytest `_ to develop our unit -test suite. After `building the project `_ you can run its unit tests +test suite. After `building the project `_ you can run its unit tests like so: .. code-block:: @@ -101,6 +101,74 @@ The test groups currently include: * ``s3``: Tests for Amazon S3 * ``tensorflow``: Tests that involve TensorFlow +Type Checking +============= + +PyArrow provides type stubs (``*.pyi`` files) for static type checking. These +stubs are located in the ``pyarrow-stubs/`` directory and are automatically +included in the distributed wheel packages. + +Running Type Checkers +--------------------- + +We support multiple type checkers. Their configurations are in +``pyproject.toml``. + +**mypy** + +To run mypy on the PyArrow codebase: + +.. code-block:: + + $ cd arrow/python + $ mypy + +The mypy configuration is in the ``[tool.mypy]`` section of ``pyproject.toml``. + +**pyright** + +To run pyright: + +.. code-block:: + + $ cd arrow/python + $ pyright + +The pyright configuration is in the ``[tool.pyright]`` section of ``pyproject.toml``. + +**ty** + +To run ty (note: currently only partially configured): + +.. code-block:: + + $ cd arrow/python + $ ty check + +Maintaining Type Stubs +----------------------- + +Type stubs for PyArrow are maintained in the ``pyarrow-stubs/`` +directory. These stubs mirror the structure of the main ``pyarrow/`` package. + +When adding or modifying public APIs: + +1. **Update the corresponding ``.pyi`` stub file** in ``pyarrow-stubs/`` + to reflect the new or changed function/class signatures. + +2. **Include type annotations** where possible. For Cython modules or + dynamically generated APIs such as compute kernels add the corresponding + stub in ``pyarrow-stubs/``. + +3. **Run type checkers** to ensure the stubs are correct and complete. + +The stub files are automatically copied into the built wheel during the build +process and will be included when users install PyArrow, enabling type checking +in downstream projects and for users' IDEs. + +Note: ``py.typed`` marker file in the ``pyarrow/`` directory indicates to type +checkers that PyArrow supports type checking according to :pep:`561`. + Doctest ======= diff --git a/python/MANIFEST.in b/python/MANIFEST.in index ed7012e4b70..2840ba74128 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -4,6 +4,7 @@ include ../NOTICE.txt global-include CMakeLists.txt graft pyarrow +graft pyarrow-stubs graft cmake_modules global-exclude *.so diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi new file mode 100644 index 00000000000..2a68a513099 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for PyArrow. + +This is a placeholder stub file. +Complete type annotations will be added in subsequent PRs. +""" + +from typing import Any + +def __getattr__(name: str) -> Any: ... diff --git a/python/pyarrow/py.typed b/python/pyarrow/py.typed new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyproject.toml b/python/pyproject.toml index 899144d418d..9f62f028961 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -84,11 +84,11 @@ zip-safe=false include-package-data=true [tool.setuptools.packages.find] -include = ["pyarrow"] +include = ["pyarrow", "pyarrow.*"] namespaces = false [tool.setuptools.package-data] -pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"] +pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd", "py.typed"] [tool.setuptools_scm] root = '..' @@ -96,3 +96,39 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' + +# TODO: Enable type checking once stubs are merged +[tool.mypy] +files = ["pyarrow-stubs"] +mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" +exclude = [ + "^pyarrow/", + "^benchmarks/", + "^examples/", + "^scripts/", +] + +# TODO: Enable type checking once stubs are merged +[tool.pyright] +pythonPlatform = "All" +pythonVersion = "3.10" +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", + "build", +] +stubPath = "pyarrow-stubs" +typeCheckingMode = "basic" + +# TODO: Enable type checking once stubs are merged +[tool.ty.src] +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", +] diff --git a/python/setup.py b/python/setup.py index a27bd3baefd..a25d2d76b36 100755 --- a/python/setup.py +++ b/python/setup.py @@ -121,8 +121,35 @@ def build_extensions(self): def run(self): self._run_cmake() + self._copy_stubs() _build_ext.run(self) + def _copy_stubs(self): + """Copy .pyi stub files from pyarrow-stubs to the build directory.""" + build_cmd = self.get_finalized_command('build') + build_lib = os.path.abspath(build_cmd.build_lib) + + stubs_src = pjoin(setup_dir, 'pyarrow-stubs', 'pyarrow') + stubs_dest = pjoin(build_lib, 'pyarrow') + + if os.path.exists(stubs_src): + print(f"-- Copying stub files from {stubs_src} to {stubs_dest}") + for root, dirs, files in os.walk(stubs_src): + # Calculate relative path from stubs_src + rel_dir = os.path.relpath(root, stubs_src) + dest_dir = pjoin(stubs_dest, rel_dir) if rel_dir != '.' else stubs_dest + + # Create destination directory if needed + if not os.path.exists(dest_dir): + os.makedirs(dest_dir) + + # Copy .pyi files + for file in files: + if file.endswith('.pyi'): + src_file = pjoin(root, file) + dest_file = pjoin(dest_dir, file) + shutil.copy2(src_file, dest_file) + # adapted from cmake_build_ext in dynd-python # github.com/libdynd/dynd-python From 97596fb9f7c82c3a557dbf84f7f20b6b319bb102 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 23 Dec 2025 11:48:34 +0100 Subject: [PATCH 02/19] Update ci/scripts/python_test_type_annotations.sh --- ci/scripts/python_test_type_annotations.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index 82610ce6630..1ed08964380 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -22,7 +22,7 @@ pyarrow_dir=${1} if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then # Install library stubs - pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil + pip install pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil # Install type checkers pip install mypy pyright ty From a5e35c28284403b9211cd103fc639034fa75f170 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 11:33:24 +0100 Subject: [PATCH 03/19] Apply suggestion from @raulcd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Cumplido --- ci/scripts/python_wheel_validate_contents.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index ee4a31aedb8..acab56af941 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -35,6 +35,7 @@ def validate_wheel(path): assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" print(f"The wheel: {wheels[0]} seems valid.") + # Validate at least one typing stub has been generated. candidates = [info for info in f.filelist if info.filename.endswith('compute.pyi')] assert candidates, "compute.pyi not found in wheel" content = f.read(candidates[0]).decode('utf-8', errors='replace') From 90622268722c5edfe156c99138d67c8804964542 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 11:39:12 +0100 Subject: [PATCH 04/19] review feedback --- ci/scripts/python_test_type_annotations.bat | 7 ++----- ci/scripts/python_test_type_annotations.sh | 11 ++++++----- ci/scripts/python_wheel_windows_build.bat | 1 - docs/source/developers/python/development.rst | 2 +- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat index 3446e329a89..e4dc09d465d 100644 --- a/ci/scripts/python_test_type_annotations.bat +++ b/ci/scripts/python_test_type_annotations.bat @@ -21,11 +21,8 @@ set PYARROW_DIR=%1 echo Annotation testing on Windows ... -@REM Install library stubs -%PYTHON_CMD% -m pip install pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 - -@REM Install other dependencies for type checking -%PYTHON_CMD% -m pip install fsspec || exit /B 1 +@REM Install library stubs. Note some libraries contain their own type hints so they need to be installed +%PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 @REM Install type checkers %PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index 1ed08964380..95ca9e5fe39 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -21,15 +21,16 @@ set -ex pyarrow_dir=${1} if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then - # Install library stubs - pip install pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil + if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + . "${ARROW_PYTHON_VENV}/bin/activate" + fi + + # Install library stubs. Note some libraries contain their own type hints so they need to be installed + pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil # Install type checkers pip install mypy pyright ty - # Install other dependencies for type checking - pip install fsspec - # Run type checkers pushd ${pyarrow_dir} mypy diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 3da7f60f182..b640c0d0627 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -138,7 +138,6 @@ pushd C:\arrow\python @REM We first populate stub docstrings and then build the wheel %PYTHON_CMD% setup.py build_ext --inplace %PYTHON_CMD% -m pip install griffe libcst -%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs @REM Repair the wheel with delvewheel @REM diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst index c23891e94d0..596715b9217 100644 --- a/docs/source/developers/python/development.rst +++ b/docs/source/developers/python/development.rst @@ -42,7 +42,7 @@ Unit Testing ============ We are using `pytest `_ to develop our unit -test suite. After `building the project `_ you can run its unit tests +test suite. After `building the project `_ you can run its unit tests like so: .. code-block:: From a7fba0b121dd509bcbd3e9aeb63d0c49f5fcf5d8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 15:30:49 +0100 Subject: [PATCH 05/19] include dev/update_stub_docstrings.py --- ci/scripts/python_wheel_windows_build.bat | 1 + dev/update_stub_docstrings.py | 215 ++++++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 dev/update_stub_docstrings.py diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index b640c0d0627..3da7f60f182 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -138,6 +138,7 @@ pushd C:\arrow\python @REM We first populate stub docstrings and then build the wheel %PYTHON_CMD% setup.py build_ext --inplace %PYTHON_CMD% -m pip install griffe libcst +%PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs @REM Repair the wheel with delvewheel @REM diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py new file mode 100644 index 00000000000..2b0933d5351 --- /dev/null +++ b/dev/update_stub_docstrings.py @@ -0,0 +1,215 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Utility to extract docstrings from pyarrow and update +# docstrings in stubfiles. +# +# Usage +# ===== +# +# python ../dev/update_stub_docstrings.py pyarrow-stubs + + +from pathlib import Path +from textwrap import indent + +import click +# TODO: perhaps replace griffe with importlib +import griffe +from griffe import AliasResolutionError +import libcst +from libcst import matchers as m + + +def _get_docstring(name, package, indentation): + # print("extract_docstrings", name) + try: + obj = package.get_member(name) + except (KeyError, ValueError, AliasResolutionError): + # Some cython __init__ symbols can't be found + # e.g. pyarrow.lib.OSFile.__init__ + stack = name.split(".") + parent_name = ".".join(stack[:-1]) + + try: + obj = package.get_member(parent_name).all_members[stack[-1]] + except (KeyError, ValueError, AliasResolutionError): + print(f"{name} not found in {package.name}, it's probably ok.") + return None + + if obj.has_docstring: + docstring = obj.docstring.value + # Remove signature if present in docstring + if docstring.startswith(obj.name) or ( + (hasattr(obj.parent, "name") and + docstring.startswith(f"{obj.parent.name}.{obj.name}"))): + docstring = "\n".join(docstring.splitlines()[2:]) + # Skip empty docstrings + if docstring.strip() == "": + return None + # Indent docstring + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + return docstring + return None + + +class ReplaceEllipsis(libcst.CSTTransformer): + def __init__(self, package, namespace): + self.package = package + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + # Insert module level docstring if _clone_signature is used + def leave_Module(self, original_node, updated_node): + new_body = [] + clone_matcher = m.SimpleStatementLine( + body=[m.Assign( + value=m.Call(func=m.Name(value="_clone_signature")) + ), m.ZeroOrMore()] + ) + for statement in updated_node.body: + new_body.append(statement) + if m.matches(statement, clone_matcher): + name = statement.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.package, 0) + if docstring is not None: + new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) + new_line = libcst.SimpleStatementLine(body=[new_expr]) + new_body.append(new_line) + + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name + + class_matcher_1 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.SimpleStatementLine( + body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()] + ), m.ZeroOrMore()] + ) + ) + class_matcher_2 = m.ClassDef( + name=m.Name(), + body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()] + ) + ) + + if m.matches(updated_node, class_matcher_1): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_node = libcst.SimpleString(value=docstring) + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, new_node) + + if m.matches(updated_node, class_matcher_2): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + list(updated_node.body.body) + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = ".".join(self.stack) + if self.base_namespace: + name = self.base_namespace + "." + name + + function_matcher = m.FunctionDef( + name=m.Name(), + body=m.SimpleStatementSuite( + body=[m.Expr( + m.Ellipsis() + )])) + if m.matches(original_node, function_matcher): + docstring = _get_docstring(name, self.package, self.indentation) + if docstring is not None: + new_docstring = libcst.SimpleString(value=docstring) + new_body = [ + libcst.SimpleWhitespace(self.indentation * " "), + libcst.Expr(value=new_docstring), + libcst.Newline() + ] + new_body = libcst.IndentedBlock(body=new_body) + updated_node = updated_node.with_changes(body=new_body) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +@click.command() +@click.argument('pyarrow_folder', type=click.Path(resolve_path=True)) +def add_docs_to_stub_files(pyarrow_folder): + print("Updating docstrings of stub files in:", pyarrow_folder) + package = griffe.load("pyarrow", try_relative_path=True, + force_inspection=True, resolve_aliases=True) + lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", + "_types"] + + for stub_file in Path(pyarrow_folder).rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + module = stub_file.with_suffix('').name + print(f"[{stub_file} {module}]") + + with open(stub_file, 'r') as f: + tree = libcst.parse_module(f.read()) + + if module in lib_modules: + module = "lib" + elif stub_file.parent.name in ["parquet", "interchange"]: + module = f"{stub_file.parent.name}.{module}" + elif module == "__init__": + module = "" + + modified_tree = tree.visit(ReplaceEllipsis(package, module)) + with open(stub_file, "w") as f: + f.write(modified_tree.code) + print("\n") + + +if __name__ == "__main__": + docstrings_map = {} + add_docs_to_stub_files(obj={}) + From f3528dbd81e3f7fe06fa463ac77d5a5a7563560b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 16:11:52 +0100 Subject: [PATCH 06/19] Use PYARROW_TEST_ANNOTATIONS in windows build, disable wheel docstring check --- ci/scripts/python_test_type_annotations.bat | 22 +++++++++++--------- ci/scripts/python_test_type_annotations.sh | 2 +- ci/scripts/python_wheel_validate_contents.py | 8 +------ ci/scripts/python_wheel_windows_build.bat | 6 +++--- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat index e4dc09d465d..ed6a2e664d7 100644 --- a/ci/scripts/python_test_type_annotations.bat +++ b/ci/scripts/python_test_type_annotations.bat @@ -19,17 +19,19 @@ set PYARROW_DIR=%1 -echo Annotation testing on Windows ... +if "%PYARROW_TEST_ANNOTATIONS%"=="ON" ( + echo Annotation testing on Windows ... -@REM Install library stubs. Note some libraries contain their own type hints so they need to be installed -%PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 + @REM Install library stubs. Note some libraries contain their own type hints so they need to be installed. + %PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 -@REM Install type checkers -%PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 + @REM Install type checkers + %PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 -@REM Run type checkers -pushd %PYARROW_DIR% + @REM Run type checkers + pushd %PYARROW_DIR% -mypy -pyright -ty check + mypy + pyright + ty check +) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index 95ca9e5fe39..260f1bea42b 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -25,7 +25,7 @@ if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then . "${ARROW_PYTHON_VENV}/bin/activate" fi - # Install library stubs. Note some libraries contain their own type hints so they need to be installed + # Install library stubs. Note some libraries contain their own type hints so they need to be installed. pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil # Install type checkers diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index acab56af941..7d41b1b7385 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -34,13 +34,7 @@ def validate_wheel(path): ] assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" print(f"The wheel: {wheels[0]} seems valid.") - - # Validate at least one typing stub has been generated. - candidates = [info for info in f.filelist if info.filename.endswith('compute.pyi')] - assert candidates, "compute.pyi not found in wheel" - content = f.read(candidates[0]).decode('utf-8', errors='replace') - assert '"""' in content, "compute.pyi missing docstrings (no triple quotes found)" - + # TODO(GH-32609): Validate some docstrings were generated and added. def main(): parser = argparse.ArgumentParser() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 3da7f60f182..7fe431a0c89 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -132,14 +132,14 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python -@REM Build wheel -%PYTHON_CMD% setup.py bdist_wheel || exit /B 1 - @REM We first populate stub docstrings and then build the wheel %PYTHON_CMD% setup.py build_ext --inplace %PYTHON_CMD% -m pip install griffe libcst %PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs +@REM Build wheel +%PYTHON_CMD% setup.py bdist_wheel || exit /B 1 + @REM Repair the wheel with delvewheel @REM @REM Since we bundled the Arrow C++ libraries ourselves, we only need to From 7e160c124814307e0094e72f641171b4980da59f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 16:14:52 +0100 Subject: [PATCH 07/19] work on dev/update_stub_docstrings.py --- dev/update_stub_docstrings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 2b0933d5351..ff581700de5 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -210,6 +210,5 @@ def add_docs_to_stub_files(pyarrow_folder): if __name__ == "__main__": - docstrings_map = {} add_docs_to_stub_files(obj={}) From 74cf0fce37ef1ad773d018b634f5b7dcd40f853a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 16:23:57 +0100 Subject: [PATCH 08/19] further work on dev/update_stub_docstrings.py --- dev/update_stub_docstrings.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index ff581700de5..5f992cf51d7 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -133,13 +133,13 @@ def leave_ClassDef(self, original_node, updated_node): docstring = _get_docstring(name, self.package, self.indentation) if docstring is not None: new_docstring = libcst.SimpleString(value=docstring) - new_body = [ - libcst.SimpleWhitespace(self.indentation * " "), - libcst.Expr(value=new_docstring), - libcst.Newline() - ] + list(updated_node.body.body) - new_body = libcst.IndentedBlock(body=new_body) - updated_node = updated_node.with_changes(body=new_body) + new_docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=new_docstring)] + ) + new_body = [new_docstring_stmt] + list(updated_node.body.body) + updated_node = updated_node.with_changes( + body=updated_node.body.with_changes(body=new_body) + ) self.stack.pop() self.indentation -= 1 @@ -164,12 +164,10 @@ def leave_FunctionDef(self, original_node, updated_node): docstring = _get_docstring(name, self.package, self.indentation) if docstring is not None: new_docstring = libcst.SimpleString(value=docstring) - new_body = [ - libcst.SimpleWhitespace(self.indentation * " "), - libcst.Expr(value=new_docstring), - libcst.Newline() - ] - new_body = libcst.IndentedBlock(body=new_body) + new_docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=new_docstring)] + ) + new_body = libcst.IndentedBlock(body=[new_docstring_stmt]) updated_node = updated_node.with_changes(body=new_body) self.stack.pop() @@ -210,5 +208,5 @@ def add_docs_to_stub_files(pyarrow_folder): if __name__ == "__main__": - add_docs_to_stub_files(obj={}) + add_docs_to_stub_files() From 28ebbc26b365b341b1e7579b6d73c9df5aa7b956 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 16:35:59 +0100 Subject: [PATCH 09/19] lint --- dev/update_stub_docstrings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index 5f992cf51d7..d1513067b07 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -209,4 +209,3 @@ def add_docs_to_stub_files(pyarrow_folder): if __name__ == "__main__": add_docs_to_stub_files() - From 4109bb3f6b2df01b713562b84309bd65bcd78319 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 17:15:24 +0100 Subject: [PATCH 10/19] add click for docstring population --- ci/scripts/python_wheel_macos_build.sh | 2 +- ci/scripts/python_wheel_windows_build.bat | 2 +- ci/scripts/python_wheel_xlinux_build.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index b64eee623f3..6a3773551f1 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -179,7 +179,7 @@ export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python # We first populate stub docstrings and then build the wheel python setup.py build_ext --inplace -python -m pip install griffe libcst +python -m pip install click griffe libcst python ../dev/update_stub_docstrings.py pyarrow-stubs python setup.py bdist_wheel diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 7fe431a0c89..3aec1ea410f 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -134,7 +134,7 @@ pushd C:\arrow\python @REM We first populate stub docstrings and then build the wheel %PYTHON_CMD% setup.py build_ext --inplace -%PYTHON_CMD% -m pip install griffe libcst +%PYTHON_CMD% -m pip install click griffe libcst %PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs @REM Build wheel diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index 977ef64e008..b31a52137a3 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -169,7 +169,7 @@ export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python # We first populate stub docstrings and then build the wheel python setup.py build_ext --inplace -python -m pip install griffe libcst +python -m pip install click griffe libcst python ../dev/update_stub_docstrings.py pyarrow-stubs python setup.py bdist_wheel From 08ecb71109b9c7026db7783f1ff1e319afae4956 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 18:30:41 +0100 Subject: [PATCH 11/19] Remove dependencies on click and griffe --- ci/scripts/python_wheel_macos_build.sh | 2 +- ci/scripts/python_wheel_windows_build.bat | 2 +- ci/scripts/python_wheel_xlinux_build.sh | 2 +- dev/update_stub_docstrings.py | 176 ++++++++++++++++------ 4 files changed, 132 insertions(+), 50 deletions(-) diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 6a3773551f1..bbdf1a5d4f3 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -179,7 +179,7 @@ export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python # We first populate stub docstrings and then build the wheel python setup.py build_ext --inplace -python -m pip install click griffe libcst +python -m pip install libcst python ../dev/update_stub_docstrings.py pyarrow-stubs python setup.py bdist_wheel diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 3aec1ea410f..3c5d41967de 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -134,7 +134,7 @@ pushd C:\arrow\python @REM We first populate stub docstrings and then build the wheel %PYTHON_CMD% setup.py build_ext --inplace -%PYTHON_CMD% -m pip install click griffe libcst +%PYTHON_CMD% -m pip install libcst %PYTHON_CMD% ..\dev\update_stub_docstrings.py pyarrow-stubs @REM Build wheel diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index b31a52137a3..9ff871d4d18 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -169,7 +169,7 @@ export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python # We first populate stub docstrings and then build the wheel python setup.py build_ext --inplace -python -m pip install click griffe libcst +python -m pip install libcst python ../dev/update_stub_docstrings.py pyarrow-stubs python setup.py bdist_wheel diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index d1513067b07..e55b85acb1f 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -24,54 +24,118 @@ # python ../dev/update_stub_docstrings.py pyarrow-stubs +import argparse +import importlib +import inspect from pathlib import Path from textwrap import indent -import click -# TODO: perhaps replace griffe with importlib -import griffe -from griffe import AliasResolutionError import libcst from libcst import matchers as m -def _get_docstring(name, package, indentation): - # print("extract_docstrings", name) - try: - obj = package.get_member(name) - except (KeyError, ValueError, AliasResolutionError): - # Some cython __init__ symbols can't be found - # e.g. pyarrow.lib.OSFile.__init__ - stack = name.split(".") - parent_name = ".".join(stack[:-1]) +def _resolve_object(module, path): + """ + Resolve an object by dotted path from a base module. + Parameters + ---------- + module : module + The base module (e.g., pyarrow) + path : str + Dotted path like "lib.Array" or "lib.concat_arrays" + + Returns + ------- + tuple + (obj, parent, obj_name) or (None, None, None) if not found + """ + if not path: + return module, None, module.__name__ + + parts = path.split(".") + parent = None + obj = module + + for part in parts: + parent = obj try: - obj = package.get_member(parent_name).all_members[stack[-1]] - except (KeyError, ValueError, AliasResolutionError): - print(f"{name} not found in {package.name}, it's probably ok.") - return None - - if obj.has_docstring: - docstring = obj.docstring.value - # Remove signature if present in docstring - if docstring.startswith(obj.name) or ( - (hasattr(obj.parent, "name") and - docstring.startswith(f"{obj.parent.name}.{obj.name}"))): - docstring = "\n".join(docstring.splitlines()[2:]) - # Skip empty docstrings - if docstring.strip() == "": - return None - # Indent docstring - indentation_prefix = indentation * " " - docstring = indent(docstring + '\n"""', indentation_prefix) - docstring = '"""\n' + docstring - return docstring - return None + obj = getattr(obj, part) + except AttributeError: + # Fallback: try __dict__ access for special methods like __init__ + # that may not be directly accessible via getattr + if hasattr(parent, "__dict__"): + obj = parent.__dict__.get(part) + if obj is not None: + continue + # Try vars() as another fallback + try: + obj = vars(parent).get(part) + if obj is not None: + continue + except TypeError: + pass + return None, None, None + + # Get the object's simple name + obj_name = getattr(obj, "__name__", parts[-1]) + return obj, parent, obj_name + + +def _get_docstring(name, module, indentation): + """ + Extract and format docstring for a symbol. + + Parameters + ---------- + name : str + Dotted name like "lib.Array" or "lib.concat_arrays" + module : module + The pyarrow module + indentation : int + Number of indentation levels (4 spaces each) + + Returns + ------- + str or None + Formatted docstring ready for insertion, or None if not found + """ + obj, parent, obj_name = _resolve_object(module, name) + + if obj is None: + print(f"{name} not found in {module.__name__}, it's probably ok.") + return None + + # Get docstring using inspect.getdoc for cleaner formatting + docstring = inspect.getdoc(obj) + if not docstring: + return None + + # Get parent name for signature detection + parent_name = getattr(parent, "__name__", None) if parent else None + + # Remove signature if present in docstring + # Cython/pybind11 often include signatures like "func_name(...)\n\n..." + if docstring.startswith(obj_name) or ( + parent_name is not None and docstring.startswith(f"{parent_name}.{obj_name}") + ): + docstring = "\n".join(docstring.splitlines()[2:]) + + # Skip empty docstrings + if not docstring.strip(): + return None + + # Format as docstring with proper indentation + indentation_prefix = indentation * " " + docstring = indent(docstring + '\n"""', indentation_prefix) + docstring = '"""\n' + docstring + + return docstring class ReplaceEllipsis(libcst.CSTTransformer): - def __init__(self, package, namespace): - self.package = package + def __init__(self, module, namespace): + self.module = module self.base_namespace = namespace self.stack = [] self.indentation = 0 @@ -90,7 +154,7 @@ def leave_Module(self, original_node, updated_node): name = statement.body[0].targets[0].target.value if self.base_namespace: name = f"{self.base_namespace}.{name}" - docstring = _get_docstring(name, self.package, 0) + docstring = _get_docstring(name, self.module, 0) if docstring is not None: new_expr = libcst.Expr(value=libcst.SimpleString(docstring)) new_line = libcst.SimpleStatementLine(body=[new_expr]) @@ -123,14 +187,14 @@ def leave_ClassDef(self, original_node, updated_node): ) if m.matches(updated_node, class_matcher_1): - docstring = _get_docstring(name, self.package, self.indentation) + docstring = _get_docstring(name, self.module, self.indentation) if docstring is not None: new_node = libcst.SimpleString(value=docstring) updated_node = updated_node.deep_replace( updated_node.body.body[0].body[0].value, new_node) if m.matches(updated_node, class_matcher_2): - docstring = _get_docstring(name, self.package, self.indentation) + docstring = _get_docstring(name, self.module, self.indentation) if docstring is not None: new_docstring = libcst.SimpleString(value=docstring) new_docstring_stmt = libcst.SimpleStatementLine( @@ -161,7 +225,7 @@ def leave_FunctionDef(self, original_node, updated_node): m.Ellipsis() )])) if m.matches(original_node, function_matcher): - docstring = _get_docstring(name, self.package, self.indentation) + docstring = _get_docstring(name, self.module, self.indentation) if docstring is not None: new_docstring = libcst.SimpleString(value=docstring) new_docstring_stmt = libcst.SimpleStatementLine( @@ -175,17 +239,25 @@ def leave_FunctionDef(self, original_node, updated_node): return updated_node -@click.command() -@click.argument('pyarrow_folder', type=click.Path(resolve_path=True)) def add_docs_to_stub_files(pyarrow_folder): + """ + Update stub files with docstrings extracted from pyarrow runtime. + + Parameters + ---------- + pyarrow_folder : Path + Path to the pyarrow-stubs folder + """ print("Updating docstrings of stub files in:", pyarrow_folder) - package = griffe.load("pyarrow", try_relative_path=True, - force_inspection=True, resolve_aliases=True) + + # Load pyarrow using importlib + pyarrow_module = importlib.import_module("pyarrow") + lib_modules = ["array", "builder", "compat", "config", "device", "error", "io", "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"] - for stub_file in Path(pyarrow_folder).rglob('*.pyi'): + for stub_file in pyarrow_folder.rglob('*.pyi'): if stub_file.name == "_stubs_typing.pyi": continue module = stub_file.with_suffix('').name @@ -201,11 +273,21 @@ def add_docs_to_stub_files(pyarrow_folder): elif module == "__init__": module = "" - modified_tree = tree.visit(ReplaceEllipsis(package, module)) + modified_tree = tree.visit(ReplaceEllipsis(pyarrow_module, module)) with open(stub_file, "w") as f: f.write(modified_tree.code) print("\n") if __name__ == "__main__": - add_docs_to_stub_files() + parser = argparse.ArgumentParser( + description="Extract docstrings from pyarrow and update stub files." + ) + parser.add_argument( + "pyarrow_folder", + type=Path, + help="Path to the pyarrow-stubs folder" + ) + args = parser.parse_args() + + add_docs_to_stub_files(args.pyarrow_folder.resolve()) From a2b522de4edd570b4e0e14c81f7ceeb6492d7828 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 19:20:55 +0100 Subject: [PATCH 12/19] fix import paths --- dev/update_stub_docstrings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index e55b85acb1f..fe680740cf2 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -27,9 +27,13 @@ import argparse import importlib import inspect +import sys from pathlib import Path from textwrap import indent +# Add current directory to path to find locally built pyarrow +sys.path.insert(0, ".") + import libcst from libcst import matchers as m From cebbfcdf835219277100163e3699dc1c2184d312 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 14 Jan 2026 19:58:01 +0100 Subject: [PATCH 13/19] add PYARROW_TEST_ANNOTATIONS to AMD64 Windows 2022 Python 3.13 change bat lint add a popd and nicer logging for windows ReplaceElipsis -> DocstringInserter simplify remove sphinx --- .github/workflows/python.yml | 2 ++ ci/scripts/python_test_type_annotations.bat | 11 +++++++---- ci/scripts/python_test_type_annotations.sh | 1 + dev/update_stub_docstrings.py | 17 ++++++----------- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 4ca0f9b6dc6..e886e7527ec 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -303,5 +303,7 @@ jobs: call "ci\scripts\python_test.bat" %cd% - name: Test annotations shell: cmd + env: + PYARROW_TEST_ANNOTATIONS: "ON" run: | call "ci\scripts\python_test_type_annotations.bat" %cd%\python diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat index ed6a2e664d7..5a3d0952dc8 100644 --- a/ci/scripts/python_test_type_annotations.bat +++ b/ci/scripts/python_test_type_annotations.bat @@ -23,7 +23,7 @@ if "%PYARROW_TEST_ANNOTATIONS%"=="ON" ( echo Annotation testing on Windows ... @REM Install library stubs. Note some libraries contain their own type hints so they need to be installed. - %PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs sphinx types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 + %PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 @REM Install type checkers %PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 @@ -31,7 +31,10 @@ if "%PYARROW_TEST_ANNOTATIONS%"=="ON" ( @REM Run type checkers pushd %PYARROW_DIR% - mypy - pyright - ty check + mypy || exit /B 1 + pyright || exit /B 1 + ty check || exit /B 1 + popd +) else ( + echo Annotation testing skipped on Windows ... ) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index 260f1bea42b..05586b6e1e3 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -36,6 +36,7 @@ if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then mypy pyright ty check; + popd else echo "Skipping type annotation tests"; fi diff --git a/dev/update_stub_docstrings.py b/dev/update_stub_docstrings.py index fe680740cf2..1ba69fa27f5 100644 --- a/dev/update_stub_docstrings.py +++ b/dev/update_stub_docstrings.py @@ -31,12 +31,12 @@ from pathlib import Path from textwrap import indent -# Add current directory to path to find locally built pyarrow -sys.path.insert(0, ".") - import libcst from libcst import matchers as m +# Add current directory to path to find locally built pyarrow +sys.path.insert(0, ".") + def _resolve_object(module, path): """ @@ -66,13 +66,8 @@ def _resolve_object(module, path): try: obj = getattr(obj, part) except AttributeError: - # Fallback: try __dict__ access for special methods like __init__ + # Fallback: try vars() for special methods like __init__ # that may not be directly accessible via getattr - if hasattr(parent, "__dict__"): - obj = parent.__dict__.get(part) - if obj is not None: - continue - # Try vars() as another fallback try: obj = vars(parent).get(part) if obj is not None: @@ -137,7 +132,7 @@ def _get_docstring(name, module, indentation): return docstring -class ReplaceEllipsis(libcst.CSTTransformer): +class DocstringInserter(libcst.CSTTransformer): def __init__(self, module, namespace): self.module = module self.base_namespace = namespace @@ -277,7 +272,7 @@ def add_docs_to_stub_files(pyarrow_folder): elif module == "__init__": module = "" - modified_tree = tree.visit(ReplaceEllipsis(pyarrow_module, module)) + modified_tree = tree.visit(DocstringInserter(pyarrow_module, module)) with open(stub_file, "w") as f: f.write(modified_tree.code) print("\n") From b5f357b9ffaea5efd8a5bf49eca8c3271a9a8e0f Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 23 Jan 2026 17:19:45 +0100 Subject: [PATCH 14/19] move check to pre-commit --- .github/workflows/python.yml | 11 ---- .pre-commit-config.yaml | 58 +++++++++++++++++++++ ci/scripts/python_test_type_annotations.bat | 40 -------------- ci/scripts/python_test_type_annotations.sh | 42 --------------- compose.yaml | 16 ++---- 5 files changed, 62 insertions(+), 105 deletions(-) delete mode 100644 ci/scripts/python_test_type_annotations.bat delete mode 100755 ci/scripts/python_test_type_annotations.sh diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index e886e7527ec..e5d367958dd 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -239,11 +239,6 @@ jobs: - name: Test shell: bash run: ci/scripts/python_test.sh $(pwd) $(pwd)/build - - name: Test annotations - shell: bash - env: - PYARROW_TEST_ANNOTATIONS: "ON" - run: ci/scripts/python_test_type_annotations.sh $(pwd)/python windows: name: AMD64 Windows 2022 Python 3.13 @@ -301,9 +296,3 @@ jobs: shell: cmd run: | call "ci\scripts\python_test.bat" %cd% - - name: Test annotations - shell: cmd - env: - PYARROW_TEST_ANNOTATIONS: "ON" - run: | - call "ci\scripts\python_test_type_annotations.bat" %cd%\python diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4c4f04188d..1e370d32779 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -188,6 +188,64 @@ repos: ?^python/pyarrow/util\.py$| ?^python/pyarrow/vendored/| ) + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.15.0 + hooks: + - id: mypy + alias: python + name: Python (Stubs) Type Check (mypy) + args: + - "--config-file=python/pyproject.toml" + files: >- + ^python/pyarrow-stubs/ + pass_filenames: false + additional_dependencies: + - fsspec + - pandas-stubs + - scipy-stubs + - types-cffi + - types-psutil + - types-requests + - types-python-dateutil + - repo: https://github.com/RobertCraiworthy/mirrors-pyright + rev: v1.1.398 + hooks: + - id: pyright + alias: python + name: Python (Stubs) Type Check (pyright) + args: + - "--project=python/pyproject.toml" + files: >- + ^python/pyarrow-stubs/ + pass_filenames: false + additional_dependencies: + - fsspec + - pandas-stubs + - scipy-stubs + - types-cffi + - types-psutil + - types-requests + - types-python-dateutil + - repo: https://github.com/astral-sh/ty + rev: 0.0.13 + hooks: + - id: ty + alias: python + name: Python (Stubs) Type Check (ty) + args: + - "check" + - "python/pyarrow-stubs" + files: >- + ^python/pyarrow-stubs/ + pass_filenames: false + additional_dependencies: + - fsspec + - pandas-stubs + - scipy-stubs + - types-cffi + - types-psutil + - types-requests + - types-python-dateutil - repo: local hooks: - id: lintr diff --git a/ci/scripts/python_test_type_annotations.bat b/ci/scripts/python_test_type_annotations.bat deleted file mode 100644 index 5a3d0952dc8..00000000000 --- a/ci/scripts/python_test_type_annotations.bat +++ /dev/null @@ -1,40 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. - -@echo on - -set PYARROW_DIR=%1 - -if "%PYARROW_TEST_ANNOTATIONS%"=="ON" ( - echo Annotation testing on Windows ... - - @REM Install library stubs. Note some libraries contain their own type hints so they need to be installed. - %PYTHON_CMD% -m pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil || exit /B 1 - - @REM Install type checkers - %PYTHON_CMD% -m pip install mypy pyright ty || exit /B 1 - - @REM Run type checkers - pushd %PYARROW_DIR% - - mypy || exit /B 1 - pyright || exit /B 1 - ty check || exit /B 1 - popd -) else ( - echo Annotation testing skipped on Windows ... -) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh deleted file mode 100755 index 05586b6e1e3..00000000000 --- a/ci/scripts/python_test_type_annotations.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex -pyarrow_dir=${1} - -if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then - if [ -n "${ARROW_PYTHON_VENV:-}" ]; then - . "${ARROW_PYTHON_VENV}/bin/activate" - fi - - # Install library stubs. Note some libraries contain their own type hints so they need to be installed. - pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil - - # Install type checkers - pip install mypy pyright ty - - # Run type checkers - pushd ${pyarrow_dir} - mypy - pyright - ty check; - popd -else - echo "Skipping type annotation tests"; -fi diff --git a/compose.yaml b/compose.yaml index ae0a1d42439..2bd38a381e8 100644 --- a/compose.yaml +++ b/compose.yaml @@ -919,14 +919,12 @@ services: environment: <<: [*common, *ccache, *sccache] PYTEST_ARGS: # inherit - PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: &python-conda-command [" /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow && - /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] + /arrow/ci/scripts/python_test.sh /arrow"] conda-python-emscripten: # Usage: @@ -1003,7 +1001,6 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" - PYARROW_TEST_ANNOTATIONS: "ON" SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -1011,8 +1008,7 @@ services: /bin/bash -c " /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow && - /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python" + /arrow/ci/scripts/python_test.sh /arrow" debian-python: # Usage: @@ -1504,7 +1500,6 @@ services: python: ${PYTHON} shm_size: *shm-size environment: - PYARROW_TEST_ANNOTATIONS: "ON" <<: [*common, *ccache, *sccache] PARQUET_REQUIRE_ENCRYPTION: # inherit HYPOTHESIS_PROFILE: # inherit @@ -1515,8 +1510,7 @@ services: /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && mamba uninstall -y numpy && - /arrow/ci/scripts/python_test.sh /arrow && - /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] + /arrow/ci/scripts/python_test.sh /arrow"] conda-python-docs: # Usage: @@ -1536,15 +1530,13 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" - PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow && - /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] + /arrow/ci/scripts/python_test.sh /arrow"] conda-python-dask: # Possible $DASK parameters: From 577068bdb4dff2be857d40284b884aa1bed550a0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 23 Jan 2026 17:41:43 +0100 Subject: [PATCH 15/19] change pre-commit, add note --- .pre-commit-config.yaml | 15 ++++++++------- python/pyarrow-stubs/pyarrow/__init__.pyi | 3 +++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1e370d32779..8aaf81837e0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -207,8 +207,8 @@ repos: - types-psutil - types-requests - types-python-dateutil - - repo: https://github.com/RobertCraiworthy/mirrors-pyright - rev: v1.1.398 + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.408 hooks: - id: pyright alias: python @@ -218,6 +218,8 @@ repos: files: >- ^python/pyarrow-stubs/ pass_filenames: false + env: + PYRIGHT_PYTHON_FORCE_VERSION: latest additional_dependencies: - fsspec - pandas-stubs @@ -226,19 +228,18 @@ repos: - types-psutil - types-requests - types-python-dateutil - - repo: https://github.com/astral-sh/ty - rev: 0.0.13 + - repo: local hooks: - id: ty alias: python name: Python (Stubs) Type Check (ty) - args: - - "check" - - "python/pyarrow-stubs" + language: python + entry: ty check python/pyarrow-stubs files: >- ^python/pyarrow-stubs/ pass_filenames: false additional_dependencies: + - ty - fsspec - pandas-stubs - scipy-stubs diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi index 2a68a513099..ccec8d5abc0 100644 --- a/python/pyarrow-stubs/pyarrow/__init__.pyi +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -23,4 +23,7 @@ Complete type annotations will be added in subsequent PRs. from typing import Any +# TODO(GH-48970): remove __getattr__ before release as this +# will annotate non-existing attributes as Any. +# https://github.com/apache/arrow/issues/48970 def __getattr__(name: str) -> Any: ... From 27a4a65b20ea8f162bf8698848f1855d0506df27 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 23 Jan 2026 18:01:08 +0100 Subject: [PATCH 16/19] change pre-commit script --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8aaf81837e0..bd584e3689e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -196,6 +196,7 @@ repos: name: Python (Stubs) Type Check (mypy) args: - "--config-file=python/pyproject.toml" + - "python/pyarrow-stubs" files: >- ^python/pyarrow-stubs/ pass_filenames: false From 6f2f175b4e099ec6c4ee336cb263d8a8f6742b16 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 23 Jan 2026 19:13:49 +0100 Subject: [PATCH 17/19] Revert from pre-commit to ci/scripts/python_test_type_annotations.sh --- .pre-commit-config.yaml | 60 ---------------------- ci/scripts/python_test_type_annotations.sh | 42 +++++++++++++++ compose.yaml | 4 +- 3 files changed, 45 insertions(+), 61 deletions(-) create mode 100755 ci/scripts/python_test_type_annotations.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd584e3689e..c4c4f04188d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -188,66 +188,6 @@ repos: ?^python/pyarrow/util\.py$| ?^python/pyarrow/vendored/| ) - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 - hooks: - - id: mypy - alias: python - name: Python (Stubs) Type Check (mypy) - args: - - "--config-file=python/pyproject.toml" - - "python/pyarrow-stubs" - files: >- - ^python/pyarrow-stubs/ - pass_filenames: false - additional_dependencies: - - fsspec - - pandas-stubs - - scipy-stubs - - types-cffi - - types-psutil - - types-requests - - types-python-dateutil - - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.408 - hooks: - - id: pyright - alias: python - name: Python (Stubs) Type Check (pyright) - args: - - "--project=python/pyproject.toml" - files: >- - ^python/pyarrow-stubs/ - pass_filenames: false - env: - PYRIGHT_PYTHON_FORCE_VERSION: latest - additional_dependencies: - - fsspec - - pandas-stubs - - scipy-stubs - - types-cffi - - types-psutil - - types-requests - - types-python-dateutil - - repo: local - hooks: - - id: ty - alias: python - name: Python (Stubs) Type Check (ty) - language: python - entry: ty check python/pyarrow-stubs - files: >- - ^python/pyarrow-stubs/ - pass_filenames: false - additional_dependencies: - - ty - - fsspec - - pandas-stubs - - scipy-stubs - - types-cffi - - types-psutil - - types-requests - - types-python-dateutil - repo: local hooks: - id: lintr diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh new file mode 100755 index 00000000000..05586b6e1e3 --- /dev/null +++ b/ci/scripts/python_test_type_annotations.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -ex +pyarrow_dir=${1} + +if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then + if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + . "${ARROW_PYTHON_VENV}/bin/activate" + fi + + # Install library stubs. Note some libraries contain their own type hints so they need to be installed. + pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil + + # Install type checkers + pip install mypy pyright ty + + # Run type checkers + pushd ${pyarrow_dir} + mypy + pyright + ty check; + popd +else + echo "Skipping type annotation tests"; +fi diff --git a/compose.yaml b/compose.yaml index 2bd38a381e8..741e5601ecf 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1530,13 +1530,15 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" + PYARROW_TEST_ANNOTATIONS: "ON" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-dask: # Possible $DASK parameters: From 2ccbb37e103c82bd73f77672fc126e2e47571359 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 25 Jan 2026 11:07:57 +0100 Subject: [PATCH 18/19] Apply suggestions from code review Co-authored-by: Sutou Kouhei --- ci/scripts/python_test_type_annotations.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/python_test_type_annotations.sh b/ci/scripts/python_test_type_annotations.sh index 05586b6e1e3..3d740c0a059 100755 --- a/ci/scripts/python_test_type_annotations.sh +++ b/ci/scripts/python_test_type_annotations.sh @@ -35,8 +35,8 @@ if [ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]; then pushd ${pyarrow_dir} mypy pyright - ty check; + ty check popd else - echo "Skipping type annotation tests"; + echo "Skipping type annotation tests" fi From b1b7cd9476fa3330d0c86c7e5bd950e9aed734ee Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 25 Jan 2026 11:22:38 +0100 Subject: [PATCH 19/19] apply review suggestion --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4c4f04188d..fb46b2eda09 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -337,6 +337,7 @@ repos: ?^ci/scripts/python_sdist_build\.sh$| ?^ci/scripts/python_sdist_test\.sh$| ?^ci/scripts/python_wheel_unix_test\.sh$| + ?^ci/scripts/python_test_type_annotations\.sh$| ?^ci/scripts/r_build\.sh$| ?^ci/scripts/r_revdepcheck\.sh$| ?^ci/scripts/release_test\.sh$| @@ -379,6 +380,7 @@ repos: # TODO: Remove this when we fix all lint failures files: >- ( + ?^ci/scripts/python_test_type_annotations\.sh$| ?^dev/release/05-binary-upload\.sh$| ?^dev/release/binary-recover\.sh$| ?^dev/release/post-03-binary\.sh$|