diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fe0aef9..2088b0d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,6 +18,10 @@ jobs: uses: actions/checkout@v4 with: submodules: recursive + - name: Pull submodules + run: | + git submodule update --init --recursive + git submodule update --recursive - name: Set up Python uses: actions/setup-python@v5 with: @@ -25,6 +29,14 @@ jobs: - name: Install build dependencies run: | python -m pip install build --user + - name: Build Substrait planloader library + run: | + cd ${{ github.workspace }}/third_party/substrait-cpp + make release + - name: Install Substrait planloader library + run: | + cd ${{ github.workspace }}/third_party/substrait-cpp/build-Release/export/planloader + make install - name: Build package run: | python -m build diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a770484..f6fbe48 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,10 +22,22 @@ jobs: uses: actions/checkout@v4 with: submodules: recursive + - name: Pull submodules + run: | + git submodule update --init --recursive + git submodule update --recursive - name: Set up Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} + - name: Build Substrait planloader library + run: | + cd ${{ github.workspace }}/third_party/substrait-cpp + make release + - name: Install Substrait planloader library + run: | + cd ${{ github.workspace }}/third_party/substrait-cpp/build-Release/export/planloader + make install - name: Install package and test dependencies run: | python -m pip install --upgrade pip diff --git a/.gitmodules b/.gitmodules index d9705e1..b8c576e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/substrait"] path = third_party/substrait url = https://github.com/substrait-io/substrait +[submodule "third_party/substrait-cpp"] + path = third_party/substrait-cpp + url = git@github.com:substrait-io/substrait-cpp.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3421534..3d932c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,6 +50,16 @@ Generate the protobuf files manually. Requires protobuf `v3.20.1`. # Build + +## Build and install the textplan loader dynamic library +```commandline +pushd third_party/substrait-cpp +make release +cd build-Release/export/planloader +make install +popd +``` + ## Python package Editable installation. ``` diff --git a/README.md b/README.md index 19d663e..9be0d98 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ This project is not an execution engine for Substrait Plans. This is an experimental package that is still under development. # Example -At the moment, this project contains only generated Python classes for the Substrait protobuf messages. Let's use an existing Substrait producer, [Ibis](https://ibis-project.org), to provide an example using Python Substrait as the consumer. +At the moment, this project contains generated Python classes for the Substrait protobuf messages and a library for loading and saving them in various formats. Let's use an existing Substrait producer, [Ibis](https://ibis-project.org), to provide an example using Python Substrait as the consumer. ## Produce a Substrait Plan with Ibis ``` In [1]: import ibis diff --git a/environment.yml b/environment.yml index e0116bb..fc005cc 100644 --- a/environment.yml +++ b/environment.yml @@ -11,3 +11,4 @@ dependencies: - python >= 3.8.1 - setuptools >= 61.0.0 - setuptools_scm >= 6.2.0 + - libcurl4-openssl-dev diff --git a/pyproject.toml b/pyproject.toml index bb937af..149a722 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ authors = [{name = "Substrait contributors", email = "substrait@googlegroups.com license = {text = "Apache-2.0"} readme = "README.md" requires-python = ">=3.8.1" -dependencies = ["protobuf >= 3.20"] +dependencies = ["protobuf >= 3.24", "protobuf-c >= 3.20"] dynamic = ["version"] [tool.setuptools_scm] @@ -17,9 +17,10 @@ test = ["pytest >= 7.0.0"] [tool.pytest.ini_options] pythonpath = "src" +addopts = "--ignore=third_party" [build-system] -requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0"] +requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0", "ninja", "cmake>=3.24", "libcurl4-openssl-dev", "scikit-build>=0.13"] build-backend = "setuptools.build_meta" [tool.ruff] diff --git a/src/substrait/planloader/planloader.py b/src/substrait/planloader/planloader.py new file mode 100644 index 0000000..090a3ed --- /dev/null +++ b/src/substrait/planloader/planloader.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Routines for loading and saving Substrait plans.""" +import ctypes +import ctypes.util as ctutil +import enum +import substrait.gen.proto.plan_pb2 as plan_pb2 +import sys + + +class PlanFileFormat(enum.Enum): + BINARY = ctypes.c_int32(0) + JSON = ctypes.c_int32(1) + PROTOTEXT = ctypes.c_int32(2) + TEXT = ctypes.c_int32(3) + + +class PlanFileException(Exception): + pass + + +class SerializedPlan(ctypes.Structure): + pass + + +SerializedPlan._fields_ = [ + ("buffer", ctypes.POINTER(ctypes.c_byte)), + ("size", ctypes.c_int32), + ("errorMessage", ctypes.c_char_p), +] + + +# Load the C++ library +planloader_path = ctutil.find_library("planloader") +planloader_lib = ctypes.CDLL(planloader_path) +if planloader_lib is None: + print('Failed to find planloader library') + sys.exit(1) + +# Declare the function signatures for the external functions. +external_load_substrait_plan = planloader_lib.load_substrait_plan +external_load_substrait_plan.argtypes = [ctypes.c_char_p] +external_load_substrait_plan.restype = ctypes.POINTER(SerializedPlan) + +external_free_substrait_plan = planloader_lib.free_substrait_plan +external_free_substrait_plan.argtypes = [ctypes.POINTER(SerializedPlan)] +external_free_substrait_plan.restype = None + +external_save_substrait_plan = planloader_lib.save_substrait_plan +external_save_substrait_plan.argtypes = [ctypes.c_void_p, ctypes.c_int32, ctypes.c_char_p, ctypes.c_int32] +external_save_substrait_plan.restype = ctypes.c_char_p + + +def load_substrait_plan(filename: str) -> plan_pb2.Plan: + """ + Loads a Substrait plan (in any format) from disk. + + Returns: + A Plan protobuf object if successful. + Raises: + PlanFileException if an except occurs while converting or reading from disk. + """ + result = external_load_substrait_plan(filename.encode('UTF-8')) + try: + if result.contents.errorMessage: + raise PlanFileException(result.contents.errorMessage) + data = ctypes.string_at(result.contents.buffer, result.contents.size) + plan = plan_pb2.Plan() + plan.ParseFromString(data) + finally: + external_free_substrait_plan(result) + return plan + + +def save_substrait_plan(plan: plan_pb2.Plan, filename: str, file_format: PlanFileFormat): + """ + Saves the given plan to disk in the specified file format. + + Raises: + PlanFileException if an except occurs while converting or writing to disk. + """ + data = plan.SerializeToString() + err = external_save_substrait_plan(data, len(data), filename.encode('UTF-8'), file_format.value) + if err: + raise PlanFileException(err) diff --git a/tests/test_planloader.py b/tests/test_planloader.py new file mode 100644 index 0000000..1cdb82d --- /dev/null +++ b/tests/test_planloader.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 + + +from substrait.planloader import planloader + + +def test_main(): + testplan = planloader.load_substrait_plan('tests/tpch-plan01.json') + planloader.save_substrait_plan(testplan, 'myoutfile.splan', planloader.PlanFileFormat.TEXT.value) diff --git a/tests/tpch-plan01.json b/tests/tpch-plan01.json new file mode 100644 index 0000000..877b4b7 --- /dev/null +++ b/tests/tpch-plan01.json @@ -0,0 +1,823 @@ +# select l_returnflag, l_linestatus, sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, sum(l_extendedprice smoke.sh tpch_smoke.sh (1 - l_discount)) as sum_disc_price, sum(l_extendedprice smoke.sh tpch_smoke.sh (1 - l_discount) smoke.sh tpch_smoke.sh (1 + l_tax)) as sum_charge, avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, avg(l_discount) as avg_disc, count(*) as count_order from lineitem where l_shipdate <= date '1998-12-01' - interval '120' day (3) group by l_returnflag, l_linestatus order by l_returnflag, l_linestatus +{ + "extensionUris": [{ + "extensionUriAnchor": 3, + "uri": "/functions_aggregate_generic.yaml" + }, { + "extensionUriAnchor": 2, + "uri": "/functions_arithmetic_decimal.yaml" + }, { + "extensionUriAnchor": 1, + "uri": "/functions_datetime.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "lte:date_date" + } + }, { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "subtract:date_day" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "multiply:dec_dec" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 3, + "name": "subtract:dec_dec" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 4, + "name": "add:dec_dec" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 5, + "name": "sum:dec" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 6, + "name": "avg:dec" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 7, + "name": "count:any" + } + }], + "relations": [{ + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [16, 17, 18, 19, 20, 21, 22] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["LINEITEM"] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [{ + "value": { + "literal": { + "date": 10561, + "nullable": false, + "typeVariationReference": 0 + } + } + }, { + "value": { + "literal": { + "intervalDayToSecond": { + "days": 120, + "seconds": 0, + "microseconds": 0 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + }], + "options": [] + } + } + }], + "options": [] + } + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + } + }], + "options": [] + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + } + }], + "options": [] + } + } + }, { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [{ + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + } + }], + "options": [] + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }] + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }] + }], + "measures": [{ + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [{ + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + }], + "options": [] + } + }, { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [], + "options": [] + } + }] + } + }, + "sorts": [{ + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }] + } + }, + "names": ["L_RETURNFLAG", "L_LINESTATUS", "SUM_QTY", "SUM_BASE_PRICE", "SUM_DISC_PRICE", "SUM_CHARGE", "AVG_QTY", "AVG_PRICE", "AVG_DISC", "COUNT_ORDER"] + } + }], + "expectedTypeUrls": [] +} diff --git a/third_party/substrait-cpp b/third_party/substrait-cpp new file mode 160000 index 0000000..e26585f --- /dev/null +++ b/third_party/substrait-cpp @@ -0,0 +1 @@ +Subproject commit e26585f45cdfd0ed3bf03f700c354f04685398c6