diff --git a/pyiceberg/table/deletion_vector.py b/pyiceberg/table/deletion_vector.py index f337c758a7..88fb3daf73 100644 --- a/pyiceberg/table/deletion_vector.py +++ b/pyiceberg/table/deletion_vector.py @@ -77,11 +77,17 @@ def to_vector(self) -> "pa.ChunkedArray": return self._bitmaps_to_chunked_array(self._bitmaps) +def _extract_vector_payload(blob_payload: bytes) -> bytes: + """Strip deletion-vector-v1 blob framing: length(4 big-endian) + DV magic(4) ... CRC(4 big-endian).""" + length_prefix = int.from_bytes(blob_payload[0:4], "big") + return blob_payload[8 : 4 + length_prefix] + + def deletion_vectors_from_puffin_file(puffin_file: PuffinFile) -> list[DeletionVector]: return [ DeletionVector( referenced_data_file=blob.properties[PROPERTY_REFERENCED_DATA_FILE], - bitmaps=DeletionVector._deserialize_bitmap(puffin_file.get_blob_payload(blob)), + bitmaps=DeletionVector._deserialize_bitmap(_extract_vector_payload(puffin_file.get_blob_payload(blob))), ) for blob in puffin_file.footer.blobs ] diff --git a/pyiceberg/table/puffin.py b/pyiceberg/table/puffin.py index 571687bb3f..6803d905f8 100644 --- a/pyiceberg/table/puffin.py +++ b/pyiceberg/table/puffin.py @@ -29,7 +29,7 @@ class PuffinBlobMetadata(IcebergBaseModel): - type: Literal["deletion-vector-v1"] = Field() + type: Literal["apache-datasketches-theta-v1", "deletion-vector-v1"] = Field() fields: list[int] = Field() snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") @@ -65,7 +65,7 @@ def __init__(self, puffin: bytes) -> None: footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little") self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12]) - self._payload = puffin[8:] + self._payload = puffin def get_blob_payload(self, blob: PuffinBlobMetadata) -> bytes: return self._payload[blob.offset : blob.offset + blob.length] diff --git a/pyiceberg/table/theta_sketch.py b/pyiceberg/table/theta_sketch.py new file mode 100644 index 
0000000000..aa6c121094 --- /dev/null +++ b/pyiceberg/table/theta_sketch.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import zstandard + +from pyiceberg.table.puffin import PuffinBlobMetadata, PuffinFile + +if TYPE_CHECKING: + from datasketches import compact_theta_sketch + +BLOB_TYPE_APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1" + + +class ThetaSketch: + field_id: int + _sketch: compact_theta_sketch + + def __init__(self, field_id: int, sketch: compact_theta_sketch) -> None: + self.field_id = field_id + self._sketch = sketch + + def get_estimate(self) -> float: + return self._sketch.get_estimate() + + def get_lower_bound(self, num_std_devs: int = 1) -> float: + return self._sketch.get_lower_bound(num_std_devs) + + def get_upper_bound(self, num_std_devs: int = 1) -> float: + return self._sketch.get_upper_bound(num_std_devs) + + def is_empty(self) -> bool: + return self._sketch.is_empty() + + def is_estimation_mode(self) -> bool: + return self._sketch.is_estimation_mode() + + @property + def sketch(self) -> compact_theta_sketch: + return self._sketch + + +def _theta_sketches_from_blob(blob: PuffinBlobMetadata, payload: 
bytes) -> list[ThetaSketch]: + from datasketches import compact_theta_sketch + + if blob.compression_codec == "zstd": + payload = zstandard.decompress(payload) + + sketch = compact_theta_sketch.deserialize(payload) + return [ThetaSketch(field_id=field_id, sketch=sketch) for field_id in blob.fields] + + +def theta_sketches_from_puffin_file(puffin_file: PuffinFile) -> list[ThetaSketch]: + sketches = [] + for blob in puffin_file.footer.blobs: + if blob.type == BLOB_TYPE_APACHE_DATASKETCHES_THETA_V1: + sketches.extend(_theta_sketches_from_blob(blob, puffin_file.get_blob_payload(blob))) + return sketches diff --git a/pyproject.toml b/pyproject.toml index 3ef225ed86..cd392656e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,6 +98,7 @@ datafusion = ["datafusion>=52,<53"] gcp-auth = ["google-auth>=2.4.0"] entra-auth = ["azure-identity>=1.25.1"] geoarrow = ["geoarrow-pyarrow>=0.2.0"] +datasketches = ["datasketches>=3.4.0,<6.0.0"] [dependency-groups] dev = [ @@ -124,6 +125,7 @@ dev = [ "papermill>=2.6.0", "nbformat>=5.10.0", "ipykernel>=6.29.0", + "datasketches>=3.4.0,<6.0.0", ] # for mkdocs docs = [ diff --git a/tests/table/puffin/v1/theta-sketches.puffin b/tests/table/puffin/v1/theta-sketches.puffin new file mode 100644 index 0000000000..9beca220d0 Binary files /dev/null and b/tests/table/puffin/v1/theta-sketches.puffin differ diff --git a/tests/table/test_theta_sketch.py b/tests/table/test_theta_sketch.py new file mode 100644 index 0000000000..e1ad0e88d3 --- /dev/null +++ b/tests/table/test_theta_sketch.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import json +from os import path + +import pytest +from datasketches import compact_theta_sketch, update_theta_sketch + +from pyiceberg.table.puffin import MAGIC_BYTES, PuffinFile +from pyiceberg.table.theta_sketch import ThetaSketch, theta_sketches_from_puffin_file + + +def _open_fixture(file: str) -> bytes: + cur_dir = path.dirname(path.realpath(__file__)) + with open(f"{cur_dir}/puffin/v1/{file}", "rb") as f: + return f.read() + + +def _make_sketch(values: list[int]) -> compact_theta_sketch: + ts = update_theta_sketch() + for v in values: + ts.update(v) + return ts.compact() + + +@pytest.fixture +def empty_sketch_bytes() -> bytes: + return update_theta_sketch().compact().serialize() + + +@pytest.fixture +def three_value_sketch_bytes() -> bytes: + return _make_sketch([1, 2, 3]).serialize() + + +def test_empty_sketch(empty_sketch_bytes: bytes) -> None: + sketch = compact_theta_sketch.deserialize(empty_sketch_bytes) + ts = ThetaSketch(field_id=1, sketch=sketch) + + assert ts.is_empty() + assert ts.get_estimate() == 0.0 + + +def test_sketch_estimate(three_value_sketch_bytes: bytes) -> None: + sketch = compact_theta_sketch.deserialize(three_value_sketch_bytes) + ts = ThetaSketch(field_id=1, sketch=sketch) + + assert not ts.is_empty() + assert ts.get_estimate() == pytest.approx(3.0) + assert not ts.is_estimation_mode() + + +def test_sketch_bounds_exact_mode(three_value_sketch_bytes: bytes) -> None: + sketch = compact_theta_sketch.deserialize(three_value_sketch_bytes) + ts = ThetaSketch(field_id=1, sketch=sketch) + + assert ts.get_lower_bound(1) == 
pytest.approx(3.0) + assert ts.get_upper_bound(1) == pytest.approx(3.0) + + +def test_sketch_field_id() -> None: + sketch = _make_sketch([10, 20, 30]) + ts = ThetaSketch(field_id=42, sketch=sketch) + + assert ts.field_id == 42 + + +def test_sketch_property() -> None: + sketch = _make_sketch([1, 2]) + ts = ThetaSketch(field_id=1, sketch=sketch) + + assert ts.sketch is sketch + + +def test_estimation_mode() -> None: + ts_builder = update_theta_sketch(lg_k=5) + for i in range(100): + ts_builder.update(i) + sketch = ts_builder.compact() + ts = ThetaSketch(field_id=1, sketch=sketch) + + assert ts.is_estimation_mode() + assert ts.get_estimate() > 0 + assert ts.get_lower_bound(1) <= ts.get_estimate() + assert ts.get_upper_bound(1) >= ts.get_estimate() + + +def _build_puffin_file(blob_bytes: bytes, field_ids: list[int], snapshot_id: int = 1) -> bytes: + # Minimal Puffin layout built here: magic(4) + blobs + footer_json + footer_size(4) + flags(4) + magic(4); unlike the Puffin spec, no magic(4) precedes footer_json. + # Blob offsets are file-absolute; first blob starts immediately after the 4-byte magic. 
+ blob_offset = 4 + footer = { + "blobs": [ + { + "type": "apache-datasketches-theta-v1", + "snapshot-id": snapshot_id, + "sequence-number": 1, + "fields": field_ids, + "offset": blob_offset, + "length": len(blob_bytes), + } + ], + "properties": {}, + } + footer_json = json.dumps(footer, separators=(",", ":")).encode("utf-8") + footer_size_bytes = len(footer_json).to_bytes(4, byteorder="little") + flags = b"\x00\x00\x00\x00" + return MAGIC_BYTES + blob_bytes + footer_json + footer_size_bytes + flags + MAGIC_BYTES + + +def test_theta_sketches_from_puffin_file_single_field(three_value_sketch_bytes: bytes) -> None: + puffin_bytes = _build_puffin_file(three_value_sketch_bytes, field_ids=[5]) + puffin_file = PuffinFile(puffin_bytes) + + sketches = theta_sketches_from_puffin_file(puffin_file) + + assert len(sketches) == 1 + assert sketches[0].field_id == 5 + assert sketches[0].get_estimate() == pytest.approx(3.0) + + +def test_theta_sketches_from_puffin_file_multiple_fields(three_value_sketch_bytes: bytes) -> None: + puffin_bytes = _build_puffin_file(three_value_sketch_bytes, field_ids=[1, 2, 3]) + puffin_file = PuffinFile(puffin_bytes) + + sketches = theta_sketches_from_puffin_file(puffin_file) + + assert len(sketches) == 3 + assert [s.field_id for s in sketches] == [1, 2, 3] + for sketch in sketches: + assert sketch.get_estimate() == pytest.approx(3.0) + + +def test_theta_sketches_from_puffin_file_empty_sketch(empty_sketch_bytes: bytes) -> None: + puffin_bytes = _build_puffin_file(empty_sketch_bytes, field_ids=[7]) + puffin_file = PuffinFile(puffin_bytes) + + sketches = theta_sketches_from_puffin_file(puffin_file) + + assert len(sketches) == 1 + assert sketches[0].is_empty() + assert sketches[0].get_estimate() == 0.0 + + +def test_theta_sketches_from_trino_written_puffin_file() -> None: + puffin_file = PuffinFile(_open_fixture("theta-sketches.puffin")) + sketches = theta_sketches_from_puffin_file(puffin_file) + + assert len(sketches) == 3 + assert [s.field_id for s in 
sketches] == [1, 2, 3] + for sketch in sketches: + assert sketch.get_estimate() == pytest.approx(5.0) diff --git a/uv.lock b/uv.lock index 65e77adcb6..c1b2660b77 100644 --- a/uv.lock +++ b/uv.lock @@ -1210,6 +1210,45 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/48/01906ab5c1a70373c6874ac5192d03646fa7b94d9ff06e3f676cb6b0f43f/datafusion-52.3.0-cp310-abi3-win_amd64.whl", hash = "sha256:9fb35738cf4dbff672dbcfffc7332813024cb0ad2ab8cda1fb90b9054277ab0c", size = 33765807, upload-time = "2026-03-16T10:54:05.728Z" }, ] +[[package]] +name = "datasketches" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/34/cd/659ae9fc53f34d6deafbe12162977654be5bb0a584e6afa6656337e13952/datasketches-5.2.0.tar.gz", hash = "sha256:c00d61da4695e00036e63f590999f584cc39246cbb147b171f375f792604a612", size = 53213, upload-time = "2025-03-01T07:49:24.567Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/13/0146b96195819f528ecf3f77e0dd58054c321076468746fc67687c20ff19/datasketches-5.2.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:177e9545aafd7359c44e7935b995b0b0f9b08704a78f53e7840a7125d2d1fc9e", size = 652707, upload-time = "2025-03-01T07:48:01.904Z" }, + { url = "https://files.pythonhosted.org/packages/38/16/38fc321557d86be3542e63ab0c54eef4807c43a16e4d102b85fab64716d0/datasketches-5.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4d33a1d7894509556efb6a41f2ad530257d67a094a039e3b5a047a22934a951e", size = 584959, upload-time = "2025-03-01T07:48:03.875Z" }, + { url = "https://files.pythonhosted.org/packages/f0/a5/d3a92c4904207a2429de4c336b806cb1137ac86d96d053b73a9ed8b3d6ba/datasketches-5.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52c53f8c94c48b3f047ab488e9ea41ce8dbf6897c2c1f353cb8a79b22cb4a22b", size = 681874, upload-time = "2025-03-01T07:48:06.356Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/cc/b97554d566ca9a3b02645e5e8cb6047e80ee9409bc03f8924ee64e3eba4f/datasketches-5.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c13fcd9071147a377b587f590887751d85874df677ec5ed72ccdf3f7a19446a", size = 738419, upload-time = "2025-03-01T07:48:08.782Z" }, + { url = "https://files.pythonhosted.org/packages/fe/87/36c48c4af91ab732a09bfe06392df664b30ea780523d719e1be7819ec622/datasketches-5.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:62aa8ddfb0e8f4d0e3e1e214e19e0de11b8ea1e34bf7752dfd987e6e9e3b1264", size = 1069353, upload-time = "2025-03-01T07:48:11.088Z" }, + { url = "https://files.pythonhosted.org/packages/4e/f2/abc65ff28286e4997c02d3d83f1236b72595c3e5ba9d3ba39274eab87eed/datasketches-5.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:95c2f6b2dea71385bfddf525cc14c9d8bf261f7aa98cbd65155dbaa0764f6cd9", size = 1131149, upload-time = "2025-03-01T07:48:13.893Z" }, + { url = "https://files.pythonhosted.org/packages/fa/4d/be6af6e5cf0bdef5b55d640852a47946a7f044c6fcad989e81619ebb46ab/datasketches-5.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:a8a0500c0de3012fda530e12b6c62d27278f7d6cce21008b9208895dfd337201", size = 506641, upload-time = "2025-03-01T07:48:15.779Z" }, + { url = "https://files.pythonhosted.org/packages/c3/b9/5c69df548b19fc2a20d5990f32023db4483cef43a263d775ba7e08bda00e/datasketches-5.2.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:dd8640866011c7dd346d8c9cf9ad0438de16d56788c13b12c96ecb38c5c7df9d", size = 653081, upload-time = "2025-03-01T07:48:17.643Z" }, + { url = "https://files.pythonhosted.org/packages/f5/0e/8baa3ec5ab48408c1ad7d2d4a9ecba07a4a7e16392b8e0d3480fe88b0285/datasketches-5.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e78a462b397c9a11e876a7fd8198d77d35b8bbe849a8d329c518962bc8463ad1", size = 585248, upload-time = "2025-03-01T07:48:19.778Z" }, + { url = 
"https://files.pythonhosted.org/packages/fc/99/011c7edd1c7971ca4a5c2e545a99889716d907498e25f96e1055c1a9c49f/datasketches-5.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cde9ce8f42b6a05930dd9b5d0da1b4420d1e8db53faaa41c6732ce7a1670c4a", size = 682144, upload-time = "2025-03-01T07:48:21.663Z" }, + { url = "https://files.pythonhosted.org/packages/ce/27/820dc70c6d4b23fc43187113a9386d4f8702c162f4387f459dfc5063ee63/datasketches-5.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470111ef1b213bc75b3602e14153c3e70a1ebe0fe4b069a7800557685080c477", size = 738715, upload-time = "2025-03-01T07:48:24.085Z" }, + { url = "https://files.pythonhosted.org/packages/94/42/5e8bc5277891797e3207a02ab12db3b2ae7159a45aae917a510c690c5880/datasketches-5.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0208fb7bf40edca3f8cfcd9d1acdb95798abc10b685341b39557f59eb5bd862e", size = 1069861, upload-time = "2025-03-01T07:48:26.419Z" }, + { url = "https://files.pythonhosted.org/packages/62/43/765df9fd6ea2f6fffc1fd757e28e4736568abf9ff1789471d3d8d7dc9ad8/datasketches-5.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bfa6571373244a9e3a4a1f59919b8a41979a997b92be3db0219400d8c6e96d20", size = 1131346, upload-time = "2025-03-01T07:48:28.756Z" }, + { url = "https://files.pythonhosted.org/packages/4a/f4/76a267c596c0a9849241efc91ffb9791fd8b6d818b594d38ce0a322215e7/datasketches-5.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:516e67cd3fd2d14c58c9d2b593f50ac23aff3874406c325ba2589a9c989862ca", size = 507446, upload-time = "2025-03-01T07:48:30.823Z" }, + { url = "https://files.pythonhosted.org/packages/fe/88/ac2dc472e9d054e3edbce9a5d1288bab447fc080d5a0d85984455cf66808/datasketches-5.2.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:65b3e2bb6b2083dfdd6a611e4b07d058ca6625f838088b5cb0897335c4580e6d", size = 643447, upload-time = "2025-03-01T07:48:32.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/0c/faca927b0575482d567eda4fa65ffe5ebe1ac04b6c5f0321faeb490d5b8e/datasketches-5.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfc80979ccb2e11bf5e93996db7f1f486c9d3ef4a15c18f1f1bc37aaa4b2a038", size = 578736, upload-time = "2025-03-01T07:48:35.427Z" }, + { url = "https://files.pythonhosted.org/packages/ce/6f/9201d36b6775ec8dc3f7d4cc10e32b35b2012683b520d0cf83b0cb674866/datasketches-5.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a76731c0d876cff25cc2896ef9d80c54167f5aff7170f12b89db6a9b898fc714", size = 675679, upload-time = "2025-03-01T07:48:38.157Z" }, + { url = "https://files.pythonhosted.org/packages/af/87/56bbb0be6d6c49c9b1705f56bad3f4319f8cdbc546f79e811ac88bdfe2cc/datasketches-5.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9aa0c1dad8d00c242718c5dbb9f8cf1876ea099ee55c1c0634a8beecb589164", size = 748356, upload-time = "2025-03-01T07:48:40.344Z" }, + { url = "https://files.pythonhosted.org/packages/c9/37/0c25e113ae8a148201d60050f26efeada54f46e8417bd35082b40355eac3/datasketches-5.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40e6cecb8656b6694e845d9319c5c3aaa083f754e992db6065f18806474685c1", size = 1070320, upload-time = "2025-03-01T07:48:43.218Z" }, + { url = "https://files.pythonhosted.org/packages/0f/43/5ea198ff05ba3c1f904a1118b23f1b36b0c60f16b15af791a3ab4ccee752/datasketches-5.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a673180dc5226bf3c84baeeadf36d9a4f1ee82e679dcd92776651f8ffcc09287", size = 1146824, upload-time = "2025-03-01T07:48:46.841Z" }, + { url = "https://files.pythonhosted.org/packages/8d/fc/1ca195f0fe524e3ffc12ac81c07f3cc3ce144499b4e3ec5e73a52167253f/datasketches-5.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:72cc339cb7775f82516c36fe8e9a70e9f504214a8d0ed06246332f53203b89f3", size = 509067, upload-time = "2025-03-01T07:48:48.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/a7/2b69296c200bd59550cb6ee292d8ba6739ea2d847e10d38452d86120bb45/datasketches-5.2.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:a76c998ddf4d39f895b830a3ccc41d1df2d0454c74fc5e53844db119658e71a7", size = 643448, upload-time = "2025-03-01T07:48:50.808Z" }, + { url = "https://files.pythonhosted.org/packages/ed/56/ca425991d21e4b4e4b0a72276a77678201a92d5609acbb27ad7a05ddfce6/datasketches-5.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6c2be3884b28d24a3be103a5c285902f06eee85c076be57c62c9c5eecbe15d4d", size = 578736, upload-time = "2025-03-01T07:48:53.73Z" }, + { url = "https://files.pythonhosted.org/packages/2b/89/d8ce2f6eab2914a5091360f94fc52cb6f93b1e3852f1aa86dbba9833e20f/datasketches-5.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9bcbf2687b436593f4e03ba73f57dd72fc6a7414d88db57e91f4c24f6e62ee45", size = 675679, upload-time = "2025-03-01T07:48:55.734Z" }, + { url = "https://files.pythonhosted.org/packages/62/33/351d1f0c700e143217597d29b333c77695db0f0b3757cf3c2b6e8cf58ea7/datasketches-5.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49e78e52d6a1d63b08a2990873f94bbc9d7427f6907f600af1257f0a9c901b1f", size = 748357, upload-time = "2025-03-01T07:48:57.901Z" }, + { url = "https://files.pythonhosted.org/packages/7d/6c/9ef89caf91c2d2b70fce5038607f2b5d8b09ee14db6bbe89027a8421ed88/datasketches-5.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6be79e382a4b4fe033a7d3de0033fe81b0e01ebc5124ae24f785214517a847a7", size = 1070320, upload-time = "2025-03-01T07:49:01.074Z" }, + { url = "https://files.pythonhosted.org/packages/82/22/a6281d53249af4570b36418de2367114a4ecebd16099e195346641e9d5e8/datasketches-5.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fbd6eab5af078eb65d678bf27fc468097a1c5db07800ac537a75fd983c437e57", size = 1146824, upload-time = "2025-03-01T07:49:04.782Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/da/c3feb5eca3d7c43d068069b56f77685c01d1ee67e687490cc6341ec920f1/datasketches-5.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:bf8f1cc1b1c4a35554924e27d6c2872e7b0dc065e2694ec83035ffbc203f17c3", size = 509058, upload-time = "2025-03-01T07:49:06.796Z" }, +] + [[package]] name = "debugpy" version = "1.8.19" @@ -1954,6 +1993,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/ed/6bfa4109fcb23a58819600392564fea69cdc6551ffd5e69ccf1d52a40cbc/greenlet-3.2.4-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:8c68325b0d0acf8d91dde4e6f930967dd52a5302cd4062932a6b2e7c2969f47c", size = 271061, upload-time = "2025-08-07T13:17:15.373Z" }, { url = "https://files.pythonhosted.org/packages/2a/fc/102ec1a2fc015b3a7652abab7acf3541d58c04d3d17a8d3d6a44adae1eb1/greenlet-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:94385f101946790ae13da500603491f04a76b6e4c059dab271b3ce2e283b2590", size = 629475, upload-time = "2025-08-07T13:42:54.009Z" }, { url = "https://files.pythonhosted.org/packages/c5/26/80383131d55a4ac0fb08d71660fd77e7660b9db6bdb4e8884f46d9f2cc04/greenlet-3.2.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f10fd42b5ee276335863712fa3da6608e93f70629c631bf77145021600abc23c", size = 640802, upload-time = "2025-08-07T13:45:25.52Z" }, + { url = "https://files.pythonhosted.org/packages/9f/7c/e7833dbcd8f376f3326bd728c845d31dcde4c84268d3921afcae77d90d08/greenlet-3.2.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c8c9e331e58180d0d83c5b7999255721b725913ff6bc6cf39fa2a45841a4fd4b", size = 636703, upload-time = "2025-08-07T13:53:12.622Z" }, { url = "https://files.pythonhosted.org/packages/e9/49/547b93b7c0428ede7b3f309bc965986874759f7d89e4e04aeddbc9699acb/greenlet-3.2.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58b97143c9cc7b86fc458f215bd0932f1757ce649e05b640fea2e79b54cedb31", size = 635417, upload-time = 
"2025-08-07T13:18:25.189Z" }, { url = "https://files.pythonhosted.org/packages/7f/91/ae2eb6b7979e2f9b035a9f612cf70f1bf54aad4e1d125129bef1eae96f19/greenlet-3.2.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2ca18a03a8cfb5b25bc1cbe20f3d9a4c80d8c3b13ba3df49ac3961af0b1018d", size = 584358, upload-time = "2025-08-07T13:18:23.708Z" }, { url = "https://files.pythonhosted.org/packages/f7/85/433de0c9c0252b22b16d413c9407e6cb3b41df7389afc366ca204dbc1393/greenlet-3.2.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9fe0a28a7b952a21e2c062cd5756d34354117796c6d9215a87f55e38d15402c5", size = 1113550, upload-time = "2025-08-07T13:42:37.467Z" }, @@ -1964,6 +2004,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/de/f28ced0a67749cac23fecb02b694f6473f47686dff6afaa211d186e2ef9c/greenlet-3.2.4-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:96378df1de302bc38e99c3a9aa311967b7dc80ced1dcc6f171e99842987882a2", size = 272305, upload-time = "2025-08-07T13:15:41.288Z" }, { url = "https://files.pythonhosted.org/packages/09/16/2c3792cba130000bf2a31c5272999113f4764fd9d874fb257ff588ac779a/greenlet-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1ee8fae0519a337f2329cb78bd7a8e128ec0f881073d43f023c7b8d4831d5246", size = 632472, upload-time = "2025-08-07T13:42:55.044Z" }, { url = "https://files.pythonhosted.org/packages/ae/8f/95d48d7e3d433e6dae5b1682e4292242a53f22df82e6d3dda81b1701a960/greenlet-3.2.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:94abf90142c2a18151632371140b3dba4dee031633fe614cb592dbb6c9e17bc3", size = 644646, upload-time = "2025-08-07T13:45:26.523Z" }, + { url = "https://files.pythonhosted.org/packages/d5/5e/405965351aef8c76b8ef7ad370e5da58d57ef6068df197548b015464001a/greenlet-3.2.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:4d1378601b85e2e5171b99be8d2dc85f594c79967599328f95c1dc1a40f1c633", size = 640519, upload-time = 
"2025-08-07T13:53:13.928Z" }, { url = "https://files.pythonhosted.org/packages/25/5d/382753b52006ce0218297ec1b628e048c4e64b155379331f25a7316eb749/greenlet-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0db5594dce18db94f7d1650d7489909b57afde4c580806b8d9203b6e79cdc079", size = 639707, upload-time = "2025-08-07T13:18:27.146Z" }, { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, @@ -1974,6 +2015,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, { url = "https://files.pythonhosted.org/packages/3b/16/035dcfcc48715ccd345f3a93183267167cdd162ad123cd93067d86f27ce4/greenlet-3.2.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f28588772bb5fb869a8eb331374ec06f24a83a9c25bfa1f38b6993afe9c1e968", size = 655185, upload-time = 
"2025-08-07T13:45:27.624Z" }, + { url = "https://files.pythonhosted.org/packages/31/da/0386695eef69ffae1ad726881571dfe28b41970173947e7c558d9998de0f/greenlet-3.2.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5c9320971821a7cb77cfab8d956fa8e39cd07ca44b6070db358ceb7f8797c8c9", size = 649926, upload-time = "2025-08-07T13:53:15.251Z" }, { url = "https://files.pythonhosted.org/packages/68/88/69bf19fd4dc19981928ceacbc5fd4bb6bc2215d53199e367832e98d1d8fe/greenlet-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c60a6d84229b271d44b70fb6e5fa23781abb5d742af7b808ae3f6efd7c9c60f6", size = 651839, upload-time = "2025-08-07T13:18:30.281Z" }, { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, @@ -1984,6 +2026,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = 
"2025-08-07T13:42:57.23Z" }, { url = "https://files.pythonhosted.org/packages/f7/0b/bc13f787394920b23073ca3b6c4a7a21396301ed75a655bcb47196b50e6e/greenlet-3.2.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:710638eb93b1fa52823aa91bf75326f9ecdfd5e0466f00789246a5280f4ba0fc", size = 655191, upload-time = "2025-08-07T13:45:29.752Z" }, + { url = "https://files.pythonhosted.org/packages/f2/d6/6adde57d1345a8d0f14d31e4ab9c23cfe8e2cd39c3baf7674b4b0338d266/greenlet-3.2.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:c5111ccdc9c88f423426df3fd1811bfc40ed66264d35aa373420a34377efc98a", size = 649516, upload-time = "2025-08-07T13:53:16.314Z" }, { url = "https://files.pythonhosted.org/packages/7f/3b/3a3328a788d4a473889a2d403199932be55b1b0060f4ddd96ee7cdfcad10/greenlet-3.2.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d76383238584e9711e20ebe14db6c88ddcedc1829a9ad31a584389463b5aa504", size = 652169, upload-time = "2025-08-07T13:18:32.861Z" }, { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, @@ -1994,6 +2037,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = 
"2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, { url = "https://files.pythonhosted.org/packages/c0/aa/687d6b12ffb505a4447567d1f3abea23bd20e73a5bed63871178e0831b7a/greenlet-3.2.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c17b6b34111ea72fc5a4e4beec9711d2226285f0386ea83477cbb97c30a3f3a5", size = 699218, upload-time = "2025-08-07T13:45:30.969Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = 
"2025-11-04T12:42:23.427Z" }, @@ -4637,6 +4681,9 @@ daft = [ datafusion = [ { name = "datafusion" }, ] +datasketches = [ + { name = "datasketches" }, +] duckdb = [ { name = "duckdb" }, { name = "pyarrow" }, @@ -4709,6 +4756,7 @@ sql-sqlite = [ dev = [ { name = "coverage", extra = ["toml"] }, { name = "cython" }, + { name = "datasketches" }, { name = "deptry" }, { name = "docutils" }, { name = "fastavro" }, @@ -4760,6 +4808,7 @@ requires-dist = [ { name = "click", specifier = ">=7.1.1,<9.0.0" }, { name = "daft", marker = "extra == 'daft'", specifier = ">=0.7.10" }, { name = "datafusion", marker = "extra == 'datafusion'", specifier = ">=52,<53" }, + { name = "datasketches", marker = "extra == 'datasketches'", specifier = ">=3.4.0,<6.0.0" }, { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=0.5.0,<2.0.0" }, { name = "fsspec", specifier = ">=2023.1.0" }, { name = "gcsfs", marker = "extra == 'gcsfs'", specifier = ">=2023.1.0" }, @@ -4795,12 +4844,13 @@ requires-dist = [ { name = "thrift-sasl", marker = "extra == 'hive-kerberos'", specifier = ">=0.4.3" }, { name = "zstandard", specifier = ">=0.13.0,<1.0.0" }, ] -provides-extras = ["pyarrow", "pandas", "duckdb", "ray", "bodo", "daft", "polars", "snappy", "hive", "hive-kerberos", "s3fs", "glue", "adlfs", "dynamodb", "bigquery", "sql-postgres", "sql-sqlite", "gcsfs", "rest-sigv4", "hf", "pyiceberg-core", "datafusion", "gcp-auth", "entra-auth", "geoarrow"] +provides-extras = ["pyarrow", "pandas", "duckdb", "ray", "bodo", "daft", "polars", "snappy", "hive", "hive-kerberos", "s3fs", "glue", "adlfs", "dynamodb", "bigquery", "sql-postgres", "sql-sqlite", "gcsfs", "rest-sigv4", "hf", "pyiceberg-core", "datafusion", "gcp-auth", "entra-auth", "geoarrow", "datasketches"] [package.metadata.requires-dev] dev = [ { name = "coverage", extras = ["toml"], specifier = ">=7.4.2,<8" }, { name = "cython", specifier = ">=3.0.0" }, + { name = "datasketches", specifier = ">=3.4.0,<6.0.0" }, { name = "deptry", specifier = 
">=0.14,<0.26" }, { name = "docutils", specifier = "!=0.21.post1" }, { name = "fastavro", specifier = "==1.12.2" },