From 54672c80d775a356f954165e47b8efab55b67fb5 Mon Sep 17 00:00:00 2001 From: Dan Bauman Date: Tue, 4 Nov 2025 16:00:43 -0500 Subject: [PATCH 1/4] update --- .ci/generate_fake_data.py | 53 +++++++++++-------- .../workflows/{python.yml => python-test.yml} | 21 +++++--- .github/workflows/python-wheel.yml | 26 +++++++++ .gitignore | 4 ++ README.md | 24 +++++++-- pyproject.toml | 31 +++++++++++ setup.py | 4 +- 7 files changed, 130 insertions(+), 33 deletions(-) rename .github/workflows/{python.yml => python-test.yml} (52%) create mode 100644 .github/workflows/python-wheel.yml create mode 100644 pyproject.toml diff --git a/.ci/generate_fake_data.py b/.ci/generate_fake_data.py index 8900608..775ac48 100644 --- a/.ci/generate_fake_data.py +++ b/.ci/generate_fake_data.py @@ -4,25 +4,34 @@ fake = Faker() - -print("generate 5000 fake profiles, write them all into a bson object," - "read back only those with @gmail.com emails") - - -with open("fake_profiles.bson", "wb") as f: - for _ in range(500): - faked = fake.simple_profile() - del faked['birthdate'] # bson doesn't like date wants datetime - bson_data = bson.BSON.encode(faked) - f.write(bson_data) - -found_gmails = 0 -with open("fake_profiles.bson", "rb") as f: - stream = BSONInput(fh=f, fast_string_prematch=b"@gmail.com") - for doc in stream: - assert "@gmail" in doc['mail'] # bson handles the utf8 decoding by default! 
- found_gmails += 1 - - -assert found_gmails > 0 -print(f"found {found_gmails} from gmails") +if __name__ == "__main__": + print("generate 5000 fake profiles, write them all into a bson object," + "read back only those with @gmail.com emails") + + + with open("fake_profiles.bson", "wb") as f: + for _ in range(500): + faked = fake.simple_profile() + del faked['birthdate'] # bson doesn't like date wants datetime + bson_data = bson.encode(faked) + f.write(bson_data) + + found_gmails = 0 + with open("fake_profiles.bson", "rb") as f: + stream = BSONInput(fh=f, fast_string_prematch=b"@gmail.com") + for doc in stream: + assert "@gmail" in doc['mail'] # bson handles the utf8 decoding by default! + found_gmails += 1 + + + assert found_gmails > 0 + print(f"found {found_gmails} from gmails") + + found_gmails_raw = 0 + with open("fake_profiles.bson", "rb") as f: + stream = BSONInput(fh=f, fast_string_prematch=b"@gmail.com", decode=False) + for raw_bson in stream: + assert b"@gmail" in raw_bson # not even bothering to decode to a dict + found_gmails_raw += 1 + assert found_gmails_raw > 0 + print(f"found {found_gmails_raw} from gmails without even decoding the BSON") \ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python-test.yml similarity index 52% rename from .github/workflows/python.yml rename to .github/workflows/python-test.yml index 8213ce5..3f8078a 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python-test.yml @@ -2,26 +2,35 @@ name: Python package on: push: - branches: [ master ] + branches: [ master, dev ] pull_request: branches: [ master ] jobs: simpletest: runs-on: - - ubuntu-18.04 + - ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.11, 3.12, 3.13, 3.14] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v5 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + + - name: run a 
simple test run: | - pip install pymongo Faker + pip install wheel build pymongo faker + python3 -m build cp .ci/generate_fake_data.py . - python generate_fake_data.py + python3 generate_fake_data.py + + - uses: actions/upload-artifact@v4 + with: + name: wheel-file + path: ./dist/*.whl \ No newline at end of file diff --git a/.github/workflows/python-wheel.yml b/.github/workflows/python-wheel.yml new file mode 100644 index 0000000..ea9f9a3 --- /dev/null +++ b/.github/workflows/python-wheel.yml @@ -0,0 +1,26 @@ +name: Python package + +on: + push: + branches: [ master, dev ] + pull_request: + branches: [ master ] + +jobs: + simpletest: + runs-on: + - ubuntu-latest + + steps: + - uses: actions/checkout@v5 + + + - name: build and run a simple test + run: | + pip install wheel build pymongo faker + python3 -m build + + - uses: actions/upload-artifact@v4 + with: + name: wheel-file + path: ./dist/*.whl \ No newline at end of file diff --git a/.gitignore b/.gitignore index b9d6bd9..f0db4b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +#test data +fake_profiles.bson + + ################# ## Eclipse ################# diff --git a/README.md b/README.md index ae3a111..f306753 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ The fast_string_prematch would not bother converting records that do not have "g somewhere in the document as plaintext. ``` python - from bsonstream import KeyValueBSONInput + from bsonstream import BSONInput from sys import argv import gzip for file in argv[1:]: @@ -51,12 +51,30 @@ somewhere in the document as plaintext. f = open(file, 'rb') else: f=gzip.open(file,'rb') - stream = KeyValueBSONInput(fh=f, fast_string_prematch=b"github") + stream = BSONInput(fh=f, fast_string_prematch=b"github") for dict_data in stream: ...process dict_data... 
``` +Or, if you are passing data to another tool that can handle raw BSON (like bsonsearch), pass `decode=False` and don't even bother decoding the BSON to a dict + +``` python + from bsonstream import BSONInput + from sys import argv + import gzip + for file in argv[1:]: + f=None + if "gz" not in file: + f = open(file, 'rb') + else: + f=gzip.open(file,'rb') + stream = BSONInput(fh=f, fast_string_prematch=b"github", decode=False) + for raw_bson in stream: + ...process raw_bson... +``` + + ## Benchmark Unfortunately, I cannot make available the test bson file. @@ -85,7 +103,7 @@ With fast string matcher. In this case, documents matching the fast string pate ## Dependencies Required libraries -* [python-bson] +* [pymongo] ## Versioning diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1090147 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[project] +name = "bsonstream" +version = "0.1.7" +description = "BSON stream raw data into dict or individual BSON format - python" + +requires-python = ">=3.11" +dependencies = [ + "pymongo~=4.15.3" +] + +[project.urls] +GitHub = "https://github.com/bauman/python-bson-streaming" + +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + + +[tool.cibuildwheel.linux] +# This command runs for the manylinux containers (based on CentOS). +archs = ["x86_64"] + + +[tool.cibuildwheel.macos] +archs = ["x86_64", "universal2", "arm64"] + + +[[tool.cibuildwheel.overrides]] +select = "*-musllinux*" +# This command runs for the musllinux containers (based on Alpine Linux). 
+archs = ["x86_64"] diff --git a/setup.py b/setup.py index 7a8b7e3..e195850 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ VERSION = "0.1.4" setup( - name = "python-bsonstream", + name = "bsonstream", version = VERSION, packages=find_packages(), maintainer= 'Dan Bauman', @@ -12,7 +12,7 @@ license='MIT', url = 'https://github.com/bauman/python-bson-streaming', download_url = 'https://github.com/bauman/python-bson-streaming/archive/%s.tar.gz' %(VERSION), - install_requires = ["six", "pymongo"], + install_requires = ["pymongo"], classifiers = [ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', From 67eee1bbe724a1c43dc049d1bc05e9da09b1a86c Mon Sep 17 00:00:00 2001 From: Dan Bauman Date: Tue, 4 Nov 2025 16:02:02 -0500 Subject: [PATCH 2/4] update --- .github/workflows/python-test.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 3f8078a..c353913 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -30,7 +30,3 @@ jobs: cp .ci/generate_fake_data.py . 
python3 generate_fake_data.py - - uses: actions/upload-artifact@v4 - with: - name: wheel-file - path: ./dist/*.whl \ No newline at end of file From b425e3006afbf2c0a58a93b017cefdd3c4118583 Mon Sep 17 00:00:00 2001 From: Dan Bauman Date: Tue, 4 Nov 2025 16:04:37 -0500 Subject: [PATCH 3/4] update --- .github/workflows/python-test.yml | 7 +++---- .github/workflows/python-wheel.yml | 4 ++-- setup.py | 23 ----------------------- 3 files changed, 5 insertions(+), 29 deletions(-) delete mode 100644 setup.py diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index c353913..e9a59cd 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -1,4 +1,4 @@ -name: Python package +name: test the module on: push: @@ -18,15 +18,14 @@ jobs: - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: run a simple test run: | - pip install wheel build pymongo faker - python3 -m build + pip install pymongo faker cp .ci/generate_fake_data.py . 
python3 generate_fake_data.py diff --git a/.github/workflows/python-wheel.yml b/.github/workflows/python-wheel.yml index ea9f9a3..40b022a 100644 --- a/.github/workflows/python-wheel.yml +++ b/.github/workflows/python-wheel.yml @@ -1,4 +1,4 @@ -name: Python package +name: build the wheel on: push: @@ -7,7 +7,7 @@ on: branches: [ master ] jobs: - simpletest: + wheelbuild: runs-on: - ubuntu-latest diff --git a/setup.py b/setup.py deleted file mode 100644 index e195850..0000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python -from setuptools import setup, find_packages - -VERSION = "0.1.4" - -setup( - name = "bsonstream", - version = VERSION, - packages=find_packages(), - maintainer= 'Dan Bauman', - maintainer_email='dan@bauman.space', - license='MIT', - url = 'https://github.com/bauman/python-bson-streaming', - download_url = 'https://github.com/bauman/python-bson-streaming/archive/%s.tar.gz' %(VERSION), - install_requires = ["pymongo"], - classifiers = [ - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 3', -], -) From fd053c13afe3c9d6c9ee99b4f4ccf774581c2e29 Mon Sep 17 00:00:00 2001 From: Dan Bauman Date: Tue, 4 Nov 2025 16:08:15 -0500 Subject: [PATCH 4/4] update --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1090147..0626bd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "BSON stream raw data into dict or individual BSON format - python requires-python = ">=3.11" dependencies = [ - "pymongo~=4.15.3" + "pymongo~=4.15.2" ] [project.urls]