Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d9c7843
Add a Rust implementation of `linecomp()`
akaihola May 3, 2024
fc8328c
Add some helpers for Rust `linecomp()`
akaihola May 3, 2024
049949a
Update `rstest`
akaihola May 4, 2024
9bc33e3
Refactor to iterate over strings
akaihola May 4, 2024
5e687af
No need for character indices, simplify
akaihola May 4, 2024
2d38fe2
Make `linecomp` public, add documentation
akaihola May 4, 2024
49942c2
Grammar fix
akaihola May 4, 2024
cf821d0
Add non-numeric and multi-field test cases
akaihola May 4, 2024
b4b9753
Refactor leading zero skip to a function
akaihola May 4, 2024
ea61f90
Improve identifier names
akaihola May 4, 2024
6823fe2
Two new test cases
akaihola May 4, 2024
e6e21f7
Rust extension module (WIP)
akaihola May 4, 2024
62a4558
Move sorting of buffer to Rust
akaihola May 4, 2024
a5be89b
Do entire COPY sections in Rust
akaihola May 4, 2024
1394845
Make releases 25% faster
akaihola May 5, 2024
10e8fb8
Don't loop forever at the end of input SQL file
akaihola May 5, 2024
04f8156
Write the end marker
akaihola May 5, 2024
c71b7dc
Add profiling profile, useful for `samply`
akaihola May 5, 2024
db9dc06
Remove Python sort implementation, pytest Rust one
akaihola May 5, 2024
cb7e836
Improve Python identifier names
akaihola May 5, 2024
832a583
Modernize a Python test
akaihola May 5, 2024
dc80e25
Hard-code the SQL COPY end marker
akaihola May 5, 2024
4912525
Append COPY lines to what Python already wrote
akaihola May 5, 2024
fc76d2f
Reset negative sign handling for each field
akaihola May 5, 2024
eaeac4d
Fix negative identical int comparison
akaihola May 5, 2024
9963a30
Link to StackOverflow for `.by_ref()` explanation
akaihola May 5, 2024
c2b1c55
Reduce verbosity - bring Ordering::* to namespace
akaihola May 5, 2024
47bcd1a
Import some more objects to reduce verbosity
akaihola May 5, 2024
4fb3c08
Simplify integer part logic
akaihola May 5, 2024
36c04d3
Continue to simplify patterns
akaihola May 5, 2024
2b3adea
Simplify decimal part patterns
akaihola May 5, 2024
0f27bef
No need for peeking, we know the end marker
akaihola May 12, 2024
45cf0f7
Maybe faster by ordering matches?
akaihola May 12, 2024
4c13ec1
Add test cases, some are failing
akaihola May 12, 2024
0058418
Fix sorting test
akaihola May 12, 2024
dfc9cd5
Correct decimal point .42 style notation sorting
akaihola May 12, 2024
acc9a9f
Add `reverse_if_less` helper
akaihola May 12, 2024
9fa6b1e
Correct name for test function
akaihola May 12, 2024
960aaf8
Add Rust dev and test profiles
akaihola Jul 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# This file is autogenerated by maturin v1.5.1
# To update, run
#
# maturin generate-ci github
#
# NOTE(review): hand-written comments in this file will presumably be lost
# when it is regenerated with the command above.
name: CI

# Triggers: pushes to the main/master branches, pushes of tags matching '*',
# pull requests, and manual runs (workflow_dispatch).
on:
push:
branches:
- main
- master
tags:
- '*'
pull_request:
workflow_dispatch:

# The workflow token only gets read access to the repository contents.
permissions:
contents: read

jobs:
# Build Linux wheels: one matrix entry per target architecture, all of them
# on the ubuntu-latest runner.
# NOTE(review): the non-x86_64 targets are presumably cross-compiled by
# maturin-action — confirm against the action's documentation.
linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-latest
target: x86_64
- runner: ubuntu-latest
target: x86
- runner: ubuntu-latest
target: aarch64
- runner: ubuntu-latest
target: armv7
- runner: ubuntu-latest
target: s390x
- runner: ubuntu-latest
target: ppc64le
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# Release build; the wheels are written to the dist/ directory.
args: --release --out dist --find-interpreter
sccache: 'true'
manylinux: auto
# The artifact name starts with "wheels-"; the release job's upload glob
# (wheels-*/*) depends on that prefix.
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-linux-${{ matrix.platform.target }}
path: dist

# Build Windows wheels for the x64 and x86 targets on windows-latest.
windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
# The Python architecture follows the matrix target (x64 or x86).
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# Release build; the wheels are written to the dist/ directory.
args: --release --out dist --find-interpreter
sccache: 'true'
# The artifact name starts with "wheels-"; the release job's upload glob
# (wheels-*/*) depends on that prefix.
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-windows-${{ matrix.platform.target }}
path: dist

# Build macOS wheels: x86_64 on macos-latest and aarch64 on macos-14.
# NOTE(review): confirm which CPU architecture the `macos-latest` label
# currently resolves to; if it is no longer an Intel image, the x86_64 wheel
# is built by cross-compilation rather than natively.
macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-latest
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# Release build; the wheels are written to the dist/ directory.
args: --release --out dist --find-interpreter
sccache: 'true'
# The artifact name starts with "wheels-"; the release job's upload glob
# (wheels-*/*) depends on that prefix.
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}
path: dist

# Build the source distribution with maturin's `sdist` command and upload it
# next to the wheels.
sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist
# Named "wheels-sdist" so that the release job's wheels-*/* glob also picks
# up the source distribution.
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist

# Publish to PyPI. Runs only for tag refs and only after all four build jobs
# (linux, windows, macos, sdist) have succeeded.
release:
name: Release
runs-on: ubuntu-latest
if: "startsWith(github.ref, 'refs/tags/')"
needs: [linux, windows, macos, sdist]
steps:
# No artifact name is given here.
# NOTE(review): per the download-artifact v4 documentation this downloads
# every artifact of the run, each into a directory named after the artifact,
# which is what the wheels-*/* glob below expects — confirm.
- uses: actions/download-artifact@v4
- name: Publish to PyPI
uses: PyO3/maturin-action@v1
env:
# The PyPI token is read from the repository secret PYPI_API_TOKEN.
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
with:
command: upload
args: --non-interactive --skip-existing wheels-*/*
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@
/*.egg-info
/build
/dist
/*.egg
/*.egg

# Added by cargo

/target
34 changes: 34 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Cargo manifest for the Rust extension used by pgtricks.
[package]
# Cargo package name; note that the library name under [lib] is different.
name = "tsv_sort"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[lib]
# NOTE(review): pg_dump_splitsort.py imports `pgtricks._tsv_sort`, while the
# library is named `pgtricks` here. Confirm that the maturin configuration
# (not visible in this manifest) maps this library to that module path.
name = "pgtricks"
# cdylib: build a dynamic library meant to be loaded from another language.
crate-type = ["cdylib"]

[dependencies]
# NOTE(review): how external_sort and serde are used is not visible from this
# manifest; pyo3 provides the Python bindings.
external_sort = "0.1.2"
pyo3 = "0.21.2"
serde = { version = "1.0.200", features = ["derive"] }

[dev-dependencies]
rstest = "0.19.0"

# Development builds keep debug info and debug assertions enabled.
[profile.dev]
debug = true
debug-assertions = true

# Test builds use the same debug settings as development builds.
[profile.test]
debug = true
debug-assertions = true

# Release builds enable link-time optimization and a single codegen unit.
# NOTE(review): presumably this trades longer compile times for faster
# code — confirm with a benchmark.
[profile.release]
lto = true
codegen-units = 1

# Same settings as release, plus debug info.
# NOTE(review): presumably intended for profilers that need symbols.
[profile.profiling]
inherits = "release"
debug = true
76 changes: 0 additions & 76 deletions pgtricks/mergesort.py

This file was deleted.

97 changes: 38 additions & 59 deletions pgtricks/pg_dump_splitsort.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,17 @@

from __future__ import annotations

import functools
import io
import os
import re
from argparse import ArgumentParser
from typing import IO, Iterable, Match, Pattern, cast
from typing import IO, Iterable, Match, Pattern

from pgtricks.mergesort import MergeSort
from pgtricks._tsv_sort import sort_file_lines

COPY_RE = re.compile(r"COPY\s+\S+\s+(\(.*?\)\s+)?FROM\s+stdin;\n$")
KIBIBYTE, MEBIBYTE, GIBIBYTE = 2**10, 2**20, 2**30
MEMORY_UNITS = {"": 1, "k": KIBIBYTE, "m": MEBIBYTE, "g": GIBIBYTE}


def try_float(s1: str, s2: str) -> tuple[str, str] | tuple[float, float]:
    """Convert two strings to floats, or return both unchanged.

    The conversion is all-or-nothing: either both values come back as floats
    or both come back as the original strings, so the returned pair can
    always be compared with ``<`` and ``>`` without mixing types.

    :param s1: The first field value
    :param s2: The second field value
    :return: ``(float(s1), float(s2))`` if both strings parse as floats,
             otherwise ``(s1, s2)``

    """
    numeric_first_chars = '0123456789.-'
    if (
        not s1
        or not s2
        or s1[0] not in numeric_first_chars
        or s2[0] not in numeric_first_chars
    ):
        # Optimization: skip the costly exception path for values which
        # obviously can't be numbers.
        return s1, s2
    try:
        return float(s1), float(s2)
    except ValueError:
        return s1, s2


def linecomp(l1: str, l2: str) -> int:
    """Compare two tab-separated lines field by field.

    Corresponding fields are compared as numbers when both of them parse as
    floats, and as strings otherwise. The first pair of fields which differs
    decides the result. If all fields present in both lines are equal, the
    lines compare equal, even if one line has more fields than the other.

    The fields are walked in a loop instead of recursing once per field, so
    lines with more equal leading fields than the interpreter's recursion
    limit can be compared without a ``RecursionError``.

    :param l1: The first line
    :param l2: The second line
    :return: ``-1`` if ``l1`` sorts before ``l2``, ``1`` if it sorts after,
             and ``0`` if the lines compare equal

    """
    # `zip` stops at the shorter line, so surplus fields are ignored.
    for field1, field2 in zip(l1.split('\t'), l2.split('\t')):
        # TODO: unquote cast after support for Python 3.8 is dropped
        v1, v2 = cast("tuple[float, float]", try_float(field1, field2))
        result = (v1 > v2) - (v1 < v2)
        if result:
            return result
    return 0

DATA_COMMENT_RE = re.compile('-- Data for Name: (?P<table>.*?); '
'Type: TABLE DATA; '
'Schema: (?P<schema>.*?);')
Expand Down Expand Up @@ -86,43 +61,47 @@ def new_output(filename: str) -> IO[str]:
output.close()
return open(os.path.join(directory, filename), 'w')

sorted_data_lines: MergeSort | None = None
inside_sql_copy: bool = False
counter = 0
output = new_output('0000_prologue.sql')
matcher = Matcher()

for line in open(sql_filepath):
if sorted_data_lines is None:
if line in ('\n', '--\n'):
buf.append(line)
elif line.startswith('SET search_path = '):
writelines([line])
else:
if matcher.match(DATA_COMMENT_RE, line):
counter += 1
output = new_output(
'{counter:04}_{schema}.{table}.sql'.format(
counter=counter,
schema=matcher.group('schema'),
table=matcher.group('table')))
elif COPY_RE.match(line):
sorted_data_lines = MergeSort(
key=functools.cmp_to_key(linecomp),
max_memory=max_memory,
)
elif SEQUENCE_SET_RE.match(line):
pass
elif 1 <= counter < 9999:
counter = 9999
output = new_output('%04d_epilogue.sql' % counter)
writelines([line])
else:
if line == "\\.\n":
writelines(sorted_data_lines)
writelines(line)
sorted_data_lines = None
position = 0
with open(sql_filepath) as sql_file:
while True:
line = sql_file.readline()
if not line:
break
if not inside_sql_copy:
if line in ('\n', '--\n'):
buf.append(line)
elif line.startswith('SET search_path = '):
writelines([line])
else:
if matcher.match(DATA_COMMENT_RE, line):
counter += 1
output = new_output(
'{counter:04}_{schema}.{table}.sql'.format(
counter=counter,
schema=matcher.group('schema'),
table=matcher.group('table')))
elif COPY_RE.match(line):
inside_sql_copy = True
elif SEQUENCE_SET_RE.match(line):
pass
elif 1 <= counter < 9999:
counter = 9999
output = new_output('%04d_epilogue.sql' % counter)
writelines([line])
else:
sorted_data_lines.append(line)
if line != "\\.\n": # don't bother with empty COPY statements
output.close()
position_after_sql_copy = sort_file_lines(sql_filepath, output.name, position)
# print(f"sort_file_lines({sql_filepath!r}, {output.name!r}, {position}) == {position_after_sql_copy}")
sql_file.seek(position_after_sql_copy)
output = open(output.name, "a")
inside_sql_copy = False
position = sql_file.tell()
flush()


Expand Down
Loading