From 02ddd33e2d95450ee672cb5cf1ca3f81378868dc Mon Sep 17 00:00:00 2001
From: Ben Bernard
Date: Sun, 22 Feb 2026 06:12:07 -0800
Subject: [PATCH 1/5] feat: address open GitHub issues and parsedate PR

Fix #71: Unicode/newline handling in totable and toptable
- Use string-width package for proper visual width of CJK chars and emoji
- Escape newlines, tabs, and backslashes in table cell values

Fix #81: Add --only flag to decollate
- New -o/--only option outputs only deaggregated fields, excluding original record fields

Fix #59: Add file output to multiplex
- New --output-file-key/-o and --output-file-eval/-O options
- Supports {{key}} interpolation in file paths
- Creates directories automatically

Fix #86: Add fromxls input operation
- Reads xls/xlsx/xlsb/xlsm files using xlsx library
- Supports --sheet, --all-sheets, --no-header, --key/--field options

Implement parsedate operation (closes PR #74)
- TypeScript implementation of the parsedate transform from the old Perl PR
- Supports --key, --output, --format, --output-format, --epoch, --output-epoch, --timezone options
- Custom strftime-compatible parsing and formatting

Fix #65: Add empty stream tests for collate
- Confirms collate handles empty input without crashing for all aggregator types (avg, count, sum, max, min)

Update man pages, operation registry, dispatcher, and test counts.

Co-Authored-By: Claude Opus 4.6
---
 bun.lock                                      |  28 ++
 man/man1/recs-decollate.1                     |  11 +
 man/man1/recs-fromxls.1                       |  60 ++++
 man/man1/recs-multiplex.1                     |  14 +
 man/man1/recs-parsedate.1                     |  66 ++++
 man/man1/recs.1                               |   6 +
 package.json                                  |   4 +-
 src/cli/dispatcher.ts                         |   4 +
 src/cli/operation-registry.ts                 |   4 +
 src/operations/input/fromxls.ts               | 196 +++++++++++
 src/operations/output/toptable.ts             |  11 +-
 src/operations/output/totable.ts              |  39 +-
 src/operations/transform/decollate.ts         |  42 ++-
 src/operations/transform/index.ts             |   1 +
 src/operations/transform/multiplex.ts         | 101 +++++-
 src/operations/transform/parsedate.ts         | 333 ++++++++++++++++++
 tests/cli/help.test.ts                        |   4 +-
 tests/cli/manpages.test.ts                    |   8 +-
 .../operations/output/totable-unicode.test.ts |  50 +++
 .../transform/collate-empty-stream.test.ts    |  68 ++++
 .../transform/decollate-only.test.ts          |  59 ++++
 tests/operations/transform/parsedate.test.ts  | 105 ++++++
 22 files changed, 1183 insertions(+), 31 deletions(-)
 create mode 100644 man/man1/recs-fromxls.1
 create mode 100644 man/man1/recs-parsedate.1
 create mode 100644 src/operations/input/fromxls.ts
 create mode 100644 src/operations/transform/parsedate.ts
 create mode 100644 tests/operations/output/totable-unicode.test.ts
 create mode 100644 tests/operations/transform/collate-empty-stream.test.ts
 create mode 100644 tests/operations/transform/decollate-only.test.ts
 create mode 100644 tests/operations/transform/parsedate.test.ts

diff --git a/bun.lock b/bun.lock
index 49850c4f..3f10d151 100644
--- a/bun.lock
+++ b/bun.lock
@@ -10,6 +10,8 @@
       "mongodb": "^7.1.0",
       "openai": "^6.22.0",
       "papaparse": "^5.5.3",
+      "string-width": "^8.2.0",
+      "xlsx": "^0.18.5",
     },
     "devDependencies": {
       "@types/better-sqlite3": "^7.6.13",
@@ -295,8 +297,12 @@ "@vueuse/shared": ["@vueuse/shared@12.8.2", "", { "dependencies": { "vue": "^3.5.13" } }, "sha512-dznP38YzxZoNloI0qpEfpkms8knDtaoQ6Y/sfS0L7Yki4zh40LFHEhur0odJC6xTHG5dxWVPiUWBXn+wCG2s5w=="], + "adler-32": ["adler-32@1.3.1", "", {}, "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A=="], + "algoliasearch": ["algoliasearch@5.49.0", "", { "dependencies": { "@algolia/abtesting": "1.15.0", "@algolia/client-abtesting": 
"5.49.0", "@algolia/client-analytics": "5.49.0", "@algolia/client-common": "5.49.0", "@algolia/client-insights": "5.49.0", "@algolia/client-personalization": "5.49.0", "@algolia/client-query-suggestions": "5.49.0", "@algolia/client-search": "5.49.0", "@algolia/ingestion": "1.49.0", "@algolia/monitoring": "1.49.0", "@algolia/recommend": "5.49.0", "@algolia/requester-browser-xhr": "5.49.0", "@algolia/requester-fetch": "5.49.0", "@algolia/requester-node-http": "5.49.0" } }, "sha512-Tse7vx7WOvbU+kpq/L3BrBhSWTPbtMa59zIEhMn+Z2NoxZlpcCRUDCRxQ7kDFs1T3CHxDgvb+mDuILiBBpBaAA=="], + "ansi-regex": ["ansi-regex@6.2.2", "", {}, "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg=="], + "base64-js": ["base64-js@1.5.1", "", {}, "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA=="], "better-sqlite3": ["better-sqlite3@12.6.2", "", { "dependencies": { "bindings": "^1.5.0", "prebuild-install": "^7.1.1" } }, "sha512-8VYKM3MjCa9WcaSAI3hzwhmyHVlH8tiGFwf0RlTsZPWJ1I5MkzjiudCo4KC4DxOaL/53A5B1sI/IbldNFDbsKA=="], @@ -315,16 +321,22 @@ "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="], + "cfb": ["cfb@1.2.2", "", { "dependencies": { "adler-32": "~1.3.0", "crc-32": "~1.2.0" } }, "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA=="], + "character-entities-html4": ["character-entities-html4@2.1.0", "", {}, "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA=="], "character-entities-legacy": ["character-entities-legacy@3.0.0", "", {}, "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ=="], "chownr": ["chownr@1.1.4", "", {}, "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="], + "codepage": ["codepage@1.15.0", "", {}, "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA=="], + "comma-separated-tokens": ["comma-separated-tokens@2.0.3", "", {}, "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg=="], "copy-anything": ["copy-anything@4.0.5", "", { "dependencies": { "is-what": "^5.2.0" } }, "sha512-7Vv6asjS4gMOuILabD3l739tsaxFQmC+a7pLZm02zyvs8p977bL3zEgq3yDk5rn9B0PbYgIv++jmHcuUab4RhA=="], + "crc-32": ["crc-32@1.2.2", "", { "bin": { "crc32": "bin/crc32.njs" } }, "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ=="], + "csstype": ["csstype@3.2.3", "", {}, "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ=="], "decompress-response": ["decompress-response@6.0.0", "", { "dependencies": { "mimic-response": "^3.1.0" } }, "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ=="], @@ -355,10 +367,14 @@ "focus-trap": ["focus-trap@7.8.0", "", { "dependencies": { "tabbable": "^6.4.0" } }, "sha512-/yNdlIkpWbM0ptxno3ONTuf+2g318kh2ez3KSeZN5dZ8YC6AAmgeWz+GasYYiBJPFaYcSAPeu4GfhUaChzIJXA=="], + "frac": ["frac@1.1.2", "", {}, "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA=="], + "fs-constants": ["fs-constants@1.0.0", "", {}, "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="], "fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, 
"sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="], + "get-east-asian-width": ["get-east-asian-width@1.5.0", "", {}, "sha512-CQ+bEO+Tva/qlmw24dCejulK5pMzVnUOFOijVogd3KQs07HnRIgp8TGipvCCRT06xeYEbpbgwaCxglFyiuIcmA=="], + "github-from-package": ["github-from-package@0.0.0", "", {}, "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw=="], "hast-util-to-html": ["hast-util-to-html@9.0.5", "", { "dependencies": { "@types/hast": "^3.0.0", "@types/unist": "^3.0.0", "ccount": "^2.0.0", "comma-separated-tokens": "^2.0.0", "hast-util-whitespace": "^3.0.0", "html-void-elements": "^3.0.0", "mdast-util-to-hast": "^13.0.0", "property-information": "^7.0.0", "space-separated-tokens": "^2.0.0", "stringify-entities": "^4.0.0", "zwitch": "^2.0.4" } }, "sha512-OguPdidb+fbHQSU4Q4ZiLKnzWo8Wwsf5bZfbvu7//a9oTYoqD/fWpe96NuHkoS9h0ccGOTe0C4NGXdtS0iObOw=="], @@ -497,10 +513,16 @@ "speakingurl": ["speakingurl@14.0.1", "", {}, "sha512-1POYv7uv2gXoyGFpBCmpDVSNV74IfsWlDW216UPjbWufNf+bSU6GdbDsxdcxtfwb4xlI3yxzOTKClUosxARYrQ=="], + "ssf": ["ssf@0.11.2", "", { "dependencies": { "frac": "~1.1.2" } }, "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g=="], + + "string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], + "string_decoder": ["string_decoder@1.3.0", "", { "dependencies": { "safe-buffer": "~5.2.0" } }, "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA=="], "stringify-entities": ["stringify-entities@4.0.4", "", { "dependencies": { "character-entities-html4": "^2.0.0", "character-entities-legacy": "^3.0.0" } }, "sha512-IwfBptatlO+QCJUo19AqvrPNqlVMpW9YEL2LIVY+Rpv2qsjCGxaDLNRgeGsQWJhfItebuJhsGSLjaBbNSQ+ieg=="], + "strip-ansi": ["strip-ansi@7.1.2", "", { "dependencies": { "ansi-regex": "^6.0.1" } }, "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA=="], + "strip-json-comments": ["strip-json-comments@2.0.1", "", {}, "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ=="], "strnum": ["strnum@2.1.2", "", {}, "sha512-l63NF9y/cLROq/yqKXSLtcMeeyOfnSQlfMSlzFt/K73oIaD8DGaQWd7Z34X9GPiKqP5rbSh84Hl4bOlLcjiSrQ=="], @@ -549,8 +571,14 @@ "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="], + "wmf": ["wmf@1.0.2", "", {}, "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw=="], + + "word": ["word@0.3.0", "", {}, "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA=="], + "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], + "xlsx": ["xlsx@0.18.5", "", { "dependencies": { "adler-32": "~1.3.0", "cfb": "~1.2.1", "codepage": "~1.15.0", "crc-32": "~1.2.1", "ssf": "~0.11.2", "wmf": "~1.0.1", "word": "~0.3.0" }, "bin": { "xlsx": "bin/xlsx.njs" } }, "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ=="], + "zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="], } } diff --git 
a/man/man1/recs-decollate.1 b/man/man1/recs-decollate.1
index 1554dcdb..b9911d5f 100644
--- a/man/man1/recs-decollate.1
+++ b/man/man1/recs-decollate.1
@@ -15,6 +15,9 @@ Reverse of collate: takes a single record and produces multiple records using de
 \fB--deaggregator\fR, \fB-d\fR \fI<deaggregators>\fR
 Deaggregator specification (colon\-separated).
 .TP
+\fB--only\fR, \fB-o\fR
+Only output deaggregated fields, excluding original record fields. Useful when you only want the expanded data, not the source record.
+.TP
 \fB--list-deaggregators\fR
 List available deaggregators and exit.

@@ -27,6 +30,14 @@ Split the \'hosts\' field into individual \'host\' fields
 .fi
 .RE

+Decollate and only keep deaggregated fields
+.PP
+.RS 4
+.nf
+\fBrecs decollate --only -d \'unarray,items,,item\'\fR
+.fi
+.RE
+
 .SH SEE ALSO
 \fBrecs\-collate\fR(1)

diff --git a/man/man1/recs-fromxls.1 b/man/man1/recs-fromxls.1
new file mode 100644
index 00000000..621bc66a
--- /dev/null
+++ b/man/man1/recs-fromxls.1
@@ -0,0 +1,60 @@
+.TH RECS\-FROMXLS 1 "2026-02-22" "recs 0.1.0" "RecordStream Manual"
+
+.SH NAME
+recs\-fromxls \- Parse Excel files (xls, xlsx, xlsb, xlsm) into records
+
+.SH SYNOPSIS
+.B recs fromxls [options] <files>
+
+.SH DESCRIPTION
+Parse Excel files (xls, xlsx, xlsb, xlsm) into records. By default, reads the first sheet and uses the first row as header names.
+.PP
+
+.SH OPTIONS
+.TP
+\fB--key\fR, \fB-k\fR \fI<keys>\fR
+Comma separated list of field names. Overrides header detection.
+.TP
+\fB--field\fR, \fB-f\fR \fI<fields>\fR
+Comma separated list of field names. Overrides header detection.
+.TP
+\fB--no-header\fR, \fB-n\fR
+Do not treat the first row as a header. Fields will be named numerically (0, 1, 2, ...).
+.TP
+\fB--sheet\fR, \fB-s\fR \fI<name>\fR
+Specify a sheet name to read. Defaults to the first sheet.
+.TP
+\fB--all-sheets\fR
+Read all sheets in the workbook, adding a \'sheet\' field to each record.
+
+.SH EXAMPLES
+Read an Excel file using headers from the first row
+.PP
+.RS 4
+.nf
+\fBrecs fromxls data.xlsx\fR
+.fi
+.RE
+
+Read a specific sheet without headers
+.PP
+.RS 4
+.nf
+\fBrecs fromxls --sheet \'Sheet2\' --no-header -k name,value data.xlsx\fR
+.fi
+.RE
+
+Read all sheets
+.PP
+.RS 4
+.nf
+\fBrecs fromxls --all-sheets data.xlsx\fR
+.fi
+.RE
+
+.SH SEE ALSO
+\fBrecs\-fromcsv\fR(1)
+
+.SH AUTHOR
+Benjamin Bernard
+
diff --git a/man/man1/recs-multiplex.1 b/man/man1/recs-multiplex.1
index 225f74d2..33b3bac6 100644
--- a/man/man1/recs-multiplex.1
+++ b/man/man1/recs-multiplex.1
@@ -21,6 +21,12 @@ Domain language key: name=expression where the expression evaluates as a valuati
 \fB--line-key\fR, \fB-L\fR \fI<key>\fR
 Use the value of this key as line input for the nested operation (rather than the entire record). Use with recs from* operations generally.
 .TP
+\fB--output-file-key\fR, \fB-o\fR \fI<key>\fR
+Write each group\'s output to a separate file, using the value of the given key as the filename.
+.TP
+\fB--output-file-eval\fR, \fB-O\fR \fI<expr>\fR
+Write each group\'s output to a separate file, with filename determined by the given expression. Supports {{key}} interpolation with group key values.
+.TP
 \fB--adjacent\fR, \fB-1\fR
 Only group together adjacent records. Avoids spooling records into memory.
 .TP
@@ -59,6 +65,14 @@ Separate out a stream of text by PID into separate invocations of an operation
 .fi
 .RE

+Write each group\'s CSV output to separate files by department
+.PP
+.RS 4
+.nf
+\fBrecs multiplex -k department -O \'output-{{department}}.csv\' -- recs tocsv\fR
+.fi
+.RE
+
 .SH SEE ALSO
 \fBrecs\-collate\fR(1), \fBrecs\-chain\fR(1)

diff --git a/man/man1/recs-parsedate.1 b/man/man1/recs-parsedate.1
new file mode 100644
index 00000000..f8883652
--- /dev/null
+++ b/man/man1/recs-parsedate.1
@@ -0,0 +1,66 @@
+.TH RECS\-PARSEDATE 1 "2026-02-22" "recs 0.1.0" "RecordStream Manual"
+
+.SH NAME
+recs\-parsedate \- Parse date/time strings from a field and output them in a normalized format
+
+.SH SYNOPSIS
+.B recs parsedate [options] [files...]
+
+.SH DESCRIPTION
+Parse date/time strings from a field and output them in a normalized format. Supports epoch seconds, ISO 8601, and custom strftime\-style format strings for both input and output.
+.PP
+
+.SH OPTIONS
+.TP
+\fB--key\fR, \fB-k\fR \fI<key>\fR
+Key field containing the date/time string to parse.
+.TP
+\fB--output\fR, \fB-o\fR \fI<key>\fR
+Output key for the parsed date (defaults to \'parsed_<key>\').
+.TP
+\fB--format\fR, \fB-f\fR \fI<format>\fR
+Input format for parsing (strftime\-style). Common directives: %Y (4\-digit year), %m (month), %d (day), %H (hour 24h), %M (minute), %S (second), %b (abbreviated month name), %p (AM/PM).
+.TP
+\fB--output-format\fR, \fB-F\fR \fI<format>\fR
+Output format (strftime\-style). Defaults to ISO 8601 if not specified.
+.TP
+\fB--epoch\fR, \fB-e\fR
+Input date is in epoch seconds.
+.TP
+\fB--output-epoch\fR, \fB-E\fR
+Output as epoch seconds instead of a formatted string.
+.TP
+\fB--timezone\fR, \fB-z\fR \fI<timezone>\fR
+Timezone for output formatting (IANA name like \'America/New_York\').
+
+.SH EXAMPLES
+Parse dates and output as epoch seconds
+.PP
+.RS 4
+.nf
+\fBrecs parsedate -k date -E\fR
+.fi
+.RE
+
+Parse a custom date format and reformat it
+.PP
+.RS 4
+.nf
+\fBrecs parsedate -k timestamp -f \'%d/%b/%Y:%H:%M:%S\' -F \'%Y-%m-%d %H:%M:%S\'\fR
+.fi
+.RE
+
+Convert epoch seconds to ISO 8601
+.PP
+.RS 4
+.nf
+\fBrecs parsedate -k time -e -o iso_time\fR
+.fi
+.RE
+
+.SH SEE ALSO
+\fBrecs\-normalizetime\fR(1)
+
+.SH AUTHOR
+Benjamin Bernard
+
diff --git a/man/man1/recs.1 b/man/man1/recs.1
index 5253ba38..4b09d800 100644
--- a/man/man1/recs.1
+++ b/man/man1/recs.1
@@ -53,6 +53,9 @@ Runs tcpdump and puts out records, one for each packet
 \fBrecs fromxferlog\fR
 Each line of input (or lines of <files>) is parsed as an FTP transfer log (xferlog format) to produce an output record
 .TP
+\fBrecs fromxls\fR
+Parse Excel files (xls, xlsx, xlsb, xlsm) into records
+.TP
 \fBrecs fromxml\fR
 Reads either from STDIN or from the specified URIs

@@ -100,6 +103,9 @@ Take records, grouped together by \-\-keys, and run a separate operation instanc
 \fBrecs normalizetime\fR
 Given a single key field containing a date/time value, construct a normalized version of the value and place it into a field named \'n_<key>\'
 .TP
+\fBrecs parsedate\fR
+Parse date/time strings from a field and output them in a normalized format
+.TP
 \fBrecs sort\fR
 Sort records from input or from files
 .TP
diff --git a/package.json b/package.json
index 12d625db..7ec4f891 100644
--- a/package.json
+++ b/package.json
@@ -32,6 +32,8 @@
     "fast-xml-parser": "^5.3.7",
     "mongodb": "^7.1.0",
     "openai": "^6.22.0",
-    "papaparse": "^5.5.3"
+    "papaparse": "^5.5.3",
+    "string-width": "^8.2.0",
+    "xlsx": "^0.18.5"
   }
 }
diff --git a/src/cli/dispatcher.ts b/src/cli/dispatcher.ts
index f4f6962c..8dcd9875 100644
--- a/src/cli/dispatcher.ts
+++ b/src/cli/dispatcher.ts
@@ -20,6 +20,7 @@ import { FromRe } from "../operations/input/fromre.ts";
 import { FromSplit } from "../operations/input/fromsplit.ts";
 import { FromTcpdump } from "../operations/input/fromtcpdump.ts";
 import { FromXferlog } from "../operations/input/fromxferlog.ts";
+import { FromXls } from "../operations/input/fromxls.ts";
 import { FromXml } from "../operations/input/fromxml.ts";

 // -- Transform operations --
@@ -40,6 +41,7 @@ import { JoinOperation } from "../operations/transform/join.ts";
 import { CollateOperation } from "../operations/transform/collate.ts";
 import { DecollateOperation } from "../operations/transform/decollate.ts";
 import { ExpandJsonOperation } from "../operations/transform/expandjson.ts";
+import { ParseDateOperation } from "../operations/transform/parsedate.ts";
 import { ChainOperation, registerOperationFactory } from "../operations/transform/chain.ts";
 import { MultiplexOperation } from "../operations/transform/multiplex.ts";

@@ -75,6 +77,7 @@ const operationRegistry = new Map([
   ["fromsplit", FromSplit],
   ["fromtcpdump", FromTcpdump],
   ["fromxferlog", FromXferlog],
+  ["fromxls", FromXls],
   ["fromxml", FromXml],
   // Transform
   ["grep", GrepOperation],
@@ -94,6 +97,7 @@ const operationRegistry = new Map([
   ["join", JoinOperation],
   ["collate", CollateOperation],
   ["decollate", DecollateOperation],
+  ["parsedate", ParseDateOperation],
   ["chain", ChainOperation],
   ["multiplex", MultiplexOperation],
   // Output
diff --git a/src/cli/operation-registry.ts b/src/cli/operation-registry.ts
index 96c4a6c4..0c1ca5fc 100644
--- a/src/cli/operation-registry.ts
+++ b/src/cli/operation-registry.ts
@@ -22,6 +22,7 @@ import { documentation as fromre } from "../operations/input/fromre.ts";
 import { documentation as fromsplit } from "../operations/input/fromsplit.ts";
 import { documentation as fromtcpdump } from "../operations/input/fromtcpdump.ts";
 import { documentation as fromxferlog } from "../operations/input/fromxferlog.ts";
+import { documentation as fromxls } from "../operations/input/fromxls.ts";
 import { documentation as fromxml } from "../operations/input/fromxml.ts";

 // ── Transform operations ────────────────────────────────────────
@@ -39,6 +40,7 @@ import { documentation as grep } from "../operations/transform/grep.ts";
 import { documentation as join } from "../operations/transform/join.ts";
 import { documentation as multiplex } from "../operations/transform/multiplex.ts";
 import { documentation as normalizetime } from "../operations/transform/normalizetime.ts";
+import { documentation as parsedate } from "../operations/transform/parsedate.ts";
 import { documentation as sort } from "../operations/transform/sort.ts";
 import { documentation as stream2table } from "../operations/transform/stream2table.ts";
 import { documentation as substream } from "../operations/transform/substream.ts";
@@ -72,6 +74,7 @@ export const allDocs: CommandDoc[] = [
   fromsplit,
   fromtcpdump,
   fromxferlog,
+  fromxls,
   fromxml,
   // Transform
   annotate,
@@ -88,6 +91,7 @@ export const allDocs: CommandDoc[] = [
   join,
   multiplex,
   normalizetime,
+  parsedate,
   sort,
   stream2table,
   substream,
diff --git a/src/operations/input/fromxls.ts b/src/operations/input/fromxls.ts
new file mode 100644
index 00000000..856ea90a
--- /dev/null
+++ b/src/operations/input/fromxls.ts
@@ -0,0 +1,196 @@
+import { readFileSync } from "node:fs";
+import { read, utils } from "xlsx";
+import { Operation, type OptionDef } from "../../Operation.ts";
+import { Record } from "../../Record.ts";
+import { setKey } from "../../KeySpec.ts";
+
+/**
+ * Parse Excel files (xls, xlsx, xlsb, xlsm) into records.
+ * Each row becomes a record with field names taken from headers or numeric indices.
+ */
+export class FromXls extends Operation {
+  fields: string[] = [];
+  headerLine = true;
+  sheet: string | null = null;
+  allSheets = false;
+  extraArgs: string[] = [];
+
+  init(args: string[]): void {
+    const defs: OptionDef[] = [
+      {
+        long: "key",
+        short: "k",
+        type: "string",
+        handler: (v) => {
+          this.fields.push(...(v as string).split(","));
+        },
+        description: "Comma separated list of field names",
+      },
+      {
+        long: "field",
+        short: "f",
+        type: "string",
+        handler: (v) => {
+          this.fields.push(...(v as string).split(","));
+        },
+        description: "Comma separated list of field names",
+      },
+      {
+        long: "no-header",
+        short: "n",
+        type: "boolean",
+        handler: () => { this.headerLine = false; },
+        description: "Do not treat the first row as a header line",
+      },
+      {
+        long: "sheet",
+        short: "s",
+        type: "string",
+        handler: (v) => { this.sheet = v as string; },
+        description: "Sheet name to read (defaults to first sheet)",
+      },
+      {
+        long: "all-sheets",
+        type: "boolean",
+        handler: () => { this.allSheets = true; },
+        description: "Read all sheets, adding a 'sheet' field to each record",
+      },
+    ];
+
+    this.extraArgs = this.parseOptions(args, defs);
+
+    if (this.extraArgs.length === 0) {
+      throw new Error("fromxls requires at least one file argument");
+    }
+  }
+
+  acceptRecord(_record: Record): boolean {
+    return true;
+  }
+
+  override wantsInput(): boolean {
+    return false;
+  }
+
+  override streamDone(): void {
+    for (const file of this.extraArgs) {
+      this.updateCurrentFilename(file);
+      this.parseFile(file);
+    }
+  }
+
+  parseFile(file: string): void {
+    const buffer = readFileSync(file);
+    const workbook = read(buffer, { type: "buffer" });
+
+    const sheetNames = this.allSheets
+      ? workbook.SheetNames
+      : [this.sheet ?? workbook.SheetNames[0]!];
+
+    for (const sheetName of sheetNames) {
+      const sheet = workbook.Sheets[sheetName];
+      if (!sheet) {
+        throw new Error(`Sheet '${sheetName}' not found in ${file}`);
+      }
+
+      const rows = utils.sheet_to_json(sheet, {
+        header: 1,
+        defval: "",
+        raw: false,
+      }) as string[][];
+
+      if (rows.length === 0) continue;
+
+      let fields = [...this.fields];
+      let startRow = 0;
+
+      if (this.headerLine && fields.length === 0) {
+        fields = rows[0]!.map((v) => String(v).trim());
+        startRow = 1;
+      }
+
+      for (let i = startRow; i < rows.length; i++) {
+        const row = rows[i]!;
+        // Skip entirely empty rows
+        if (row.every((v) => String(v).trim() === "")) continue;
+
+        const record = new Record();
+        const data = record.dataRef();
+
+        for (let j = 0; j < row.length; j++) {
+          const key = fields[j] ?? String(j);
+          const val = String(row[j] ?? "");
+          // Try to parse numbers
+          const num = Number(val);
+          if (val !== "" && !isNaN(num) && isFinite(num)) {
+            setKey(data, key, num);
+          } else {
+            setKey(data, key, val);
+          }
+        }
+
+        if (this.allSheets) {
+          setKey(data, "sheet", sheetName);
+        }
+
+        this.pushRecord(record);
+      }
+    }
+  }
+}
+
+import type { CommandDoc } from "../../types/CommandDoc.ts";
+
+export const documentation: CommandDoc = {
+  name: "fromxls",
+  category: "input",
+  synopsis: "recs fromxls [options] <files>",
+  description:
+    "Parse Excel files (xls, xlsx, xlsb, xlsm) into records. " +
+    "By default, reads the first sheet and uses the first row as header names.",
+  options: [
+    {
+      flags: ["--key", "-k"],
+      argument: "<keys>",
+      description:
+        "Comma separated list of field names. Overrides header detection.",
+    },
+    {
+      flags: ["--field", "-f"],
+      argument: "<fields>",
+      description:
+        "Comma separated list of field names. Overrides header detection.",
+    },
+    {
+      flags: ["--no-header", "-n"],
+      description:
+        "Do not treat the first row as a header. Fields will be named numerically (0, 1, 2, ...).",
+    },
+    {
+      flags: ["--sheet", "-s"],
+      argument: "<name>",
+      description:
+        "Specify a sheet name to read. Defaults to the first sheet.",
+    },
+    {
+      flags: ["--all-sheets"],
+      description:
+        "Read all sheets in the workbook, adding a 'sheet' field to each record.",
+    },
+  ],
+  examples: [
+    {
+      description: "Read an Excel file using headers from the first row",
+      command: "recs fromxls data.xlsx",
+    },
+    {
+      description: "Read a specific sheet without headers",
+      command: "recs fromxls --sheet 'Sheet2' --no-header -k name,value data.xlsx",
+    },
+    {
+      description: "Read all sheets",
+      command: "recs fromxls --all-sheets data.xlsx",
+    },
+  ],
+  seeAlso: ["fromcsv"],
+};
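[Editor's note: for a sense of how fromxls maps rows to records, here is a standalone sketch of the same row-to-record conversion, using the xlsx calls from parseFile() above. "data.xlsx" is a placeholder path; this is an illustration, not part of the patch.]

```ts
import { readFileSync } from "node:fs";
import { read, utils } from "xlsx";

// Read the first sheet as an array of rows, as parseFile() does.
const workbook = read(readFileSync("data.xlsx"), { type: "buffer" });
const sheet = workbook.Sheets[workbook.SheetNames[0]!]!;
const rows = utils.sheet_to_json(sheet, { header: 1, defval: "", raw: false }) as string[][];

// The first row becomes the field names; numeric-looking cells become numbers.
const [header, ...body] = rows;
const records = body.map((row) =>
  Object.fromEntries(
    row.map((cell, i) => {
      const num = Number(cell);
      const value = cell !== "" && !isNaN(num) && isFinite(num) ? num : cell;
      return [header![i] ?? String(i), value];
    })
  )
);
console.log(records);
```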
diff --git a/src/operations/output/toptable.ts b/src/operations/output/toptable.ts
index db305543..95e1f64c 100644
--- a/src/operations/output/toptable.ts
+++ b/src/operations/output/toptable.ts
@@ -1,3 +1,4 @@
+import stringWidth from "string-width";
 import { Record } from "../../Record.ts";
 import { Operation, type RecordReceiver, type OptionDef } from "../../Operation.ts";
 import { Accumulator } from "../../Accumulator.ts";
@@ -5,6 +6,10 @@ import { KeyGroups } from "../../KeyGroups.ts";
 import { KeySpec } from "../../KeySpec.ts";
 import type { JsonObject } from "../../types/json.ts";

+function displayWidth(str: string): number {
+  return stringWidth(str);
+}
+
 /**
  * A node in the values tree used to track unique value tuples.
  * [0] = children map, [1] = insertion-order keys, [2] = assigned index (-1 if unset)
@@ -433,14 +438,14 @@ export class ToPtable extends Operation {
       table[heightOffset + j]![widthOffset + i] = v;
     }

-    // Calculate column widths
+    // Calculate column widths using visual display width
     const colWidths: number[] = [];
     for (const row of table) {
       while (colWidths.length < row.length) {
         colWidths.push(0);
       }
       for (let i = 0; i < row.length; i++) {
-        const l = row[i]!.length;
+        const l = displayWidth(row[i]!);
         if (l > colWidths[i]!) {
           colWidths[i] = l;
         }
@@ -615,7 +620,7 @@ function formatTableRow(
   let s = delim;
   for (let i = 0; i < widths.length; i++) {
     let cell = cellFn(i, widths[i]!);
-    cell += " ".repeat(Math.max(0, widths[i]! - cell.length));
+    cell += " ".repeat(Math.max(0, widths[i]! - displayWidth(cell)));
     s += cell + delim;
   }
   return s;
diff --git a/src/operations/output/totable.ts b/src/operations/output/totable.ts
index c04d5d9a..0ee9cf8d 100644
--- a/src/operations/output/totable.ts
+++ b/src/operations/output/totable.ts
@@ -1,9 +1,30 @@
+import stringWidth from "string-width";
 import { Record } from "../../Record.ts";
 import { Operation, type RecordReceiver, type OptionDef } from "../../Operation.ts";
 import { Accumulator } from "../../Accumulator.ts";
 import { KeyGroups } from "../../KeyGroups.ts";
 import { findKey } from "../../KeySpec.ts";

+/**
+ * Compute the visual display width of a string, accounting for
+ * East Asian wide characters, emoji, and other Unicode oddities.
+ */
+function displayWidth(str: string): number {
+  return stringWidth(str);
+}
+
+/**
+ * Escape control characters (newlines, tabs, etc.) in a string so
+ * they don't break table alignment.
+ */
+function escapeForTable(str: string): string {
+  return str
+    .replace(/\\/g, "\\\\")
+    .replace(/\n/g, "\\n")
+    .replace(/\r/g, "\\r")
+    .replace(/\t/g, "\\t");
+}
+
 /**
  * Pretty prints records as an ASCII table with column alignment.
  * Reads the entire record stream to determine column sizes.
@@ -99,8 +120,9 @@ export class ToTable extends Operation {
       }
       const val = this.extractField(record, field);
       const currentMax = widths.get(field)!;
-      if (val.length > currentMax) {
-        widths.set(field, val.length);
+      const w = displayWidth(val);
+      if (w > currentMax) {
+        widths.set(field, w);
       }
     }
   }
@@ -109,8 +131,9 @@
     if (!this.noHeader) {
       for (const field of fields) {
         const currentMax = widths.get(field)!;
-        if (field.length > currentMax) {
-          widths.set(field, field.length);
+        const w = displayWidth(field);
+        if (w > currentMax) {
+          widths.set(field, w);
         }
       }
     }
@@ -179,8 +202,8 @@
     for (const field of fields) {
       let fieldStr = formatter(field, field);

-      if (!this.spreadsheet && fieldStr.length < widths.get(field)!) {
-        fieldStr += " ".repeat(widths.get(field)! - fieldStr.length);
+      if (!this.spreadsheet && displayWidth(fieldStr) < widths.get(field)!) {
+        fieldStr += " ".repeat(widths.get(field)! - displayWidth(fieldStr));
       }

       if (first) {
@@ -199,8 +222,8 @@
     const data = record.dataRef();
     const val = findKey(data, field, true);
     if (val === undefined || val === null) return "";
-    if (typeof val === "object") return JSON.stringify(val);
-    return String(val);
+    if (typeof val === "object") return escapeForTable(JSON.stringify(val));
+    return escapeForTable(String(val));
   }

   override doesRecordOutput(): boolean {
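[Editor's note: the reason for swapping `.length` for stringWidth — JavaScript string length counts UTF-16 code units, not terminal columns. A quick check with the same string-width package:]

```ts
import stringWidth from "string-width";

console.log("你好".length, stringWidth("你好"));     // 2 4 — each CJK char occupies 2 columns
console.log("🎉".length, stringWidth("🎉"));         // 2 2 — a surrogate pair, but only 2 columns
console.log("hello…".length, stringWidth("hello…")); // 6 6 — the ellipsis is narrow
```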
diff --git a/src/operations/transform/decollate.ts b/src/operations/transform/decollate.ts
index 8ad77ea7..fca92336 100644
--- a/src/operations/transform/decollate.ts
+++ b/src/operations/transform/decollate.ts
@@ -3,6 +3,7 @@ import type { OptionDef } from "../../Operation.ts";
 import { deaggregatorRegistry } from "../../Deaggregator.ts";
 import type { Deaggregator } from "../../Deaggregator.ts";
 import { Record } from "../../Record.ts";
+import type { JsonValue } from "../../types/json.ts";

 /**
  * Reverse of collate: takes a single record and produces multiple records
@@ -12,6 +13,7 @@ import { Record } from "../../Record.ts";
  */
 export class DecollateOperation extends Operation {
   deaggregators: Deaggregator[] = [];
+  onlyDeaggregated = false;

   init(args: string[]): void {
     const deaggSpecs: string[] = [];
@@ -27,6 +29,13 @@ export class DecollateOperation extends Operation {
         },
         description: "Deaggregator specification (colon-separated)",
       },
+      {
+        long: "only",
+        short: "o",
+        type: "boolean",
+        handler: () => { this.onlyDeaggregated = true; },
+        description: "Only output deaggregated fields, excluding original record fields",
+      },
       {
         long: "list-deaggregators",
         type: "boolean",
@@ -55,11 +64,24 @@ export class DecollateOperation extends Operation {
       const results = deaggregator.deaggregate(record);

       for (const deaggregated of results) {
-        // Merge original record with deaggregated fields
-        const merged = new Record({
-          ...record.toJSON(),
-          ...deaggregated.toJSON(),
-        });
+        let merged: Record;
+        if (this.onlyDeaggregated) {
+          // Extract only the fields that differ from the original record
+          const origData = record.toJSON();
+          const deaggData = deaggregated.toJSON();
+          const onlyNew: { [key: string]: JsonValue } = {};
+          for (const [k, v] of Object.entries(deaggData)) {
+            if (!(k in origData) || origData[k] !== v) {
+              onlyNew[k] = v;
+            }
+          }
+          merged = new Record(onlyNew);
+        } else {
+          merged = new Record({
+            ...record.toJSON(),
+            ...deaggregated.toJSON(),
+          });
+        }
         this.deaggregateRecursive(depth + 1, merged);
       }
     } else {
@@ -83,6 +105,12 @@ export const documentation: CommandDoc = {
       description: "Deaggregator specification (colon-separated).",
       argument: "<deaggregators>",
     },
+    {
+      flags: ["--only", "-o"],
+      description:
+        "Only output deaggregated fields, excluding original record fields. " +
+        "Useful when you only want the expanded data, not the source record.",
+    },
     {
       flags: ["--list-deaggregators"],
       description: "List available deaggregators and exit.",
     },
@@ -93,6 +121,10 @@ export const documentation: CommandDoc = {
     {
       description: "Split the 'hosts' field into individual 'host' fields",
       command: "recs decollate --deaggregator 'split,hosts,/\\s*,\\s*/,host'",
     },
+    {
+      description: "Decollate and only keep deaggregated fields",
+      command: "recs decollate --only -d 'unarray,items,,item'",
+    },
   ],
   seeAlso: ["collate"],
 };
diff --git a/src/operations/transform/index.ts b/src/operations/transform/index.ts
index 42ebdcdb..c9a47ff5 100644
--- a/src/operations/transform/index.ts
+++ b/src/operations/transform/index.ts
@@ -14,5 +14,6 @@ export { SubstreamOperation } from "./substream.ts";
 export { JoinOperation } from "./join.ts";
 export { CollateOperation } from "./collate.ts";
 export { DecollateOperation } from "./decollate.ts";
+export { ParseDateOperation } from "./parsedate.ts";
 export { ChainOperation, registerOperationFactory, isRecsOperation, createOperation } from "./chain.ts";
 export { MultiplexOperation } from "./multiplex.ts";
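[Editor's note: the merge logic in decollate.ts above reduces to an object spread versus a filter; a minimal sketch with plain objects, using values that mirror the test fixtures further down:]

```ts
// One input record and one deaggregated result, as plain data.
const orig = { hosts: "a b", group: "g1" };
const deagg = { host: "a" };

// Default behavior: original fields plus deaggregated fields.
const merged = { ...orig, ...deagg };
// => { hosts: "a b", group: "g1", host: "a" }

// --only: keep just the fields the deaggregator introduced or changed.
const onlyNew = Object.fromEntries(
  Object.entries(deagg).filter(
    ([k, v]) => !(k in orig) || orig[k as keyof typeof orig] !== v
  )
);
// => { host: "a" }
console.log(merged, onlyNew);
```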
diff --git a/src/operations/transform/multiplex.ts b/src/operations/transform/multiplex.ts
index 746ae443..e45bf893 100644
--- a/src/operations/transform/multiplex.ts
+++ b/src/operations/transform/multiplex.ts
@@ -1,3 +1,5 @@
+import { writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
+import { dirname } from "node:path";
 import { Operation, CollectorReceiver } from "../../Operation.ts";
 import type { OptionDef } from "../../Operation.ts";
 import type { ClumperCallback } from "../../Clumper.ts";
@@ -16,34 +18,63 @@ class MultiplexClumperCallback implements ClumperCallback {
   operationName: string;
   operationArgs: string[];
   lineKey: string | null;
+  outputFileKey: string | null;
+  outputFileEval: string | null;
   pushRecordCb: (record: Record) => boolean;
   pushLineCb: (line: string) => void;
+  initializedFiles = new Set<string>();

   constructor(
     operationName: string,
     operationArgs: string[],
     lineKey: string | null,
+    outputFileKey: string | null,
+    outputFileEval: string | null,
     pushRecordCb: (record: Record) => boolean,
     pushLineCb: (line: string) => void
   ) {
     this.operationName = operationName;
     this.operationArgs = operationArgs;
     this.lineKey = lineKey;
+    this.outputFileKey = outputFileKey;
+    this.outputFileEval = outputFileEval;
     this.pushRecordCb = pushRecordCb;
     this.pushLineCb = pushLineCb;
   }

-  clumperCallbackBegin(_options: { [key: string]: unknown }): unknown {
+  resolveOutputFile(options: { [key: string]: unknown }): string | null {
+    if (this.outputFileKey) {
+      const val = options[this.outputFileKey];
+      if (val !== undefined && val !== null) {
+        return String(val);
+      }
+      return null;
+    }
+    if (this.outputFileEval) {
+      // Replace {{key}} placeholders with group values
+      let filename = this.outputFileEval;
+      for (const [key, val] of Object.entries(options)) {
+        filename = filename.replace(
+          new RegExp(`\\{\\{${key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\}\\}`, "g"),
+          String(val ?? "")
+        );
+      }
+      return filename;
+    }
+    return null;
+  }
+
+  clumperCallbackBegin(options: { [key: string]: unknown }): unknown {
     const collector = new CollectorReceiver();
     const op = createOperation(this.operationName, [...this.operationArgs], collector);
-    return { operation: op, collector };
+    const outputFile = this.resolveOutputFile(options);
+    return { operation: op, collector, outputFile };
   }

   clumperCallbackPushRecord(cookie: unknown, record: Record): void {
-    const state = cookie as { operation: Operation; collector: CollectorReceiver };
+    const state = cookie as { operation: Operation; collector: CollectorReceiver; outputFile: string | null };
     if (this.lineKey) {
-      // Use the value of lineKey as input line to the operation
       const data = record.dataRef() as JsonObject;
       const lineValue = findKey(data, this.lineKey, true);
       if (lineValue !== undefined && lineValue !== null) {
@@ -55,12 +86,30 @@ class MultiplexClumperCallback implements ClumperCallback {
   }

   clumperCallbackEnd(cookie: unknown): void {
-    const state = cookie as { operation: Operation; collector: CollectorReceiver };
+    const state = cookie as { operation: Operation; collector: CollectorReceiver; outputFile: string | null };
     state.operation.finish();

-    // Push all collected records
-    for (const record of state.collector.records) {
-      this.pushRecordCb(record);
+    if (state.outputFile) {
+      // Write output to file
+      const dir = dirname(state.outputFile);
+      if (dir && dir !== "." && !existsSync(dir)) {
+        mkdirSync(dir, { recursive: true });
+      }
+
+      // Initialize file (truncate) on first write, then append
+      if (!this.initializedFiles.has(state.outputFile)) {
+        writeFileSync(state.outputFile, "");
+        this.initializedFiles.add(state.outputFile);
+      }
+
+      for (const record of state.collector.records) {
+        appendFileSync(state.outputFile, record.toString() + "\n");
+      }
+    } else {
+      // Push records to downstream
+      for (const record of state.collector.records) {
+        this.pushRecordCb(record);
+      }
     }
   }
 }
@@ -85,6 +134,8 @@ export class MultiplexOperation extends Operation {
   init(args: string[]): void {
     const clumperOptions = new ClumperOptions();
     let lineKey: string | null = null;
+    let outputFileKey: string | null = null;
+    let outputFileEval: string | null = null;

     const defs: OptionDef[] = [
       {
@@ -118,6 +169,20 @@ export class MultiplexOperation extends Operation {
         handler: (v) => { lineKey = v as string; },
         description: "Use this key's value as line input for the nested operation",
       },
+      {
+        long: "output-file-key",
+        short: "o",
+        type: "string",
+        handler: (v) => { outputFileKey = v as string; },
+        description: "Write each group's output to a file named by the value of this key field",
+      },
+      {
+        long: "output-file-eval",
+        short: "O",
+        type: "string",
+        handler: (v) => { outputFileEval = v as string; },
+        description: "Write each group's output to a filename derived from the given expression (supports {{key}} interpolation)",
+      },
       {
         long: "adjacent",
         short: "1",
@@ -186,6 +251,8 @@ export class MultiplexOperation extends Operation {
       operationName,
       operationArgs,
       lineKey,
+      outputFileKey,
+      outputFileEval,
       (record: Record) => this.pushRecord(record),
       (line: string) => this.pushLine(line)
     );
@@ -231,6 +298,19 @@ export const documentation: CommandDoc = {
         "(rather than the entire record). Use with recs from* operations generally.",
       argument: "<key>",
     },
+    {
+      flags: ["--output-file-key", "-o"],
+      description:
+        "Write each group's output to a separate file, using the value of the given key as the filename.",
+      argument: "<key>",
+    },
+    {
+      flags: ["--output-file-eval", "-O"],
+      description:
+        "Write each group's output to a separate file, with filename determined by the given expression. " +
+        "Supports {{key}} interpolation with group key values.",
+      argument: "<expr>",
+    },
     {
       flags: ["--adjacent", "-1"],
       description: "Only group together adjacent records. Avoids spooling records into memory.",
@@ -275,6 +355,11 @@ export const documentation: CommandDoc = {
       command:
         "recs fromre '^(.*PID=([0-9]*).*)$' -f line,pid | recs multiplex -L line -k pid -- recs frommultire ...",
     },
+    {
+      description: "Write each group's CSV output to separate files by department",
+      command:
+        "recs multiplex -k department -O 'output-{{department}}.csv' -- recs tocsv",
+    },
   ],
   seeAlso: ["collate", "chain"],
 };
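[Editor's note: the `{{key}}` interpolation in resolveOutputFile() is self-contained enough to sketch standalone; group key values arrive as the clumper's options object:]

```ts
function interpolate(template: string, options: { [key: string]: unknown }): string {
  let filename = template;
  for (const [key, val] of Object.entries(options)) {
    // Escape regex metacharacters in the key, then replace every {{key}}.
    filename = filename.replace(
      new RegExp(`\\{\\{${key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\}\\}`, "g"),
      String(val ?? "")
    );
  }
  return filename;
}

console.log(interpolate("output-{{department}}.csv", { department: "sales" }));
// => "output-sales.csv"
```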
diff --git a/src/operations/transform/parsedate.ts b/src/operations/transform/parsedate.ts
new file mode 100644
index 00000000..b7adf337
--- /dev/null
+++ b/src/operations/transform/parsedate.ts
@@ -0,0 +1,333 @@
+import { Operation } from "../../Operation.ts";
+import type { OptionDef } from "../../Operation.ts";
+import { findKey, setKey } from "../../KeySpec.ts";
+import { Record } from "../../Record.ts";
+import type { JsonObject } from "../../types/json.ts";
+
+/**
+ * Parse date/time strings and reformat them. Reads a date string from a key,
+ * parses it, and writes the result (epoch seconds or formatted string) to an
+ * output key.
+ *
+ * Inspired by the Perl PR #74 parsedate operation.
+ */
+export class ParseDateOperation extends Operation {
+  inputKey = "";
+  outputKey = "";
+  inputFormat: string | null = null;
+  outputFormat: string | null = null;
+  epoch = false;
+  outputEpoch = false;
+  timezone: string | null = null;
+
+  init(args: string[]): void {
+    const defs: OptionDef[] = [
+      {
+        long: "key",
+        short: "k",
+        type: "string",
+        handler: (v) => { this.inputKey = v as string; },
+        description: "Key field containing the date/time string to parse",
+      },
+      {
+        long: "output",
+        short: "o",
+        type: "string",
+        handler: (v) => { this.outputKey = v as string; },
+        description: "Output key for the parsed date (defaults to parsed_<key>)",
+      },
+      {
+        long: "format",
+        short: "f",
+        type: "string",
+        handler: (v) => { this.inputFormat = v as string; },
+        description: "Input format for parsing (strftime-style: %Y-%m-%d %H:%M:%S)",
+      },
+      {
+        long: "output-format",
+        short: "F",
+        type: "string",
+        handler: (v) => { this.outputFormat = v as string; },
+        description: "Output format (strftime-style: %Y-%m-%d %H:%M:%S). Default is ISO 8601.",
+      },
+      {
+        long: "epoch",
+        short: "e",
+        type: "boolean",
+        handler: () => { this.epoch = true; },
+        description: "Input date is in epoch seconds",
+      },
+      {
+        long: "output-epoch",
+        short: "E",
+        type: "boolean",
+        handler: () => { this.outputEpoch = true; },
+        description: "Output as epoch seconds instead of a formatted string",
+      },
+      {
+        long: "timezone",
+        short: "z",
+        type: "string",
+        handler: (v) => { this.timezone = v as string; },
+        description: "Timezone for output (IANA name like America/New_York)",
+      },
+    ];
+
+    this.parseOptions(args, defs);
+
+    if (!this.inputKey) {
+      throw new Error("Must specify --key");
+    }
+    if (!this.outputKey) {
+      this.outputKey = `parsed_${this.inputKey.replace(/\//g, "_")}`;
+    }
+  }
+
+  acceptRecord(record: Record): boolean {
+    const data = record.dataRef() as JsonObject;
+    const value = findKey(data, this.inputKey, true);
+
+    if (value === undefined || value === null || value === "") {
+      this.pushRecord(record);
+      return true;
+    }
+
+    let date: Date;
+
+    if (this.epoch) {
+      date = new Date(Number(value) * 1000);
+    } else if (this.inputFormat) {
+      date = parseWithFormat(String(value), this.inputFormat);
+    } else {
+      // Try JavaScript's native date parsing
+      date = new Date(String(value));
+    }
+
+    if (isNaN(date.getTime())) {
+      throw new Error(`Cannot parse date from key '${this.inputKey}', value: ${String(value)}`);
+    }
+
+    let output: string | number;
+    if (this.outputEpoch) {
+      output = Math.floor(date.getTime() / 1000);
+    } else if (this.outputFormat) {
+      output = formatDate(date, this.outputFormat, this.timezone ?? undefined);
+    } else {
+      output = date.toISOString();
+    }
+
+    setKey(data, this.outputKey, output);
+    this.pushRecord(record);
+    return true;
+  }
+}
+
+/**
+ * Parse a date string given a strftime-style format.
+ * Supports common directives: %Y, %m, %d, %H, %M, %S, %b, %B, %p, %Z, %z
+ */
+function parseWithFormat(input: string, format: string): Date {
+  const monthNames: { [key: string]: number } = {
+    jan: 0, january: 0, feb: 1, february: 1, mar: 2, march: 2,
+    apr: 3, april: 3, may: 4, jun: 5, june: 5,
+    jul: 6, july: 6, aug: 7, august: 7, sep: 8, september: 8,
+    oct: 9, october: 9, nov: 10, november: 10, dec: 11, december: 11,
+  };
+
+  let year = 1970, month = 0, day = 1, hour = 0, minute = 0, second = 0;
+  let isPM = false;
+
+  // Build a regex from the format string
+  let regexStr = "^";
+  const captures: string[] = [];
+  let i = 0;
+  while (i < format.length) {
+    if (format[i] === "%") {
+      i++;
+      const directive = format[i]!;
+      switch (directive) {
+        case "Y": regexStr += "(\\d{4})"; captures.push("Y"); break;
+        case "y": regexStr += "(\\d{2})"; captures.push("y"); break;
+        case "m": regexStr += "(\\d{1,2})"; captures.push("m"); break;
+        case "d": regexStr += "(\\d{1,2})"; captures.push("d"); break;
+        case "H": regexStr += "(\\d{1,2})"; captures.push("H"); break;
+        case "I": regexStr += "(\\d{1,2})"; captures.push("I"); break;
+        case "M": regexStr += "(\\d{1,2})"; captures.push("M"); break;
+        case "S": regexStr += "(\\d{1,2})"; captures.push("S"); break;
+        case "b": case "B":
+          regexStr += "([A-Za-z]+)"; captures.push("b"); break;
+        case "p": regexStr += "(AM|PM|am|pm)"; captures.push("p"); break;
+        case "Z": regexStr += "(\\S+)"; captures.push("Z"); break;
+        case "z": regexStr += "([+-]\\d{2}:?\\d{2})"; captures.push("z"); break;
+        case "%": regexStr += "%"; break;
+        default: regexStr += directive; break;
+      }
+    } else {
+      regexStr += format[i]!.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+    }
+    i++;
+  }
+  regexStr += "$";
+
+  const match = input.match(new RegExp(regexStr));
+  if (!match) {
+    return new Date(NaN);
+  }
+
+  for (let j = 0; j < captures.length; j++) {
+    const val = match[j + 1]!;
+    switch (captures[j]) {
+      case "Y": year = parseInt(val, 10); break;
+      case "y": year = 2000 + parseInt(val, 10); break;
+      case "m": month = parseInt(val, 10) - 1; break;
+      case "d": day = parseInt(val, 10); break;
+      case "H": hour = parseInt(val, 10); break;
+      case "I": hour = parseInt(val, 10); break;
+      case "M": minute = parseInt(val, 10); break;
+      case "S": second = parseInt(val, 10); break;
+      case "b": {
+        const idx = monthNames[val.toLowerCase()];
+        if (idx !== undefined) month = idx;
+        break;
+      }
+      case "p": isPM = val.toLowerCase() === "pm"; break;
+    }
+  }
+
+  if (isPM && hour < 12) hour += 12;
+  if (!isPM && hour === 12 && captures.includes("p")) hour = 0;
+
+  return new Date(year, month, day, hour, minute, second);
+}
+
+/**
+ * Format a Date object using strftime-style directives.
+ */
+function formatDate(date: Date, format: string, timezone?: string): string {
+  // Use Intl.DateTimeFormat for timezone conversion if needed
+  let d = date;
+  if (timezone) {
+    // Create a new date adjusted for the target timezone
+    const formatter = new Intl.DateTimeFormat("en-US", {
+      timeZone: timezone,
+      year: "numeric", month: "2-digit", day: "2-digit",
+      hour: "2-digit", minute: "2-digit", second: "2-digit",
+      hour12: false,
+    });
+    const parts = formatter.formatToParts(date);
+    const get = (type: string) => parts.find((p) => p.type === type)?.value ?? "0";
+    d = new Date(
+      parseInt(get("year"), 10),
+      parseInt(get("month"), 10) - 1,
+      parseInt(get("day"), 10),
+      parseInt(get("hour"), 10),
+      parseInt(get("minute"), 10),
+      parseInt(get("second"), 10),
+    );
+  }
+
+  const pad = (n: number, len = 2) => String(n).padStart(len, "0");
+  const monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"];
+  const fullMonths = ["January", "February", "March", "April", "May", "June",
+    "July", "August", "September", "October", "November", "December"];
+  const dayNames = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"];
+
+  let result = "";
+  let i = 0;
+  while (i < format.length) {
+    if (format[i] === "%") {
+      i++;
+      const directive = format[i]!;
+      switch (directive) {
+        case "Y": result += pad(d.getFullYear(), 4); break;
+        case "y": result += pad(d.getFullYear() % 100); break;
+        case "m": result += pad(d.getMonth() + 1); break;
+        case "d": result += pad(d.getDate()); break;
+        case "H": result += pad(d.getHours()); break;
+        case "I": result += pad(d.getHours() % 12 || 12); break;
+        case "M": result += pad(d.getMinutes()); break;
+        case "S": result += pad(d.getSeconds()); break;
+        case "b": result += monthNames[d.getMonth()]; break;
+        case "B": result += fullMonths[d.getMonth()]; break;
+        case "a": result += dayNames[d.getDay()]; break;
+        case "p": result += d.getHours() < 12 ? "AM" : "PM"; break;
+        case "s": result += Math.floor(d.getTime() / 1000); break;
+        case "%": result += "%"; break;
+        default: result += "%" + directive; break;
+      }
+    } else {
+      result += format[i];
+    }
+    i++;
+  }
+  return result;
+}
+
+import type { CommandDoc } from "../../types/CommandDoc.ts";
+
+export const documentation: CommandDoc = {
+  name: "parsedate",
+  category: "transform",
+  synopsis: "recs parsedate [options] [files...]",
+  description:
+    "Parse date/time strings from a field and output them in a normalized format. " +
+    "Supports epoch seconds, ISO 8601, and custom strftime-style format strings for " +
+    "both input and output.",
+  options: [
+    {
+      flags: ["--key", "-k"],
+      argument: "<key>",
+      description: "Key field containing the date/time string to parse.",
+      required: true,
+    },
+    {
+      flags: ["--output", "-o"],
+      argument: "<key>",
+      description: "Output key for the parsed date (defaults to 'parsed_<key>').",
+    },
+    {
+      flags: ["--format", "-f"],
+      argument: "<format>",
+      description:
+        "Input format for parsing (strftime-style). Common directives: " +
+        "%Y (4-digit year), %m (month), %d (day), %H (hour 24h), %M (minute), %S (second), " +
+        "%b (abbreviated month name), %p (AM/PM).",
+    },
+    {
+      flags: ["--output-format", "-F"],
+      argument: "<format>",
+      description:
+        "Output format (strftime-style). Defaults to ISO 8601 if not specified.",
+    },
+    {
+      flags: ["--epoch", "-e"],
+      description: "Input date is in epoch seconds.",
+    },
+    {
+      flags: ["--output-epoch", "-E"],
+      description: "Output as epoch seconds instead of a formatted string.",
+    },
+    {
+      flags: ["--timezone", "-z"],
+      argument: "<timezone>",
+      description:
+        "Timezone for output formatting (IANA name like 'America/New_York').",
+    },
+  ],
+  examples: [
+    {
+      description: "Parse dates and output as epoch seconds",
+      command: "recs parsedate -k date -E",
+    },
+    {
+      description: "Parse a custom date format and reformat it",
+      command: "recs parsedate -k timestamp -f '%d/%b/%Y:%H:%M:%S' -F '%Y-%m-%d %H:%M:%S'",
+    },
+    {
+      description: "Convert epoch seconds to ISO 8601",
+      command: "recs parsedate -k time -e -o iso_time",
+    },
+  ],
+  seeAlso: ["normalizetime"],
+};
diff --git a/tests/cli/help.test.ts b/tests/cli/help.test.ts
index 1b670103..82994f8e 100644
--- a/tests/cli/help.test.ts
+++ b/tests/cli/help.test.ts
@@ -8,9 +8,9 @@ import {
 import type { CommandDoc } from "../../src/types/CommandDoc.ts";

 describe("loadAllDocs", () => {
-  test("loads documentation for all 43 operations", async () => {
+  test("loads documentation for all 45 operations", async () => {
     const docs = await loadAllDocs();
-    expect(docs.length).toBe(43);
+    expect(docs.length).toBe(45);
   });

   test("every doc has required fields", async () => {
diff --git a/tests/cli/manpages.test.ts b/tests/cli/manpages.test.ts
index dc6462e5..080b3879 100644
--- a/tests/cli/manpages.test.ts
+++ b/tests/cli/manpages.test.ts
@@ -25,10 +25,10 @@ describe("man page generation", () => {
   const expectedCommands = [
     "fromapache", "fromatomfeed", "fromcsv", "fromdb", "fromjsonarray",
     "fromkv", "frommongo", "frommultire", "fromps", "fromre",
-    "fromsplit", "fromtcpdump", "fromxferlog", "fromxml",
+    "fromsplit", "fromtcpdump", "fromxferlog", "fromxls", "fromxml",
     "annotate", "assert", "chain", "collate", "decollate", "delta",
     "eval", "expandjson", "flatten", "generate", "grep", "join", "multiplex",
-    "normalizetime", "sort", "stream2table", "substream", "topn", "xform",
+    "normalizetime", "parsedate", "sort", "stream2table", "substream", "topn", "xform",
     "tochart", "tocsv", "todb", "togdgraph", "tognuplot", "tohtml",
     "tojsonarray", "toprettyprint", "toptable", "totable",
   ];
@@ -39,12 +39,12 @@ describe("man page generation", () => {
     }
   });

-  test("generates exactly 44 man pages (43 commands + recs.1)", () => {
+  test("generates exactly 46 man pages (45 commands + recs.1)", () => {
     const { readdirSync } = require("node:fs");
     const files = (readdirSync(MAN_DIR) as string[]).filter(
       (f: string) => f.endsWith(".1")
     );
-    expect(files.length).toBe(44);
+    expect(files.length).toBe(46);
   });

   test("recs.1 contains proper troff header", () => {
diff --git a/tests/operations/output/totable-unicode.test.ts b/tests/operations/output/totable-unicode.test.ts
new file mode 100644
index 00000000..948d91b4
--- /dev/null
+++ b/tests/operations/output/totable-unicode.test.ts
@@ -0,0 +1,50 @@
+import { describe, test, expect } from "bun:test";
+import { ToTable } from "../../../src/operations/output/totable.ts";
+import { testOutput } from "./testHelper.ts";
+
+describe("ToTable - Unicode and special characters (issue #71)", () => {
+  test("CJK wide characters align correctly", () => {
+    const stream = `{"name":"你好","val":"1"}
+{"name":"hi","val":"2"}`;
+    const actual = testOutput(ToTable, [], stream);
+    // "你好" is 4 display columns wide (2 chars * 2 width each)
+    // "name" is 4 display columns wide
+    // So both should be padded to 4
+    const lines = actual.split("\n").filter(l => l !== "");
+    // All lines should have the same visual width pattern
+    expect(lines.length).toBe(4); // header, dash, 2 data rows
+    // The separator line should use the correct width
+    expect(lines[1]).toContain("----");
+  });
+
+  test("emoji characters align correctly", () => {
+    const stream = `{"icon":"🎉","val":"a"}
+{"icon":"x","val":"b"}`;
+    const actual = testOutput(ToTable, [], stream);
+    const lines = actual.split("\n").filter(l => l !== "");
+    expect(lines.length).toBe(4);
+    // 🎉 is 2 display columns; "icon" is 4; so column width should be 4
+  });
+
+  test("newlines in field values are escaped", () => {
+    const stream = `{"text":"line1\\nline2","val":"ok"}`;
+    const actual = testOutput(ToTable, [], stream);
+    // Newlines should be escaped as \\n so they don't break alignment
+    expect(actual).toContain("\\n");
+    expect(actual).not.toContain("line1\nline2");
+  });
+
+  test("tab characters in field values are escaped", () => {
+    const stream = `{"text":"col1\\tcol2","val":"ok"}`;
+    const actual = testOutput(ToTable, [], stream);
+    expect(actual).toContain("\\t");
+  });
+
+  test("ellipsis character aligns correctly", () => {
+    const stream = `{"text":"hello…","val":"1"}
+{"text":"world!","val":"2"}`;
+    const actual = testOutput(ToTable, [], stream);
+    const lines = actual.split("\n").filter(l => l !== "");
+    expect(lines.length).toBe(4);
+  });
+});
test("avg on empty stream with key produces no records", () => { + const { op, collector } = makeOp(["--key", "group", "--aggregator", "avg,value"]); + op.finish(); + expect(collector.records.length).toBe(0); + }); + + test("multiple aggregators on empty stream does not crash", () => { + const { op, collector } = makeOp([ + "--aggregator", "avg,value", + "--aggregator", "count", + "--aggregator", "sum,value", + ]); + op.finish(); + expect(collector.records.length).toBe(0); + }); +}); diff --git a/tests/operations/transform/decollate-only.test.ts b/tests/operations/transform/decollate-only.test.ts new file mode 100644 index 00000000..3666486e --- /dev/null +++ b/tests/operations/transform/decollate-only.test.ts @@ -0,0 +1,59 @@ +import { describe, test, expect } from "bun:test"; +import { Record } from "../../../src/Record.ts"; +import { CollectorReceiver } from "../../../src/Operation.ts"; +import { DecollateOperation } from "../../../src/operations/transform/decollate.ts"; +import { deaggregatorRegistry } from "../../../src/Deaggregator.ts"; +import type { Deaggregator } from "../../../src/Deaggregator.ts"; + +function registerTestDeaggregators(): void { + if (deaggregatorRegistry.has("split")) return; + + deaggregatorRegistry.register("split", { + create: (field: string, delim: string, outputField: string) => ({ + deaggregate: (record: Record): Record[] => { + const val = record.get(field); + if (typeof val !== "string") return [new Record()]; + const parts = val.split(delim); + return parts.map((part) => new Record({ [outputField]: part })); + }, + }) as Deaggregator, + argCounts: [3], + shortUsage: "Split a field", + longUsage: "Splits a field by delimiter into separate records", + }); +} + +function makeOp(args: string[]): { op: DecollateOperation; collector: CollectorReceiver } { + registerTestDeaggregators(); + const collector = new CollectorReceiver(); + const op = new DecollateOperation(collector); + op.init(args); + return { op, collector }; +} + +describe("DecollateOperation --only flag (issue #81)", () => { + test("without --only: includes original fields", () => { + const { op, collector } = makeOp(["--deaggregator", "split,hosts, ,host"]); + + op.acceptRecord(new Record({ hosts: "a b", group: "g1" })); + op.finish(); + + expect(collector.records.length).toBe(2); + expect(collector.records[0]!.get("host")).toBe("a"); + expect(collector.records[0]!.get("group")).toBe("g1"); // Original field preserved + expect(collector.records[0]!.get("hosts")).toBe("a b"); // Original field preserved + }); + + test("with --only: excludes original fields", () => { + const { op, collector } = makeOp(["--only", "--deaggregator", "split,hosts, ,host"]); + + op.acceptRecord(new Record({ hosts: "a b", group: "g1" })); + op.finish(); + + expect(collector.records.length).toBe(2); + expect(collector.records[0]!.get("host")).toBe("a"); + expect(collector.records[0]!.get("group")).toBeUndefined(); // Original excluded + expect(collector.records[0]!.get("hosts")).toBeUndefined(); // Original excluded + expect(collector.records[1]!.get("host")).toBe("b"); + }); +}); diff --git a/tests/operations/transform/parsedate.test.ts b/tests/operations/transform/parsedate.test.ts new file mode 100644 index 00000000..2f4438f8 --- /dev/null +++ b/tests/operations/transform/parsedate.test.ts @@ -0,0 +1,105 @@ +import { describe, test, expect } from "bun:test"; +import { Record } from "../../../src/Record.ts"; +import { CollectorReceiver } from "../../../src/Operation.ts"; +import { ParseDateOperation } from 
"../../../src/operations/transform/parsedate.ts"; + +function makeOp(args: string[]): { op: ParseDateOperation; collector: CollectorReceiver } { + const collector = new CollectorReceiver(); + const op = new ParseDateOperation(collector); + op.init(args); + return { op, collector }; +} + +describe("ParseDateOperation", () => { + test("parses ISO date string to ISO output", () => { + const { op, collector } = makeOp(["-k", "date"]); + op.acceptRecord(new Record({ date: "2024-01-15T12:30:00Z", val: 1 })); + op.finish(); + + expect(collector.records.length).toBe(1); + expect(collector.records[0]!.get("parsed_date")).toBe("2024-01-15T12:30:00.000Z"); + expect(collector.records[0]!.get("val")).toBe(1); // Original field preserved + }); + + test("parses epoch seconds to ISO", () => { + const { op, collector } = makeOp(["-k", "ts", "-e"]); + op.acceptRecord(new Record({ ts: 1705312200 })); + op.finish(); + + expect(collector.records.length).toBe(1); + const parsed = collector.records[0]!.get("parsed_ts") as string; + expect(parsed).toContain("2024-01-15"); + }); + + test("outputs epoch seconds with --output-epoch", () => { + const { op, collector } = makeOp(["-k", "date", "-E"]); + op.acceptRecord(new Record({ date: "2024-01-15T12:30:00Z" })); + op.finish(); + + expect(collector.records.length).toBe(1); + // The exact epoch depends on timezone, so just verify it's a reasonable number + const epoch = collector.records[0]!.get("parsed_date") as number; + expect(typeof epoch).toBe("number"); + // Should be within a day of the expected value + expect(Math.abs(epoch - 1705318200)).toBeLessThan(86400); + }); + + test("custom output format", () => { + const { op, collector } = makeOp(["-k", "date", "-F", "%Y/%m/%d"]); + op.acceptRecord(new Record({ date: "2024-01-15T00:00:00Z" })); + op.finish(); + + // Since the date is parsed at UTC but formatted at local time, + // we just check the format pattern + const parsed = collector.records[0]!.get("parsed_date") as string; + expect(parsed).toMatch(/^\d{4}\/\d{2}\/\d{2}$/); + }); + + test("custom input format", () => { + const { op, collector } = makeOp(["-k", "date", "-f", "%d/%m/%Y", "-E"]); + op.acceptRecord(new Record({ date: "15/01/2024" })); + op.finish(); + + const epoch = collector.records[0]!.get("parsed_date") as number; + const parsed = new Date(epoch * 1000); + expect(parsed.getFullYear()).toBe(2024); + expect(parsed.getMonth()).toBe(0); // January + expect(parsed.getDate()).toBe(15); + }); + + test("custom output key", () => { + const { op, collector } = makeOp(["-k", "date", "-o", "my_date"]); + op.acceptRecord(new Record({ date: "2024-01-15T00:00:00Z" })); + op.finish(); + + expect(collector.records[0]!.get("my_date")).toBeDefined(); + expect(collector.records[0]!.get("parsed_date")).toBeUndefined(); + }); + + test("null/empty values pass through", () => { + const { op, collector } = makeOp(["-k", "date"]); + op.acceptRecord(new Record({ date: null, val: 1 })); + op.acceptRecord(new Record({ date: "", val: 2 })); + op.finish(); + + expect(collector.records.length).toBe(2); + // Records should pass through without parsed_date being set + expect(collector.records[0]!.get("val")).toBe(1); + expect(collector.records[1]!.get("val")).toBe(2); + }); + + test("throws on unparseable date", () => { + const { op } = makeOp(["-k", "date"]); + expect(() => { + op.acceptRecord(new Record({ date: "not-a-date" })); + }).toThrow("Cannot parse date"); + }); + + test("requires --key", () => { + expect(() => { + const collector = new CollectorReceiver(); + const op 
= new ParseDateOperation(collector); + op.init([]); + }).toThrow("Must specify --key"); + }); +}); From 4459b862f41f849c18890ab9a30be7f23428182d Mon Sep 17 00:00:00 2001 From: Ben Bernard Date: Sun, 22 Feb 2026 06:27:56 -0800 Subject: [PATCH 2/5] fix: multiplex output with line-based operations (tocsv, totable, etc.) CollectorReceiver was missing acceptLine(), so output from operations that emit lines (tocsv, totable, toptable, etc.) was silently dropped. This affected both stdout and file output modes in multiplex. - Add lines[] collection and acceptLine() to CollectorReceiver - Update multiplex clumperCallbackEnd to write collected lines - Both file output and stdout now correctly emit line-based output Co-Authored-By: Claude Opus 4.6 --- src/Operation.ts | 6 ++++++ src/operations/transform/multiplex.ts | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/Operation.ts b/src/Operation.ts index ff8eb297..38f90fb2 100644 --- a/src/Operation.ts +++ b/src/Operation.ts @@ -50,12 +50,18 @@ export class PrinterReceiver implements RecordReceiver { */ export class CollectorReceiver implements RecordReceiver { readonly records: Record[] = []; + readonly lines: string[] = []; acceptRecord(record: Record): boolean { this.records.push(record); return true; } + acceptLine(line: string): boolean { + this.lines.push(line); + return true; + } + finish(): void { // nothing to do } diff --git a/src/operations/transform/multiplex.ts b/src/operations/transform/multiplex.ts index e45bf893..8f7b7bf4 100644 --- a/src/operations/transform/multiplex.ts +++ b/src/operations/transform/multiplex.ts @@ -102,10 +102,20 @@ class MultiplexClumperCallback implements ClumperCallback { this.initializedFiles.add(state.outputFile); } + // Write collected lines (from output operations like tocsv) + for (const line of state.collector.lines) { + appendFileSync(state.outputFile, line + "\n"); + } + + // Write collected records (from transform operations) for (const record of state.collector.records) { appendFileSync(state.outputFile, record.toString() + "\n"); } } else { + // Push lines to downstream + for (const line of state.collector.lines) { + this.pushLineCb(line); + } // Push records to downstream for (const record of state.collector.records) { this.pushRecordCb(record); From a47fdd2704c9a2d4879b5a4fada4cd6e56405c35 Mon Sep 17 00:00:00 2001 From: Ben Bernard Date: Sun, 22 Feb 2026 06:33:09 -0800 Subject: [PATCH 3/5] =?UTF-8?q?fix:=20improve=20benchmark=20PR=20comments?= =?UTF-8?q?=20=E2=80=94=20short=20summary=20+=20collapsible=20details?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add concise pass/fail summary at top of PR comment - Put full benchmark tables inside
a <details> collapsible section
- Group results by suite for readability
- Raise visual indicator threshold from 5% to 10% to reduce CI noise
- Pass fail threshold to markdown generator for accurate regression display
- Remove redundant footer from CI workflow (info now in report itself)
- Track suite names through CIResult for grouped display

Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/ci.yml |   4 +-
 tests/perf/bench.ts      | 105 ++++++++++++++++++++++++++++-----------
 2 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8846ee86..197fc3e1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -77,11 +77,9 @@ jobs:
             console.log('No benchmark report found, skipping comment.');
             return;
           }
-          let body = fs.readFileSync(reportPath, 'utf-8').trim();
+          const body = fs.readFileSync(reportPath, 'utf-8').trim();
           if (!body) return;
 
-          body += '\n\n---\n*Compared against master baseline. Threshold: 25% regression.*';
-
           const { data: comments } = await github.rest.issues.listComments({
             owner: context.repo.owner,
             repo: context.repo.repo,
diff --git a/tests/perf/bench.ts b/tests/perf/bench.ts
index 86670c08..4c0d443e 100644
--- a/tests/perf/bench.ts
+++ b/tests/perf/bench.ts
@@ -30,6 +30,7 @@ export interface BenchmarkResult {
 
 export interface CIResult {
   name: string;
+  suite: string;
   median: number;
   mean: number;
   min: number;
@@ -151,11 +152,11 @@ function deltaPercent(current: number, baseline: number): number {
   return ((current - baseline) / baseline) * 100;
 }
 
-function fmtDeltaMarkdown(pct: number): string {
+function fmtDeltaMarkdown(pct: number, threshold: number = 10): string {
   const sign = pct > 0 ? "+" : "";
   const text = `${sign}${pct.toFixed(1)}%`;
-  if (pct > 5) return `**${text}** :red_circle:`;
-  if (pct < -5) return `${text} :green_circle:`;
+  if (pct > threshold) return `**${text}** :red_circle:`;
+  if (pct < -threshold) return `${text} :green_circle:`;
   return `${text}`;
 }
 
@@ -305,12 +306,14 @@ export class BenchmarkSuite {
 function buildCIResults(
   results: BenchmarkResult[],
   baseline: BaselineData | null,
+  suite: string = "",
 ): CIResult[] {
   return results.map((r) => {
     const base = baseline?.[r.name];
     const delta = base ? deltaPercent(r.median, base.median) : undefined;
     return {
       name: r.name,
+      suite,
       median: r.median,
       mean: r.mean,
       min: r.min,
@@ -324,49 +327,90 @@ function buildCIResults(
   });
 }
 
-export function generateMarkdown(ciResults: CIResult[]): string {
+export function generateMarkdown(ciResults: CIResult[], failThreshold: number = 25): string {
   const lines: string[] = [];
-  lines.push("## Performance Benchmark Results\n");
-  lines.push(
-    "| Benchmark | Median | Baseline | Delta | Throughput |",
-  );
-  lines.push(
-    "|-----------|--------|----------|-------|------------|",
-  );
 
+  // Classify results using the visual indicator threshold (10%)
+  const indicatorThreshold = 10;
   let faster = 0;
   let slower = 0;
   let unchanged = 0;
+  const regressions: CIResult[] = [];
 
   for (const r of ciResults) {
-    const medianStr = fmtMs(r.median);
-    const baseStr = r.baselineMedian != null ? fmtMs(r.baselineMedian) : "—";
-
-    let deltaStr = "—";
     if (r.deltaPercent != null) {
-      deltaStr = fmtDeltaMarkdown(r.deltaPercent);
-      if (r.deltaPercent > 5) slower++;
-      else if (r.deltaPercent < -5) faster++;
+      if (r.deltaPercent > indicatorThreshold) slower++;
+      else if (r.deltaPercent < -indicatorThreshold) faster++;
       else unchanged++;
+      if (r.deltaPercent > failThreshold) regressions.push(r);
     } else {
       unchanged++;
     }
+  }
 
-    const throughputParts: string[] = [];
-    if (r.recordsPerSec) throughputParts.push(`${fmtRate(r.recordsPerSec)} rec/s`);
-    if (r.mbPerSec) throughputParts.push(`${r.mbPerSec.toFixed(1)} MB/s`);
-    const throughputStr = throughputParts.join(", ") || "—";
+  // --- Top-level summary ---
+  lines.push("## Performance Benchmark Results\n");
 
-    lines.push(
-      `| ${r.name} | ${medianStr} | ${baseStr} | ${deltaStr} | ${throughputStr} |`,
-    );
+  if (regressions.length === 0) {
+    lines.push(`:white_check_mark: **All ${ciResults.length} benchmarks passed** (threshold: ${failThreshold}%)`);
+  } else {
+    lines.push(`:warning: **${regressions.length} regression${regressions.length > 1 ? "s" : ""} detected** out of ${ciResults.length} benchmarks (threshold: ${failThreshold}%)`);
+    lines.push("");
+    lines.push("| Benchmark | Median | Baseline | Delta |");
+    lines.push("|-----------|--------|----------|-------|");
+    for (const r of regressions) {
+      lines.push(
+        `| ${r.name} | ${fmtMs(r.median)} | ${r.baselineMedian != null ? fmtMs(r.baselineMedian) : "—"} | ${fmtDeltaMarkdown(r.deltaPercent!, indicatorThreshold)} |`,
+      );
+    }
+  }
 
   lines.push("");
-  const total = ciResults.length;
   lines.push(
-    `**Summary**: ${total} benchmarks: ${faster} faster, ${slower} slower, ${unchanged} unchanged`,
+    `${ciResults.length} benchmarks: ${faster} faster, ${slower} slower, ${unchanged} within noise (${indicatorThreshold}%)`,
   );
 
+  // --- Grouped details per suite ---
+  const suites = new Map<string, CIResult[]>();
+  for (const r of ciResults) {
+    const key = r.suite || "other";
+    if (!suites.has(key)) suites.set(key, []);
+    suites.get(key)!.push(r);
+  }
+
+  lines.push("");
+  lines.push("<details>");
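+  // GitHub renders a <details>/<summary> pair in PR comments as a
+  // collapsible block, so the full tables stay hidden by default.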
"); + lines.push("Full benchmark results"); + lines.push(""); + + for (const [suite, results] of suites) { + lines.push(`### ${suite}\n`); + lines.push("| Benchmark | Median | Baseline | Delta | Throughput |"); + lines.push("|-----------|--------|----------|-------|------------|"); + + for (const r of results) { + const medianStr = fmtMs(r.median); + const baseStr = r.baselineMedian != null ? fmtMs(r.baselineMedian) : "—"; + let deltaStr = "—"; + if (r.deltaPercent != null) { + deltaStr = fmtDeltaMarkdown(r.deltaPercent, indicatorThreshold); + } + + const throughputParts: string[] = []; + if (r.recordsPerSec) throughputParts.push(`${fmtRate(r.recordsPerSec)} rec/s`); + if (r.mbPerSec) throughputParts.push(`${r.mbPerSec.toFixed(1)} MB/s`); + const throughputStr = throughputParts.join(", ") || "—"; + + lines.push( + `| ${r.name} | ${medianStr} | ${baseStr} | ${deltaStr} | ${throughputStr} |`, + ); + } + + lines.push(""); + } + + lines.push("
"); + return lines.join("\n"); } @@ -404,10 +448,12 @@ export async function runAllSuites( console.log(`Date: ${new Date().toISOString()}`); const allResults: BenchmarkResult[] = []; + const suiteResults: { suite: BenchmarkSuite; results: BenchmarkResult[] }[] = []; for (const suite of suites) { const results = await suite.run(); allResults.push(...results); + suiteResults.push({ suite, results }); } console.log(`\n${"=".repeat(70)}`); @@ -442,7 +488,10 @@ export async function runAllSuites( baseline = loadBaseline(); } - const ciResults = buildCIResults(allResults, baseline); + const ciResults: CIResult[] = []; + for (const { suite, results } of suiteResults) { + ciResults.push(...buildCIResults(results, baseline, suite.name)); + } if (options.jsonFile) { writeFileSync(options.jsonFile, JSON.stringify(ciResults, null, 2) + "\n"); @@ -450,7 +499,7 @@ export async function runAllSuites( } if (options.markdownFile) { - const md = generateMarkdown(ciResults); + const md = generateMarkdown(ciResults, options.failThreshold ?? 25); writeFileSync(options.markdownFile, md + "\n"); console.log(`Markdown report saved to ${options.markdownFile}`); } From 357d73d05952a3c6c82a18f63546b2df8beb6182 Mon Sep 17 00:00:00 2001 From: Ben Bernard Date: Sun, 22 Feb 2026 06:33:59 -0800 Subject: [PATCH 4/5] test: add multiplex tests for line-based output and file writing Cover the bug where CollectorReceiver was missing acceptLine(), causing output from line-based operations (tocsv, totable) to be silently dropped when run through multiplex. Tests added: - multiplex with tocsv to stdout (lines collected, not records) - multiplex with tocsv headers emitted per group - multiplex with --output-file-key writing CSV to separate files - multiplex with --output-file-eval and {{key}} interpolation - multiplex with xform (record-based transform) through multiplex - multiplex with passthrough records written to --output-file-key Co-Authored-By: Claude Opus 4.6 --- tests/operations/transform/multiplex.test.ts | 180 ++++++++++++++++++- 1 file changed, 176 insertions(+), 4 deletions(-) diff --git a/tests/operations/transform/multiplex.test.ts b/tests/operations/transform/multiplex.test.ts index d4e22588..88283130 100644 --- a/tests/operations/transform/multiplex.test.ts +++ b/tests/operations/transform/multiplex.test.ts @@ -1,11 +1,16 @@ -import { describe, test, expect } from "bun:test"; +import { describe, test, expect, afterAll } from "bun:test"; +import { readFileSync, mkdirSync, rmSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; import { Record } from "../../../src/Record.ts"; import { CollectorReceiver, Operation } from "../../../src/Operation.ts"; import type { RecordReceiver } from "../../../src/Operation.ts"; import { MultiplexOperation } from "../../../src/operations/transform/multiplex.ts"; import { registerOperationFactory } from "../../../src/operations/transform/chain.ts"; +import { ToCsv } from "../../../src/operations/output/tocsv.ts"; +import { XformOperation } from "../../../src/operations/transform/xform.ts"; -// Register a test passthrough operation so multiplex can create it +// Register operations so multiplex can create them class PassthroughOp extends Operation { init(_args: string[]): void { // no-op @@ -16,6 +21,18 @@ class PassthroughOp extends Operation { } registerOperationFactory("passthrough", (next: RecordReceiver) => new PassthroughOp(next)); +registerOperationFactory("tocsv", (next: RecordReceiver) => new ToCsv(next)); 
+registerOperationFactory("xform", (next: RecordReceiver) => new XformOperation(next)); + +// Temp directory for file-output tests +const tmpBase = join(tmpdir(), `recs-multiplex-test-${process.pid}-${Date.now()}`); +mkdirSync(tmpBase, { recursive: true }); + +afterAll(() => { + if (existsSync(tmpBase)) { + rmSync(tmpBase, { recursive: true, force: true }); + } +}); function makeOp(args: string[]): { op: MultiplexOperation; collector: CollectorReceiver } { const collector = new CollectorReceiver(); @@ -39,7 +56,6 @@ describe("MultiplexOperation", () => { new Record({ x: 2 }), ]); - // Passthrough operation just passes records through expect(collector.records.length).toBe(2); expect(collector.records[0]!.get("x")).toBe(1); expect(collector.records[1]!.get("x")).toBe(2); @@ -53,7 +69,6 @@ describe("MultiplexOperation", () => { new Record({ group: "b", val: 3 }), ]); - // All records pass through, grouped by key expect(collector.records.length).toBe(3); }); @@ -62,4 +77,161 @@ describe("MultiplexOperation", () => { makeOp([]); }).toThrow("multiplex requires an operation"); }); + + test("multiplex with line-based output op (tocsv) to stdout", () => { + const { op, collector } = makeOp(["-k", "dept", "--", "tocsv", "--noheader", "-k", "name,score"]); + feedRecords(op, [ + new Record({ dept: "eng", name: "alice", score: 90 }), + new Record({ dept: "eng", name: "bob", score: 85 }), + new Record({ dept: "sales", name: "carol", score: 70 }), + ]); + + // tocsv pushes lines, not records — the collector must receive them + expect(collector.lines.length).toBe(3); + expect(collector.records.length).toBe(0); + + // Verify CSV content: each group gets its own tocsv instance + const allLines = collector.lines.join("\n"); + expect(allLines).toContain("alice,90"); + expect(allLines).toContain("bob,85"); + expect(allLines).toContain("carol,70"); + }); + + test("multiplex with tocsv including header per group", () => { + const { op, collector } = makeOp(["-k", "dept", "--", "tocsv", "-k", "name,score"]); + feedRecords(op, [ + new Record({ dept: "eng", name: "alice", score: 90 }), + new Record({ dept: "eng", name: "bob", score: 85 }), + new Record({ dept: "sales", name: "carol", score: 70 }), + ]); + + // Each group gets its own tocsv instance, so each emits a header + // eng group: header + 2 data rows = 3 lines + // sales group: header + 1 data row = 2 lines + expect(collector.lines.length).toBe(5); + + // Both groups should produce a "name,score" header line + const headerCount = collector.lines.filter(l => l === "name,score").length; + expect(headerCount).toBe(2); + }); + + test("multiplex with --output-file-key writes line-based output to files", () => { + const outDir = join(tmpBase, "output-file-key"); + mkdirSync(outDir, { recursive: true }); + + const fileA = join(outDir, "eng.csv"); + const fileB = join(outDir, "sales.csv"); + + // Group by outfile so it's available in group options for resolveOutputFile + const { op, collector } = makeOp([ + "-k", "outfile", + "--output-file-key", "outfile", + "--", "tocsv", "--noheader", "-k", "name,score", + ]); + feedRecords(op, [ + new Record({ dept: "eng", name: "alice", score: 90, outfile: fileA }), + new Record({ dept: "eng", name: "bob", score: 85, outfile: fileA }), + new Record({ dept: "sales", name: "carol", score: 70, outfile: fileB }), + ]); + + // Lines should NOT go to the collector when writing to files + expect(collector.lines.length).toBe(0); + expect(collector.records.length).toBe(0); + + // Files should exist with correct CSV content + const 
contentA = readFileSync(fileA, "utf-8"); + expect(contentA).toContain("alice,90"); + expect(contentA).toContain("bob,85"); + expect(contentA.trim().split("\n").length).toBe(2); + + const contentB = readFileSync(fileB, "utf-8"); + expect(contentB).toContain("carol,70"); + expect(contentB.trim().split("\n").length).toBe(1); + }); + + test("multiplex with --output-file-eval and {{key}} interpolation", () => { + const outDir = join(tmpBase, "output-file-eval"); + mkdirSync(outDir, { recursive: true }); + + const { op, collector } = makeOp([ + "-k", "dept", + "--output-file-eval", join(outDir, "report-{{dept}}.csv"), + "--", "tocsv", "--noheader", "-k", "name,score", + ]); + feedRecords(op, [ + new Record({ dept: "eng", name: "alice", score: 90 }), + new Record({ dept: "eng", name: "bob", score: 85 }), + new Record({ dept: "sales", name: "carol", score: 70 }), + ]); + + expect(collector.lines.length).toBe(0); + expect(collector.records.length).toBe(0); + + // Verify files created with interpolated names + const engFile = join(outDir, "report-eng.csv"); + const salesFile = join(outDir, "report-sales.csv"); + + expect(existsSync(engFile)).toBe(true); + expect(existsSync(salesFile)).toBe(true); + + const engContent = readFileSync(engFile, "utf-8"); + expect(engContent).toContain("alice,90"); + expect(engContent).toContain("bob,85"); + + const salesContent = readFileSync(salesFile, "utf-8"); + expect(salesContent).toContain("carol,70"); + }); + + test("multiplex with record-based transform op (xform)", () => { + const { op, collector } = makeOp([ + "-k", "dept", "--", + "xform", "{{tagged}} = {{dept}} + '-' + {{name}}", + ]); + feedRecords(op, [ + new Record({ dept: "eng", name: "alice" }), + new Record({ dept: "eng", name: "bob" }), + new Record({ dept: "sales", name: "carol" }), + ]); + + // xform pushes records, not lines + expect(collector.records.length).toBe(3); + expect(collector.lines.length).toBe(0); + + const tags = collector.records.map(r => r.get("tagged") as string); + expect(tags).toContain("eng-alice"); + expect(tags).toContain("eng-bob"); + expect(tags).toContain("sales-carol"); + }); + + test("multiplex with record-based passthrough and --output-file-key", () => { + const outDir = join(tmpBase, "record-output-file"); + mkdirSync(outDir, { recursive: true }); + + const fileA = join(outDir, "groupA.jsonl"); + const fileB = join(outDir, "groupB.jsonl"); + + // Group by outfile so it's available in group options + const { op, collector } = makeOp([ + "-k", "outfile", + "--output-file-key", "outfile", + "--", "passthrough", + ]); + feedRecords(op, [ + new Record({ group: "a", val: 1, outfile: fileA }), + new Record({ group: "a", val: 2, outfile: fileA }), + new Record({ group: "b", val: 3, outfile: fileB }), + ]); + + expect(collector.records.length).toBe(0); + expect(collector.lines.length).toBe(0); + + const contentA = readFileSync(fileA, "utf-8").trim().split("\n"); + expect(contentA.length).toBe(2); + expect(JSON.parse(contentA[0]!)).toMatchObject({ group: "a", val: 1 }); + expect(JSON.parse(contentA[1]!)).toMatchObject({ group: "a", val: 2 }); + + const contentB = readFileSync(fileB, "utf-8").trim().split("\n"); + expect(contentB.length).toBe(1); + expect(JSON.parse(contentB[0]!)).toMatchObject({ group: "b", val: 3 }); + }); }); From 57322f90be4d39ca9129c99e7911d87ebac37a98 Mon Sep 17 00:00:00 2001 From: Ben Bernard Date: Sun, 22 Feb 2026 06:44:09 -0800 Subject: [PATCH 5/5] fix: make benchmark CI step advisory-only (never fail build) GitHub Actions shared runners have inconsistent 
performance, which makes benchmark regression results unreliable.

Changes:
- Remove process.exit(1) from bench.ts on regression detection; log a warning instead
- Add continue-on-error: true to the CI benchmark step as a safety net
- Add advisory note to the PR comment explaining runner variability
- Update --fail-threshold help text to reflect advisory-only behavior

Co-Authored-By: Claude Opus 4.6
---
 .github/workflows/ci.yml |  1 +
 tests/perf/bench.ts      | 10 +++++++---
 tests/perf/run.ts        |  4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 197fc3e1..f69aa38d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,6 +59,7 @@ jobs:
       - name: Run benchmarks
         id: bench
+        continue-on-error: true
         run: |
           BENCH_ARGS="--ci --markdown-file benchmark-report.md --json-file benchmark-results.json"
           if [ -f perf-baseline.json ]; then
diff --git a/tests/perf/bench.ts b/tests/perf/bench.ts
index 4c0d443e..878b98be 100644
--- a/tests/perf/bench.ts
+++ b/tests/perf/bench.ts
@@ -369,6 +369,10 @@ export function generateMarkdown(ciResults: CIResult[], failThreshold: number =
   lines.push(
     `${ciResults.length} benchmarks: ${faster} faster, ${slower} slower, ${unchanged} within noise (${indicatorThreshold}%)`,
   );
+  lines.push("");
+  lines.push(
+    "> ℹ️ **Note:** Benchmarks are advisory-only. GitHub Actions shared runners have variable performance, so results may fluctuate ±25% between runs. For reliable benchmarking, run locally with `bun run bench`.",
+  );
 
   // --- Grouped details per suite ---
   const suites = new Map<string, CIResult[]>();
@@ -507,11 +511,11 @@ export async function runAllSuites(
   if (options.failThreshold != null) {
     const failures = checkThreshold(ciResults, options.failThreshold);
     if (failures.length > 0) {
-      console.error(`\nPerformance regression detected (threshold: ${options.failThreshold}%):`);
+      console.warn(`\nPerformance regression detected (threshold: ${options.failThreshold}%):`);
       for (const f of failures) {
-        console.error(`  - ${f}`);
+        console.warn(`  - ${f}`);
       }
-      process.exit(1);
+      console.warn(`\nBenchmarks are advisory-only — not failing the build.`);
     } else {
       console.log(`\nAll benchmarks within ${options.failThreshold}% threshold.`);
     }
diff --git a/tests/perf/run.ts b/tests/perf/run.ts
index 610fd027..6b0b9dea 100644
--- a/tests/perf/run.ts
+++ b/tests/perf/run.ts
@@ -63,8 +63,8 @@ Options:
   --save-baseline        Save results as baseline for future comparison
   --ci                   Enable CI mode (machine-readable output, threshold
                          checking, and markdown/JSON report generation)
-  --fail-threshold <pct> Exit with code 1 if any benchmark regresses more
-                         than <pct>% vs baseline (requires --ci, default: 25)
+  --fail-threshold <pct> Warn if any benchmark regresses more than <pct>%
+                         vs baseline (advisory-only, requires --ci, default: 25)
   --markdown-file <file> Write a markdown report table to <file> (requires --ci)
   --json-file <file>     Write JSON results to <file> (requires --ci)
   --baseline-file <file> Read/write baseline from <file> instead of the